From 9cedc21fe5d34047668c5b688cd685f466df0bb7 Mon Sep 17 00:00:00 2001 From: Grzegorz Daniluk <grzegorz.daniluk@cern.ch> Date: Thu, 21 Jan 2016 16:14:05 +0100 Subject: [PATCH] doc/wrs_failures: merging procedures with snmp objects description --- doc/wrs_failures/snmp_exports.tex | 85 ++++-- doc/wrs_failures/snmp_objects.tex | 463 +++++++++++++++++++++++------- doc/wrs_failures/wrs_failures.tex | 19 +- 3 files changed, 438 insertions(+), 129 deletions(-) diff --git a/doc/wrs_failures/snmp_exports.tex b/doc/wrs_failures/snmp_exports.tex index a4164bcce..722cc46d0 100644 --- a/doc/wrs_failures/snmp_exports.tex +++ b/doc/wrs_failures/snmp_exports.tex @@ -1,23 +1,67 @@ -\section{SNMP exports} +\section{SNMP diagnostics and solving problems} \label{sec:snmp_exports} This section describes SNMP objects exported by the WR Switch. Objects within -the \texttt{WR\--SWITCH\--MIB} are divided into two categories: +the \texttt{WR\--SWITCH\--MIB} are divided into two groups: \begin{itemize} - \item operator/basic objects (section \ref{sec:snmp_exports:basic}) - - providing basic status of the switch. It should be used by a control system - operators and people without a deep knowledge of the White Rabbit internals. - These values report a general status of the device and high level errors. + \item General status objects for operators (section + \ref{sec:snmp_exports:basic}) - provide a summary about the status of a + switch and several main subsystems (like timing, networking, OS). These + should be used by control system operators and users without a + comprehensive knowledge of the White Rabbit internals. These exports provide + a general status of the device and high level errors which is enough in most + cases to perform a quick repair. - \item expert/extended status objects (section \ref{sec:snmp_exports:expert}) - + \item Expert objects (section \ref{sec:snmp_exports:expert}) - can be used by White Rabbit experts for the in-depth diagnosis of the switch - failures. 
These values are verbose and should not be used by the operators. + failures. These values are verbose and normally should not be used by the + operators. \end{itemize} -\subsection{Operator/basic objects} +Description of the general status objects in section +\ref{sec:snmp_exports:basic} includes also a list of actions to follow if a +particular object reports an error. These repair procedures don't require any +in-depth knowledge about White Rabbit. Independently of an error reported, there +are some common remarks that apply to all situations: +\begin{itemize} + \item Linux inside the WR Switch enumerates WR interfaces starting from 0. + This means we have to use internally port indexes 0..17. However, the + port numbers printed on the front panel are 1..18. Syslog messages + generated from the switch use the Linux port numbering. The consequence is + that every time Syslog says there is a problem on port X, this refers to + port index X+1 on the front panel of the switch. + \item If a procedure given for a specific SNMP object does not solve the + problem, please contact WR experts to perform a more in-depth analysis of your + network. For this, you should provide a complete dump of the WRS status + generated in the first step of each procedure. + \item First action in most of the procedures below named \emph{Dump state} + requires simply calling a tool provided by WR developers that reads all the + detailed information from the switch and writes it to a single file that can + be later analyzed by the experts.\\ + {\bf TODO: point to the tool once it's done} + \item If solving procedure requires restarting or replacing a broken WR + Switch, please make sure that after the repair, all other WR devices + connected to the affected switch are synchronized and do not report any + problems. + \item If a procedure requires replacing switch with a new unit, the broken one + should be handed over to WR experts or the switch manufacturer to investigate + the problem.
+\end{itemize} + +\subsection{General status objects for operators} \label{sec:snmp_exports:basic} -This section describes the general status MIB objects that are calculated based -on the other SNMP (detailed) exports. Most of the status objects described in -this section can have one of the following values: +This section describes the general status MIB objects that represent the overall +status of a device and its subsystems. They are organized in a tree structure +(fig.~\ref{fig:snmp_oper}) where each object reports a problem based on the +status of its child objects. SNMP objects in the third layer of this tree are +calculated based on the SNMP expert objects. Most of the status objects +described in this section can have one of the following values: +\begin{figure}[ht] + \begin{center} + \includegraphics[width=.8\textwidth]{img/snmp_obj.pdf} + \caption{The structure of general status objects for operators} + \label{fig:snmp_oper} + \end{center} +\end{figure} \begin{itemize}%[leftmargin=0pt] \item \texttt{NA} -- status value was not calculated at all (returned value is 0). Something bad has happened. @@ -36,33 +80,34 @@ this section can have one of the following values: object. If you see this please report to WR developers.
\end{itemize} -\noindent {\bf General Status objects}: +\paragraph*{SNMP objects:} % SNMP status objects \printnoidxglossary[type=snmp_status,title=,style=objtree,sort=def] \newpage -\subsection{Expert/extended status} +\subsection{Expert objects} \label{sec:snmp_exports:expert} -\noindent {\bf Expert Status}: +\paragraph*{SNMP objects:} % SNMP expert objects \printnoidxglossary[type=snmp_expert,style=objtree,sort=def] -\subsection{Other's MIB objects} -\label{sec:snmp_exports:others} +%\subsection{Other's MIB objects} +%\label{sec:snmp_exports:others} +\vspace{12pt} \noindent {\bf Objects from other MIBs}: % other objects \printnoidxglossary[type=snmp_other,style=objtree,sort=def] -\subsection{Sorted list of MIB objects} -\label{sec:snmp_exports:sorted} +%\subsection{Sorted list of MIB objects} +%\label{sec:snmp_exports:sorted} % print alphabetical list -\printnoidxglossary[type=snmp_all,style=tree,sort=letter] +%\printnoidxglossary[type=snmp_all,style=tree,sort=letter] %%%%%%%%%%%%%%%%%%5 %% Other notes diff --git a/doc/wrs_failures/snmp_objects.tex b/doc/wrs_failures/snmp_objects.tex index a306abeab..d0a7522cf 100644 --- a/doc/wrs_failures/snmp_objects.tex +++ b/doc/wrs_failures/snmp_objects.tex @@ -3,128 +3,379 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Add status entries in the order as the appear in the MIB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\snmpentrys{WR-SWITCH-MIB}{}{wrsGeneralStatusGroup} {\\ - Group containing collective statuses - of various subsystems and the main system status, describing the status of - entire switch.} +\snmpentrys{WR-SWITCH-MIB}{}{wrsGeneralStatusGroup}{ + Group containing collective status of the switch and its various + subsystems.} - \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsMainSystemStatus} {\\ + \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsMainSystemStatus}{ WRS general status of a switch can be \texttt{OK}, \texttt{Warning} or - 
\texttt{Error}. When there is an error or warning please check the values of + \texttt{Error}. In case of an error or warning, please check the values of \texttt{wrsOSStatus}, \texttt{wrsTimingStatus} and \texttt{wrsNetworkingStatus} to find out which subsystem causes the problem.} - \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsOSStatus} {\\ - Collective status of the \texttt{wrsOSStatusGroup}. For details please check - the group's content.} - \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsTimingStatus} {\\ - Collective status of the \texttt{wrsTimingStatusGroup}. For details please - check the group's content.} - \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsNetworkingStatus} {\\ - Collective status of the \texttt{wrsNetworkingStatusGroup}. For details - please check the group's content.} - -\snmpentrys{WR-SWITCH-MIB}{}{wrsDetailedStatusesGroup} {\\ + \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsOSStatus}{ + Collective status of the operating system running on WR switch. In case of + an error or warning, please check status objects in the + \texttt{wrsOSStatusGroup}.} + \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsTimingStatus}{ + Collective status of the synchronization subsystem. In case of an + error or warning, please check status objects in the + \texttt{wrsTimingStatusGroup}.} + \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsNetworkingStatus}{ + Collective status of the Ethernet switching subsystem. 
In case of an error + or warning, please check status objects in the + \texttt{wrsNetworkingStatusGroup}.\vspace{12pt}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\snmpentrys{WR-SWITCH-MIB}{}{wrsDetailedStatusesGroup}{ Branch with collective statuses of various switch subsystems.} - \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsOSStatusGroup} {\\ + %------------------------------------------------------------------------ + + \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsOSStatusGroup}{ + \underline{Description:} Group with collective statuses of the embedded operating system running on the switch.} - \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsBootSuccessful} {\\ + + \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsBootSuccessful}{ + \underline{Description:} Grouped status of \texttt{wrsBootStatusGroup}, indicating whether boot was successful. \texttt{Error} when dot-config source is wrong, unable to get the dot-config, unable to get URL to the dot-config, dot-config contains errors, unable to read the hwinfo, unable to load the FPGA bitstream, unable to load the LM32 software, any kernel modules or userspace daemons are - missing.} - \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsTemperatureWarning} {\\ - Report whether the temperature thresholds are not set or are exceeded.} - \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsMemoryFreeLow} {\\ - \texttt{Warning} when 50\% of the memory is used, error when more than 80\% - of the memory is used.} - \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsCpuLoadHigh} {\\ - \texttt{Warning} when the average CPU load is more than 2 for the past 1min, + missing.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Check \texttt{wrsBootConfigStatus}, if it reports an + error, please verify your WRS configuration. + \item Restart the switch + \item Please consult WR experts if the problem persists. 
+ \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsTemperatureWarning}{ + \underline{Description:} + Reports whether the temperature thresholds are not set or are exceeded.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Verify if your switch configuration contains valid temperature + thresholds. By default, they are all set to 80 \textdegree C. + \item Verify if cooling of the rack where WR Switch is installed works + properly. + \item Verify if both cooling fans in the back of the WR Switch case are + working. + \item Replace the switch with a new unit and consult the WR Switch + manufacturer for a repair. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsMemoryFreeLow}{ + \underline{Description:} + Reports \texttt{Warning} when more than 50\%, or \texttt{Error} when more + than 80\% of the memory is used.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Restart the switch + \item Send the dumped state of the switch to WR experts for analysis as + this might mean there is some internal problem in the WRS firmware. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsCpuLoadHigh}{ + \underline{Description:} + Reports \texttt{Warning} when the average CPU load is more than 2 for the past 1min, 1.5 for 5min or 1 for 15min. \texttt{Error} when the average CPU load is - more than 3 for the past 1min, 2 for 5min or 1.5 for 15min.} - \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsDiskSpaceLow} {\\ + more than 3 for the past 1min, 2 for 5min or 1.5 for 15min.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Restart the switch + \item Send the dumped state of the switch to WR experts for analysis as + this might mean there is some internal problem in the WRS firmware. 
+ \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsOSStatusGroup}{wrsDiskSpaceLow}{ + \underline{Description:} \texttt{Warning} when more than 80\% of any disk partition is used. - \texttt{Error} when more than 90\% of any disk partition is used.} - - \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsTimingStatusGroup} {\\ - Group with collective statuses of the timing subsystem.} - \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsPTPStatus} {\\ - \texttt{Error} when any of PTP error counters in - \texttt{wrsPtpDataTable} (\texttt{wrsPtpServoStateErrCnt}, - \texttt{wrsPtpClockOffsetErrCnt} or\\ \texttt{wrsPtpRTTErrCnt}) has - increased since the last scan (issue + \texttt{Error} when more than 90\% of any disk partition is used.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Check the values of \emph{CONFIG\_WRS\_LOG\_*} configuration options + on the switch. These are the parameters describing where log messages + should be sent from various processes in the switch. Normally users + don't need to modify them, but if any of them is set to a file in the + WRS filesystem (e.g. /tmp/snmp.log) this may reduce the free space after + some time of operation. + \item Restart the switch + \item Send the dumped state of the switch to WR experts for analysis as + this might mean there is some internal problem in the WRS firmware. 
+ \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + %------------------------------------------------------------------------ + + \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsTimingStatusGroup} { + \underline{Description:} + Group with collective statuses of the timing subsystem.} %\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsPTPStatus}{ + \underline{Description:} + Reports the status of PTP daemon running on the switch.\\ + \texttt{Error} when any of PTP error counters in \texttt{wrsPtpDataTable}\\ + (\texttt{wrsPtpServoStateErrCnt}, \texttt{wrsPtpClockOffsetErrCnt} or\\ + \texttt{wrsPtpRTTErrCnt}) has increased since the last scan (issue \ref{fail:timing:ppsi_track_phase}, \ref{fail:timing:offset_jump}, \ref{fail:timing:rtt_jump}), at least one of the $\Delta_{TXM}$, $\Delta_{RXM}$, $\Delta_{TXS}$, $\Delta_{RXS}$ is 0 (issue \ref{fail:timing:deltas_report}) or PTP servo update counter is not - increasing.} - \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsSoftPLLStatus} {\\ + increasing.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Check \texttt{wrsSoftPLLStatus} on the Master (WR device one step + higher in a timing hierarchy). Eventually proceed to investigate the + problem on the Master switch. Otherwise, continue with the primary WRS. + \item Verify if the link to WR Master was not lost by checking the + object\\ \texttt{wrsSlaveLinksStatus}. + \item If this is not the case, restart the switch. + \item If the problem persists replace the switch with a new unit. + %(see \ref{cern:wrs_replacement}). 
+ \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsSoftPLLStatus}{ + \underline{Description:} + Reports the status of the PLLs inside the switch.\\ \texttt{Error} when \texttt{wrsSpllSeqState} is not \emph{Ready}, or \texttt{wrsSpllAlignState} is not \emph{Locked} (for Grand Master mode), or - any of \texttt{wrsSpllHlock}, \texttt{wrsSpllMlock} equals to 0 (for Slave - mode) (issue \ref{fail:timing:spll_unlock}).\\ + any of \texttt{wrsSpllHlock}, \texttt{wrsSpllMlock} equals to 0 (for + Boundary Clock mode).\\ \texttt{Warning} when \texttt{wrsSpllDelCnt} $>$ 0 (for Grand Master mode) - or \texttt{wrsSpllDelCnt} has changed (for all other modes).} - \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsSlaveLinksStatus} {\\ - \texttt{Error} when link to Master is down for a switch in the Slave mode - (issue \ref{fail:timing:master_down}). Additionally, \texttt{Error} when the - link to Master is up for a switch in the Free-running Master or Grand - Master mode.} - \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsPTPFramesFlowing} {\\ - \texttt{Error} when PTP Tx/Rx frame counters on active links (Master / Slave - ports) are not being incremented. Report the first run.} - - \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsNetworkingStatusGroup} {\\ - Group with collective statuses of the networking subsystem.} - \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsSFPsStatus} {\\ + or \texttt{wrsSpllDelCnt} has changed (for all other modes).\\ + \underline{On error:}\\ + For GrandMaster WRS: + \begin{pck_proc} + \item Dump state + \item Check 1-PPS and 10 MHz signals coming from an external source. + Verify if they are properly connected and, in case of GPS receiver, + check if it is synchronized and locked. + \item Restart the GrandMaster switch. + \item If the problem persists, replace the switch with a new unit. + %(see \ref{cern:wrs_replacement}). 
+ \end{pck_proc} + \glspar For Boundary Clock WRS: + \begin{pck_proc} + \item Dump state + \item Check \texttt{wrsSoftPLLStatus} on the Master. Eventually proceed to + investigate the problem on the Master switch. + \item Verify if the link to WR Master was not lost by checking the + object\\ \texttt{wrsSlaveLinksStatus}. + \item Restart the switch. + \item If the problem persists, replace the switch with a new unit. + %(see \ref{cern:wrs_replacement}). + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsSlaveLinksStatus}{ + \underline{Description:} + Reports the status of the link on WR ports configured to slave role.\\ + \texttt{Error} when link to master is down for a switch in the Boundary + Clock mode. Additionally, \texttt{Error} is generated when the + link to master is up for a switch in the Free-running Master or Grand + Master mode.\\ + \underline{On error:}\\ + For Master/GrandMaster WRS: + \begin{pck_proc} + \item Check the configuration of the switch. Especially if the + \emph{Timing Mode} is correctly set (i.e. if it was not accidentally set + to \emph{Boundary Clock}). + \item Check the role of each port timing configuration. They should be all + set to \emph{master}. If any of them is set to \emph{slave} you should + verify if there is no WR Master connected to it. + \end{pck_proc} + \glspar For Boundary Clock WRS: + \begin{pck_proc} + \item Check the fiber connection on the slave port of the WRS. + \item Check the configuration of the switch. Especially if the + \emph{Timing Mode} is correctly set (i.e. if it was not accidentally set + to \emph{Grand-Master} or \emph{Free-Running Master}). + \item Check the status of the WR Master connected to the slave port of the + WRS. + \item Replace the faulty switch with a new unit, if this does not solve + the problem, make sure your fiber link is not broken. 
+ \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsTimingStatusGroup}{wrsPTPFramesFlowing}{ + \underline{Description:} + Reports \texttt{Error} when PTP frames are not being sent and received on + active WR ports - Tx/Rx frame counters on active links (master / slave + ports) are not being incremented. Reports also \texttt{FirstRead} value.\\ + \underline{On error:} + \begin{pck_proc} + \item Check Syslog message to determine the WR port on which the + problem is reported. You should see a message similar to this one:\\ + \texttt{SNMP: wrsPTPFramesFlowing failed for port 1} + \item Check your network layout and the WR Switch configuration. If you + have some non-WR devices connected to ports of the WR Switch (e.g. + computer sending/receiving only data, without the need of + synchronization), these ports should have their role in the timing + configuration set to \emph{non-wr}. + \item Check the status of a WR device connected to the reported port. + \item Restart the switch. + \item If the problem persists, please contact WR experts for in-depth + investigation. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + %------------------------------------------------------------------------ + + \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsNetworkingStatusGroup}{ + \underline{Description:} + Group with collective statuses of the networking subsystem.\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsSFPsStatus}{ + \underline{Description:} + Reports the status of SFP transceivers inserted to the switch.\\ \texttt{Error} when any of the SFPs reports an error. 
To find out which SFP - caused the problem check \texttt{wrsPortStatusSfpError.<n>}.} - \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsEndpointStatus} {\\ + caused the problem check \texttt{wrsPortStatusSfpError.<n>}.\\ + \underline{On error:} + \begin{pck_proc} + \item Check \texttt{wrsPortStatusSfpError.*} SNMP objects or Syslog + messages to determine the WR port on which the problem is reported. In + case of Syslog, you should see a message similar to this one:\\ + \texttt{Unknown SFP vn="AVAGO" pn="ABCU-5710RZ" vs="AN1151PD8A" on port + wr1} + \item If the reported port is intended to be used to connect a device that + does not require WR synchronization (e.g. using a copper SFP module), + then you should verify whether the role in the timing configuration for + this port is set to \emph{non-wr}. + \item Otherwise, you should use a WR-supported SFP module and make sure it + is declared together with calibration values in the WRS configuration. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsEndpointStatus}{ + \underline{Description:} + Reports the status of Ethernet MAC endpoints on WR ports\\ \texttt{Error} when there is a fault in the Endpoint's - transmission/reception path (issue \ref{fail:data:ep_txrx}).} - \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsSwcoreStatus} {\\ - Not used in the current release. Always reports \texttt{OK}.} - \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsRTUStatus} {\\ - \texttt{Error} when RTU is full and cannot accept more requests (issue - \ref{fail:data:rtu_full}).} - - \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsVersionGroup} {\\ + transmission/reception path.\\ + \underline{On error:} + \begin{pck_proc} + \item Make several state dumps. + \item Restart the switch. + \item Check Syslog messages to determine the WR port on which the problem + is reported. 
You should see a message similar to this one:\\ + \texttt{SNMP: wrsEndpointStatus failed for port 1} + \item Check the fiber link on a reported port, i.e. try replacing SFP + transceivers on both sides of the link, try using another fiber. + \item If the problem persists, please contact WR experts for in-depth + investigation. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsSwcoreStatus}{ + \underline{Description:} + Reports the status of the Ethernet switching module.\\ + Status object not implemented in the current firmware release. Always + reports \texttt{OK}.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state. + \item Restart the switch. + \item Please contact WR experts since this might mean that either there is + too much high priority traffic in your network, or there is some + internal problem in the WRS firmware. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + \snmpentrys{WR-SWITCH-MIB}{wrsNetworkingStatusGroup}{wrsRTUStatus}{ + \underline{Description:} + Reports the status of the routing module responsible for deciding where (to + which port) incoming Ethernet frames should be forwarded.\\ + \texttt{Error} when RTU is overloaded and cannot accept more requests.\\ + \underline{On error:} + \begin{pck_proc} + \item Dump state + \item Restart the switch. + \item If possible, try reducing the load of small Ethernet frames flowing + through your switch. If possible in your application, try using larger + Ethernet frames with lower load to transfer information. + \end{pck_proc} + \glspar \underline{Related problems:}\vspace{12pt}} + + %------------------------------------------------------------------------ + + \snmpentrys{WR-SWITCH-MIB}{wrsDetailedStatusesGroup}{wrsVersionGroup}{ + \underline{Description:} Hardware, gateware and software versions. 
Additionally the serial number and other hardware information for the WRS.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwVersion} {\\ - software version (as returned from the \texttt{git describe} at build - time).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwBuildBy} {\\ - software build-by (as returned from the \texttt{git config --get-all - user.name} at build time).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwBuildDate} {\\ - software build date (\texttt{\_\_DATE\_\_} at build time).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionBackplaneVersion} {\\ - hardware version of the minibackplane PCB.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionFpgaType} {\\ + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwVersion}{ + \underline{Description:} + Software version in the form of release version and eventually git commit + from the repository (information provided from \emph{git describe} command + at build time).} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwBuildBy}{ + \underline{Description:} + Information who has built the firmware running on the switch (provided from + \texttt{git config --get-all user.name} command at build time).} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwBuildDate}{ + \underline{Description:} + Firmware build date (\texttt{\_\_DATE\_\_} at build time).} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionBackplaneVersion}{ + \underline{Description:} + Hardware version of the Minibackplane board.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionFpgaType}{ + \underline{Description:} FPGA model inside the switch.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionManufacturer} {\\ - name of the manufacturing company.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwitchSerialNumber} {\\ - serial number (or string) of the switch.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionScbVersion} {\\ - version 
of the SCB (the motherboard).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGwVersion} {\\ - version of the gateware (FPGA bitstream).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGwBuild} {\\ - build ID of the gateware (FPGA bitstream).} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwitchHdlCommitId} {\\ - gateware version: commit ID from the \texttt{wr\_switch\_hdl} repository.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGeneralCoresCommitId} {\\ - gateware version: commit ID from the \texttt{general-cores} repository.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionWrCoresCommitId} {\\ - gateware version: commit ID from the \texttt{wr-cores} repository.} - \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionLastUpdateDate} {\\ - date and time of last firmware update, this information may not be accurate, - due to hard restarts or lack of the proper time at update.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionManufacturer}{ + \underline{Description:} + Name of the manufacturing company.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwitchSerialNumber}{ + \underline{Description:} + Serial number of the switch.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionScbVersion}{ + \underline{Description:} + Hardware version of the main SCB board.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGwVersion}{ + \underline{Description:} + Version of the FPGA bitstream (Gateware).} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGwBuild}{ + \underline{Description:} + Build ID of the FPGA bitstream - the synthesis date.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionSwitchHdlCommitId}{ + \underline{Description:} + FPGA bitstream commit ID from the main \texttt{wr\_switch\_hdl} repository.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionGeneralCoresCommitId}{ + \underline{Description:} + FPGA bitstream commit ID from the \texttt{general-cores}
repository.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionWrCoresCommitId}{ + \underline{Description:} + FPGA bitstream commit ID from the \texttt{wr-cores} repository.} + + \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionLastUpdateDate}{ + \underline{Description:} + Date and time of the last firmware update, this information may not be + accurate, due to hard restarts or lack of the proper time during the + upgrade.} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -145,17 +396,17 @@ \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsFaultLR}{Not implemented} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsConfigSource}{} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsConfigSourceUrl}{} - \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsRestartReasonMonit}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsRestartReasonMonit}{ Process that caused \texttt{monit} to trigger a restart.} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootConfigStatus}{} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootHwinfoReadout}{} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootLoadFPGA}{} \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootLoadLM32}{} - \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootKernelModulesMissing}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootKernelModulesMissing}{ List of kernel modules is defined in the source code.} - \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootUserspaceDaemonsMissing}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsBootUserspaceDaemonsMissing}{ List of daemons is defined in the source code.} - \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsGwWatchdogTimeouts}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsBootStatusGroup}{wrsGwWatchdogTimeouts}{ Number of times the watchdog has restarted the HDL module responsible for the Ethernet switching process} @@ -180,7 +431,7 @@ \snmpentrye{WR-SWITCH-MIB}{wrsCpuLoadGroup}{wrsCPULoadAvg5min}{} 
\snmpentrye{WR-SWITCH-MIB}{wrsCpuLoadGroup}{wrsCPULoadAvg15min}{} - \snmpentrye{WR-SWITCH-MIB}{wrsOperationStatus}{wrsDiskTable}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsOperationStatus}{wrsDiskTable}{ Table with a row for every partition.} \snmpentrye{WR-SWITCH-MIB}{wrsDiskTable}{wrsDiskIndex.<n>}{} \snmpentrye{WR-SWITCH-MIB}{wrsDiskTable}{wrsDiskMountPath.<n>}{} @@ -269,11 +520,11 @@ \snmpentrye{WR-SWITCH-MIB}{}{wrsPtpDataTable}{Table with a row per PTP servo instance.} \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpIndex.<n>}{} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpPortName.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpPortName.<n>}{ The port on which the instance is running.} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpGrandmasterID.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpGrandmasterID.<n>}{ Not implemented.} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpOwnID.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpOwnID.<n>}{ Not implemented.} \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpMode.<n>}{} \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpServoState.<n>}{} @@ -290,12 +541,12 @@ \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpDeltaRxM.<n>}{} \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpDeltaTxS.<n>}{} \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpDeltaRxS.<n>}{} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpServoStateErrCnt.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpServoStateErrCnt.<n>}{ Number of the servo updates when servo is out of the TRACK\_PHASE.} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpClockOffsetErrCnt.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpClockOffsetErrCnt.<n>}{ Number of servo updates when offset is larger than 500ps or smaller than -500ps.} - \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpRTTErrCnt.<n>}{\\ + \snmpentrye{WR-SWITCH-MIB}{wrsPtpDataTable}{wrsPtpRTTErrCnt.<n>}{ Number of servo updates when 
RTT delta between subsequent updates is larger than 1000ps or smaller than -1000ps.} diff --git a/doc/wrs_failures/wrs_failures.tex b/doc/wrs_failures/wrs_failures.tex index 4a746d7e3..eba4f15f9 100644 --- a/doc/wrs_failures/wrs_failures.tex +++ b/doc/wrs_failures/wrs_failures.tex @@ -12,6 +12,7 @@ \usepackage[latin1]{inputenc} \usepackage{verbatim} \usepackage{amsmath} +\usepackage{textcomp} \usepackage{times,mathptmx} \usepackage{chngcntr} \usepackage{hyperref} @@ -51,6 +52,13 @@ \setlength{\parsep}{0pt} }{\end{itemize}} +\newenvironment{pck_proc}{ +\begin{enumerate}[topsep=2pt] + \setlength{\itemsep}{1pt} + \setlength{\parskip}{0pt} + \setlength{\parsep}{0pt} +}{\end{enumerate}} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % creating subsubsubsection notation @@ -252,7 +260,12 @@ %\bibliographystyle{unsrt} %\bibliography{references} - +\appendix +\newpage +\section{Sorted list of all MIB objects} +\label{sec:snmp_exports:sorted} +% print alphabetical list +\printnoidxglossary[type=snmp_all,style=tree,sort=letter] % add not used entries, but don't display their's section @@ -263,7 +276,7 @@ \ifglsused{\thislabel}{}{\glsadd[format=ignore]{\thislabel}}% } -\newpage -\input{procedures.tex} +%\newpage +%\input{procedures.tex} \end{document} -- GitLab