From e63ffd2dcc61ff775b8003d99b86b6abfdb49503 Mon Sep 17 00:00:00 2001 From: "Matthias P. Braendli" Date: Mon, 7 Aug 2017 17:10:54 +0200 Subject: Add xymon monitoring script and update README --- doc/README.md | 20 +- doc/retodrs.pl | 645 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 662 insertions(+), 3 deletions(-) create mode 100644 doc/retodrs.pl diff --git a/doc/README.md b/doc/README.md index b35d77d..fe4748e 100644 --- a/doc/README.md +++ b/doc/README.md @@ -37,7 +37,21 @@ space used within the common interleaved frame. Fast Information Ch. Main Service Channel COMMON INTERLEAVED FRAME -Configuration -============= -The configuration is given in a file, please see the example in doc/example.mux +Files in this folder +==================== +The configuration is given in a file, this folder contains examples. + +A basic example is in the file *example.mux*, a more complete view of the +settings is available in *advanced.mux* and configuration settings related to +service linking are shown in *servicelinking.mux* + +An explanation on how to use the remote control is in *remote_control.txt*, and +*zmq_remote.py* illustrates how to control ODR-DabMux using the ZMQ remote +control interface. + +Two scripts are used for monitoring systems: *stats_dabmux_multi.py* for Munin, +and *retodrs.pl* for Xymon. + +*DabMux.1* is an old manpage that describes the command line options that +existed in past versions. It is kept for archive. diff --git a/doc/retodrs.pl b/doc/retodrs.pl new file mode 100644 index 0000000..f457ca2 --- /dev/null +++ b/doc/retodrs.pl @@ -0,0 +1,645 @@ +#!/usr/bin/perl -w +# +# RETrieve_Open_Digital_Radio_Status, retodrs: +# Retrieve the status and statistics of an Opendigitalradio service and report +# the results to Xymon. The information is retrieved from the management server +# within ODR-DabMux. +# +# NOTE: This script MUST be run on the same machine as DabMux is running! The +# management server within DabMux is only accessible from localhost. +# Moreover, the check on availability only works for localhost. +# +# Written by W.J.M. Nelis, wim.nelis@ziggo.nl, 2016.12 +# +use strict ; +use Time::Piece ; # Format time +use ZMQ::LibZMQ3 ; # Message passing +use ZMQ::Constants qw(ZMQ_REQ) ; +use JSON::PP ; # Decode server message + +# +# Installation constants. +# ----------------------- +# +my $XyDisp = $ENV{XYMSRV} ; # Name of monitor server +my $XySend = $ENV{XYMON} ; # Monitor interface program +my $FmtDate = "%Y.%m.%d %H:%M:%S" ; # Default date format + $FmtDate = $ENV{XYMONDATEFORMAT} if exists $ENV{XYMONDATEFORMAT} ; +my $HostName= 'OzoNop' ; # 'Source' of this test +my $TestName= 'odr_mux' ; # Test name +my $XyInfo = "hostinfo host=$HostName" ; # Extract host info from xymon + +my @ColourOf= ( 'red', 'yellow', 'clear', 'green' ) ; + +# +# Define the URL to access the management server in DabMux. The access is +# limited to access from localhost! +# +my $ODRMgmtSrvr= 'tcp://127.0.0.1:12720' ; # URL of server + +# +# Define the parameters to show in the table and how to enter them in an RRD. +# From this definition a list of counter-like variables is compiled in hash +# %Counters. The values of these variables need to be saved from one pass of +# this script to the next. +# +my @Params= ( +# OdrName TableName RrdDefinition + [ 'state' , 'State' , '' ], + [ 'peak_left' , 'Peak left [dB]' , 'DS:PeakLeft:GAUGE:600:-100:100' ], + [ 'peak_right' , 'Peak right [dB]', 'DS:PeakRight:GAUGE:600:-100:100' ], + [ 'num_underruns', '' , 'DS:Underrun:DERIVE:600:0:U' ], + [ 'num_overruns' , '' , 'DS:Overrun:DERIVE:600:0:U' ], + [ 'rate_underruns', 'Underrun [/s]' , '' ], + [ 'rate_overruns' , 'Overrun [/s]' , '' ], + [ 'min_fill' , '' , 'DS:BufferMin:GAUGE:600:-1:U' ], + [ 'max_fill' , '' , 'DS:BufferMax:GAUGE:600:0:U' ] +) ; +my %Counters= () ; +foreach ( @Params ) { + next unless $$_[2] =~ m/DERIVE/ ; + $Counters{$$_[0]}= $$_[0] ; # Save name of counter-like variable + $Counters{$$_[0]}=~ s/^num_/rate_/ ; # Build name of derived variable +} # of foreach + +# +# Define the thresholds for the various DabMux statistics and any derived +# value. +# +my %Thresholds= ( + state => { red => qr/^(?:NoData)$/ }, + rate_underruns => { red => '20.0'}, + rate_overruns => { red => '20.0'}, +# peak_left => { yellow => ['< -80', '80'] }, +# peak_right => { yellow => ['< -80', '80'] } +) ; + +# +# Define the name of the file to hold the values of the counter-type variables. +# +my $SaveFile= '/usr/lib/xymon/client/ext/retodrs.sav' ; + +# +# Global variables. +# ----------------- +# +my $Now= localtime ; # Timestamp of tests + $Now= $Now->strftime( $FmtDate ) ; +my $Colour= $#ColourOf ; # Test status +my $Result= '' ; # Message to sent to Xymon + +my %HostInfo ; # Host information from xymon +my %Table0= () ; # Tables with results +my %Table1= () ; +my @SubChannel= () ; # Subchannel assignment +my %SubChannel= () ; # in both directions +my %ErrMsg ; # Error messages + $ErrMsg{$_}= [] foreach ( @ColourOf ) ; +my ($CurTime,$PrvTime) ; # Times of measurement +my %Prev= () ; # Variables in previous pass + +# +# Save an error message in intermediate list %ErrMsg. Function InformXymon will +# move these messages to the start of the xymon status message. +# +sub LogError($$) { + my $clr= shift ; # Status/colour of message + my $msg= shift ; # Error message + return unless defined $msg ; + return unless $msg ; + + chomp $msg ; # Clean up message, just to be sure + $msg=~ s/^\s+// ; $msg=~ s/\s+$// ; + + if ( exists $ErrMsg{$clr} ) { + push @{$ErrMsg{$clr}}, $msg ; + } else { + push @{$ErrMsg{clear}}, $msg ; + } # of else +} # of LogError + +# +# Issue a message the the logfile. As this script is run periodically by Xymon, +# StdOut will be redirected to the logfile. +# +sub LogMessage { + my $Msg= shift ; + my @Time= (localtime())[0..5] ; + $Time[4]++ ; $Time[5]+= 1900 ; + chomp $Msg ; + printf "%4d%02d%02d %02d%02d%02d %s\n", reverse(@Time), $Msg ; +} # of LogMessage + +sub max($$) { return $_[0] > $_[1] ? $_[0] : $_[1] ; } +sub min($$) { return $_[0] < $_[1] ? $_[0] : $_[1] ; } + + +# +# Function AnError is given a short description and a boolean value, whose value +# is false if the operation associated with the description failed. The result +# of this function is the opposite of the boolean value supplied. If failed, the +# description is entered in the error message list %ErrMsg, including the +# content of $!, if the latter is not empty. +# +sub AnError($$) { + if ( $_[1] ) { + return 0 ; # Return a false value + } else { + my $msg= $! ; # Retrieve any error message + if ( $msg eq '' ) { + LogError( 'clear', "$_[0] failed" ) ; + } else { + LogError( 'clear', "$_[0] failed:" ) ; + LogError( 'clear', " $msg" ) ; + } # of else + return 1 ; # Return a true value + } # of else +} # of AnError + +# +# Function ApplyThresholds determines for which channels threshold checks should +# be performed. Then it checks the collected statistics against their +# thresholds, and sets the status of those statistics accordingly. The status of +# the statistics which are not checked against a threshold are set to 'clear'. +# +sub ApplyThresholds() { + my $hr ; # Reference in a multi-level hash + + # + # Set flag ThresholdCheck at each subchannel. It is set to true if threshold + # checks should be performed. + # + if ( exists $HostInfo{select}{list} ) { + $hr= $HostInfo{select}{list} ; + $Table1{$_}{ThresholdCheck}= exists $$hr{$_} ? 1 : 0 foreach ( keys %Table1 ) ; + } else { + $Table1{$_}{ThresholdCheck}= 1 foreach ( keys %Table1 ) ; + } # of else + + # + # Invoke function CheckValue for each pair {subchannel,statistic} for which a + # threshold check should and can be performed. + # + foreach my $sub ( keys %Table1 ) { + next unless $Table1{$sub}{ThresholdCheck} ; + $hr= $Table1{$sub} ; # Ref to subchannel info + foreach my $var ( keys %Thresholds ) { + next unless exists $$hr{$var} ; + CheckValue( $$hr{$var}, $Thresholds{$var} ) ; + } # of foreach + } # of foreach +} # of ApplyThresholds + +# +# Function BuildMessage takes the the status and statistics in hash %Table and +# builds a message for Xymon. +# +sub BuildMessage() { + my $RrdMsg ; # RRD message + my $sub ; # Name of a sub channel + my ($Value,$Status) ; # Value and status of one statistic + my @Values ; # Values of one subchannel + my $hr ; # Reference into a hash + + # + # Check the subchannel assignment against the list of named subchannels. They + # should match. + # + for ( my $i= 0 ; $i<= $#SubChannel ; $i++ ) { + $hr= $SubChannel[$i] ; + next unless defined $hr ; + next if exists $Table1{$hr} ; + $SubChannel[$i]= undef ; + delete $SubChannel{$hr} ; + } # of for + foreach my $sub ( sort keys %Table1 ) { + next if exists $SubChannel{$sub} ; + $hr= $#SubChannel + 1 ; + $SubChannel[$hr]= $sub ; + } # of foreach + + # + # Build a table showing the services. + # + $Result = "\n" ; + foreach ( sort keys %Table0 ) { + $Result.= " \n" ; + } # of foreach + $Result.= "
$_ $Table0{$_}
\n\n" ; + + # + # Build the first part of the table to enter the statistics into RRD's and + # ultimately into graphs. + # + $RrdMsg = "\n" ; + $RrdMsg.= "" ; + $Result.= $RrdMsg ; +} # of BuildMessage + +# +# Function CheckPortStatus checks if the TCP port to access the management +# server is available in listen mode. The function result is true if the TCP +# port is found in the listen state, false otherwise. +# +sub CheckPortStatus($) { + my $url= shift ; # The url to check + my $Found= 0 ; # Function result + my @F ; # Fields of a line image + + my @netstat= `netstat -ln4` ; # Retrieve port status info + foreach ( @netstat ) { + chomp ; + @F= split ; +# next unless @F == 6 ; +# next unless $F[5] eq 'LISTEN' ; + next unless "$F[0]://$F[3]" eq $url ; + $Found= 1 ; # Port in listen state found + last ; # Terminate search + } # of foreach + return $Found ; +} # of CheckPortStatus + +# +# Function CheckValue checks the value of a statistic against its threshold(s). +# A reference to the value and a reference to the threshold definition are +# passed. +# +sub CheckValue($$) { + my $vr= shift ; # Reference to the variable + my $tr= shift ; # Reference to the threshold descriptor + my $clr ; + + $$vr{Status}= $#ColourOf ; # Default result + return if $$vr{Value} eq 'wait' ; + + for ( my $i= $#ColourOf ; $i >= 0 ; $i-- ) { + $clr= $ColourOf[$i] ; + next unless exists $$tr{$clr} ; + if ( ref($$tr{$clr}) eq 'ARRAY' ) { + foreach ( @{$$tr{$clr}} ) { + if ( ref($_) eq 'Regexp' ) { # Text check + $$vr{Status}= $i if $$vr{Value} =~ m/$_/ ; + } elsif ( m/^[-+\d\.]+$/ ) { # Numeric upperbound + $$vr{Status}= $i if $$vr{Value} > $_ ; + } elsif ( m/^<\s*([-+\d\.]+)$/ ) { # Numeric lowerbound + $$vr{Status}= $i if $$vr{Value} < $1 ; + } # of elsif + } # of foreach + } else { + if ( ref($$tr{$clr}) eq 'Regexp' ) { # Text check + $$vr{Status}= $i if $$vr{Value} =~ m/$$tr{$clr}/ ; + } elsif ( $$tr{$clr} =~ m/^[-+\d\.]+$/ ) { # Numeric upperbound + $$vr{Status}= $i if $$vr{Value} > $$tr{$clr} ; + } elsif ( $$tr{$clr} =~ m/^<\s*([-+\d\.]+)$/ ) { # Numeric lowerbound + $$vr{Status}= $i if $$vr{Value} < $1 ; + } # of elsif + } # of else + } # of for + +} # of CheckValue + +# +# Function ComputeRates computes the the rate of change of the counter-like +# variables. +# +sub ComputeRates() { + my $hr ; + my $val ; + + foreach my $sub ( keys %Table1 ) { + $hr= $Table1{$sub} ; # Ref into hash + + foreach my $var ( keys %Counters ) { + $$hr{$Counters{$var}}{Value}= 'wait' ; + $$hr{$Counters{$var}}{State}= undef ; + if ( exists $Prev{$sub}{$var} and + defined $$hr{$var}{Value} and + defined $PrvTime ) { + if ( $$hr{$var}{Value} >= $Prev{$sub}{$var} ) { + $val= ( $$hr{$var}{Value} - $Prev{$sub}{$var} ) / + ( $CurTime - $PrvTime ) ; + $$hr{$Counters{$var}}{Value}= sprintf( '%.2f', $val ) ; + } # of if + } # of if + } # of foreach + + } # of foreach +} # of ComputeRates + +# +# Function GetOneReply sends the supplied request to the management server and +# returns the result as a reference to a hash. If something went wrong, the +# result will be undef and an (appropiate?) error message is entered in %ErrMsg. +# +sub GetOneReply($$) { + my $socket = shift ; # Socket object + my $request= shift ; # Request string + + my $reqlng= length( $request ) ; # Length of request string + my $rc= zmq_send( $socket, $request, $reqlng ) ; + return undef if AnError( "Request \"$request\"", $rc == $reqlng ) ; + + my $reply= zmq_recvmsg( $socket ) ; + return undef if AnError( "Reply on \"$request\"", defined $reply ) ; + + $reply= decode_json( zmq_msg_data($reply) ) ; # Convert to Perl structure + return $reply ; +} # of GetOneReply + +# +# Function GetStatistics retrieves both the status and the statistics from the +# server within the DabMux. The results are collected in hash %Table. Subhash +# %{$Table{0}} will contain the service information, %{$Table{1}} will contain +# the subchannel status and statistics. +# +sub GetStatistics() { + my ($ctxt,$socket) ; # Connection variables + my ($reply,$rv) ; # Request/reply variables + my ($hr,$vr) ; # Refs into multi-level hash + + $CurTime= undef ; # No data collected yet + # + # Build a connection to the DabMux server. + # + $ctxt= zmq_ctx_new ; + return undef if AnError( 'Building context object', defined $ctxt ) ; + $socket= zmq_socket( $ctxt, ZMQ_REQ ) ; + return undef if AnError( 'Creating socket', defined $socket ) ; + $rv= zmq_connect( $socket, $ODRMgmtSrvr ) ; + return undef if AnError( 'Connecting to DabMux', $rv == 0 ) ; + + $reply= GetOneReply( $socket, 'info' ) ; + return undef unless defined $reply ; + %Table0= %$reply ; # Save overview of services + + my $Once= 1 ; # Loop control variable + while ( $Once ) { + $Once= 0 ; # Only one iteration. + # + # Retrieve the subchannel assignment. + # + $reply= GetOneReply( $socket, 'getptree' ) ; + if ( defined $reply ) { + foreach my $sub ( keys %{$$reply{subchannels}} ) { + $hr= $$reply{subchannels}{$sub} ; + next unless exists $$hr{id} ; + next unless $$hr{id} =~ m/^\d+$/ ; + $SubChannel[$$hr{id}]= $sub ; + $SubChannel{$sub}= $$hr{id} ; + } # of foreach + } else { + next ; # Skip rest of retrievals + } # of else + + # + # Retrieve the status and the statistics. + # + $reply= GetOneReply( $socket, 'state' ) ; + if ( defined $reply ) { + foreach my $sub ( keys %$reply ) { + $Table1{$sub}= {} ; # Preset result area + $hr= $Table1{$sub} ; + foreach ( keys %{$$reply{$sub}} ) { + $$hr{$_}{Value} = $$reply{$sub}{$_} ; + $$hr{$_}{Status}= undef ; + } # of foreach + } # of foreach + } else { + next ; # Skip retrieval of statistics + } # of else + + $reply= GetOneReply( $socket, 'values' ) ; + if ( defined $reply and exists $$reply{values} ) { + $CurTime= time ; # Save time of retrieval + foreach my $sub ( keys %{$$reply{values}} ) { + next unless exists $Table1{$sub} ; + next unless exists $$reply{values}{$sub}{inputstat} ; + $hr= $Table1{$sub} ; # Ref to destination + $vr= $$reply{values}{$sub}{inputstat} ; # Ref to source + foreach ( keys %$vr ) { + $$hr{$_}{Value} = $$vr{$_} ; + $$hr{$_}{Status}= undef ; + } # of foreach + } # of foreach +# } else { +# next ; + } # of else + + # + # Terminate the connection to the DabMux server. + # + } continue { + $rv= zmq_close( $socket ) ; + AnError( 'Closing socket', $rv == 0 ) ; + return 0 ; # Return a defined value + } # of continue / while +} # of GetStatistics + + +# +# Function GetXymonHostInfo retrieves the configuration of host $HostName from +# the xymon configuration file hosts.cfg. If tag ODR is present, it is handled. +# +sub GetXymonHostInfo() { + %HostInfo= ( select => { default => '^.+$' } ) ; # Default result + + my @Lines= `$XySend $XyDisp \"$XyInfo\"` ; # Retrieve host info + if ( @Lines != 1 ) { # Handle error condition + LogError( 'clear', 'Retrieval of host information from Xymon failed' ) ; + return ; + } # of if + + my ($Tag)= $Lines[0] =~ m/\b(ODR[^\s\|]+)/ ; # Extract tag ODR + return unless defined $Tag ; +# return if $Tag eq 'ODR' ; + + $Tag=~ s/^ODR\:// ; # Remove tag header + foreach my $sub ( split( /,/, $Tag ) ) { + if ( $sub =~ m/select\((.+)\)$/ ) { + $HostInfo{select}{list}{$_}= 0 foreach ( split(/;/,$1) ) ; + delete $HostInfo{select}{default} ; + } # of if + } # of foreach + +} # of GetXymonHostInfo + +# +# Function InformXymon sends the message, in global variable $Result, to the +# Xymon server. Any error messages in %ErrMsg are prepended to the message and +# the status (colour) of the message is adapted accordingly. +# +sub InformXymon() { + my $ErrMsg= '' ; + my $Clr ; # Colour of one sub-test + + for ( my $i= 0 ; $i < @ColourOf ; $i++ ) { + $Clr= $ColourOf[$i] ; + next unless @{$ErrMsg{$Clr}} ; + $Colour= min( $Colour, $i ) ; + $ErrMsg.= "&$Clr $_\n" foreach ( @{$ErrMsg{$Clr}} ) ; + } # of foreach + $ErrMsg.= "\n" if $ErrMsg ; + + $Colour= $ColourOf[$Colour] ; + $Result= "\"status $HostName.$TestName $Colour $Now\n" . + "Open Digital Radio DabMux status\n\n" . + "$ErrMsg$Result\"\n" ; + `$XySend $XyDisp $Result` ; # Inform Xymon + + $Result= '' ; # Reset message parameters + $Colour= $#ColourOf ; + $ErrMsg{$_}= [] foreach ( @ColourOf ) ; +} # of InformXymon + + +# +# Function RestoreCounters restores the values of counter-like variables, +# collected in the previous pass of this script, in hash %Prev. However, if the +# information is too old, nothing is restored. +# +sub RestoreCounters() { + my @F ; # Fields in a line image + + %Prev= () ; # Clear save area + $PrvTime= undef ; + unless ( open( FH, '<', $SaveFile ) ) { + LogError( 'yellow', "Can't read file $SaveFile : $!" ) ; + LogMessage( "Can't read file $SaveFile : $!" ) ; + return ; + } # of unless + + while ( ) { + chomp ; + @F= split ; + if ( $F[0] eq 'Time' ) { + last unless ( time - $F[1] < 1000 ) ; + $PrvTime= $F[1] ; + } elsif ( $F[0] eq 'Counter' ) { + $Prev{$F[1]}{$F[2]}= $F[3] ; + } # of elsif + } # of while + close( FH ) ; +} # of RestoreCounters + +# Function SaveCounters saves the counter-type variables in a file. They are +# retrieved in the next pass of this script, and will be used to calculate the +# rate in which these variables increase. +# +sub SaveCounters() { + # + # If the retrieval of the statistics failed, nothing should be saved. Perhaps + # the information saved at the (a?) previous pass is usable in the next pass. + # + return unless defined $CurTime ; + + unless ( open( FH, '>', $SaveFile ) ) { + LogError( 'yellow', "Can't write file $SaveFile : $!" ) ; + LogMessage( "Can't write file $SaveFile : $!" ) ; + return ; + } # of unless + + print FH "Time $CurTime\n" ; + foreach my $sub ( sort keys %Table1 ) { + foreach my $var ( sort keys %{$Table1{$sub}} ) { + next unless exists $Counters{$var} ; + next unless defined $Table1{$sub}{$var}{Value} ; + print FH "Counter $sub $var $Table1{$sub}{$var}{Value}\n" ; + } # of foreach + } # of foreach + close( FH ) ; +} # of SaveCounters + + +# +# MAIN PROGRAM. +# ============= +# +unless ( CheckPortStatus($ODRMgmtSrvr) ) { # If server down, + LogMessage( "URL \"$ODRMgmtSrvr\" is not available" ) ; + LogError( 'red', 'Server is not available' ) ; + InformXymon ; # report error via xymon + exit ; # and stop +} # of unless + +unless ( defined GetStatistics ) { # If retrieval fails, + InformXymon ; # report error via xymon + exit ; # and stop +} # of unless + +GetXymonHostInfo ; # Retrieve additional host info +if ( keys %Counters ) { + RestoreCounters ; # Get counters from previous pass + ComputeRates ; # Compute their rate of change + SaveCounters ; # Save counters for next pass +} # of if + +ApplyThresholds ; # Check for out-of-range values +BuildMessage ; # Build xymon message +InformXymon ; # Send message to xymon + -- cgit v1.2.3