From 2ed49ee65a76a3917eda653c52a69bfc5e17ee0e Mon Sep 17 00:00:00 2001 From: "Matthias P. Braendli" Date: Mon, 30 Mar 2020 15:31:45 +0200 Subject: Add zmq2edi munin stats script --- doc/stats_zmq2edi_munin.py | 148 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 doc/stats_zmq2edi_munin.py diff --git a/doc/stats_zmq2edi_munin.py b/doc/stats_zmq2edi_munin.py new file mode 100644 index 0000000..cf84afc --- /dev/null +++ b/doc/stats_zmq2edi_munin.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# +# A munin plugin for ODR-ZMQ2EDI +# +# Reads the logfile, and the previously rotated logfile (suffixed by .1) and +# analyses the output. Generates a graph with percentage of frames late, and a +# graph with min/max wait time. +# +# Copy this to /etc/munin/plugins/stats_zmq2edi_munin +# and make it executable (chmod +x) +# +# Then make sure that zmq2edi log output gets written to LOGFILE below, +# and setup up a logrotate script to rotate the log. The rotated log +# filename must be appended with .1 + +# Every six seconds a line is output. We are polled in 5 min = 300s intervals +NUM_LINES = int(300 / 6) +LOGFILE = "/var/log/supervisor/zmq2edi.log" + +import time +import sys +import os +import re + +munin_config = """ +multigraph wait_time_zmq2edi +graph_title zmq2edi wait_time +graph_order high low +graph_args --base 1000 +graph_vlabel max/min wait times during last ${graph_period} +graph_category zmq2edi +graph_info This graph shows the min and max wait times + +high.info Max wait time +high.label Max wait time ms +high.min 0 +high.warning 1: +low.info Min wait time +low.label Min wait time ms +low.min -6000 +low.warning 1: + +multigraph late_packets_zmq2edi +graph_title EDI packets delivered too late +graph_order late +graph_args --base 1000 +graph_vlabel late packets during last ${graph_period} +graph_category zmq2edi +graph_info This graph shows the number late EDI packets (250 packets = 6 seconds) + +late.info Number of late packets +late.label Number of late packets +late.min 0 +late.max %s +late.warning 0:0 +""" % (NUM_LINES * 250,) + +def parse_logs(): + # example lines: + # Buffering time statistics [milliseconds]: min: 907.799 max: 981.409 mean: 944.335 stdev: 26.827 late: 0 of 250 (0%) + # Values might also be in scientific form, e.g. -1.80938e+07 + re_logline = re.compile(r"""Buffering time statistics.* min: (.+) max: (.+) mean: (.+) stdev: (.+) late: (.+) of 250""", flags=re.ASCII) + + # The following lines are output at startup and during a reset respectively: + startup_pattern = "starting up" + backoff_pattern = "Backoff" + + lines = [] + + # Check that the file exists and was last written to in the previous 2* 6s, + # otherwise assume the tool isn't running + if not os.path.exists(LOGFILE) or (time.time() - os.stat(LOGFILE).st_mtime) > 12: + num_late = None + t_min_period = None + t_max_period = None + else: + # Keep only the last NUM_LINES + + # Read the previously rotated logfile too to make sure we have enough data + for fname in [LOGFILE+ ".1", LOGFILE]: + if os.path.exists(fname): + with open(fname, "r") as fd: + for line in fd: + lines.append(line) + if len(lines) > NUM_LINES: + del lines[0] + + # Calculate min, max over the whole period, and sum the number of late + num_late = 0 + t_min_period = None + t_max_period = None + num_statistics = 0 + + for line in lines: + if startup_pattern in line: + num_late += 250 + elif backoff_pattern in line: + num_late += 250 + else: + match = re_logline.search(line) + if match: + num_statistics += 1 + t_min = float(match.group(1)) + t_max = float(match.group(2)) + t_mean = float(match.group(3)) + stdev = float(match.group(4)) + late = int(match.group(5)) + + if t_min_period is None or t_min < t_min_period: + t_min_period = t_min + + if t_max_period is None or t_max > t_max_period: + t_max_period = t_max + + if num_late is None: + num_late = 0 + + num_late += late + + # The min can take extremely low values, we clamp it here to -6 seconds + # to keep the graph readable + if t_min_period is not None and t_min_period < -6000: + t_min_period = -6000 + + return num_late, round(t_min_period) if t_min_period is not None else None, round(t_max_period) if t_max_period is not None else None + +def muninify(value): + """ According to http://guide.munin-monitoring.org/en/latest/develop/plugins/plugin-concise.html#plugin-concise + "If the plugin - for any reason - has no value to report, then it may send the value U for undefined." + """ + return 'U' if value is None else value + +# No arguments means that munin wants values +if len(sys.argv) == 1: + num_late, t_min, t_max = parse_logs() + + munin_values = "multigraph wait_time_zmq2edi\n" + munin_values += "high.value {}\n".format(muninify(t_max)) + munin_values += "low.value {}\n".format(muninify(t_min)) + + munin_values += "multigraph late_packets_zmq2edi\n" + munin_values += "late.value {}\n".format(muninify(num_late)) + print(munin_values) + +elif len(sys.argv) == 2 and sys.argv[1] == "config": + print(munin_config) +else: + sys.exit(1) -- cgit v1.2.3