diff options
| -rw-r--r-- | doc/stats_zmq2edi_munin.py | 148 | 
1 files changed, 148 insertions, 0 deletions
diff --git a/doc/stats_zmq2edi_munin.py b/doc/stats_zmq2edi_munin.py new file mode 100644 index 0000000..cf84afc --- /dev/null +++ b/doc/stats_zmq2edi_munin.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# +# A munin plugin for ODR-ZMQ2EDI +# +# Reads the logfile, and the previously rotated logfile (suffixed by .1) and +# analyses the output. Generates a graph with percentage of frames late, and a +# graph with min/max wait time. +# +# Copy this to /etc/munin/plugins/stats_zmq2edi_munin +# and make it executable (chmod +x) +# +# Then make sure that zmq2edi log output gets written to LOGFILE below, +# and setup up a logrotate script to rotate the log. The rotated log +# filename must be appended with .1 + +# Every six seconds a line is output. We are polled in 5 min = 300s intervals +NUM_LINES = int(300 / 6) +LOGFILE = "/var/log/supervisor/zmq2edi.log" + +import time +import sys +import os +import re + +munin_config = """ +multigraph wait_time_zmq2edi +graph_title zmq2edi wait_time +graph_order high low +graph_args --base 1000 +graph_vlabel max/min wait times during last ${graph_period} +graph_category zmq2edi +graph_info This graph shows the min and max wait times + +high.info Max wait time +high.label Max wait time ms +high.min 0 +high.warning 1: +low.info Min wait time +low.label Min wait time ms +low.min -6000 +low.warning 1: + +multigraph late_packets_zmq2edi +graph_title EDI packets delivered too late +graph_order late +graph_args --base 1000 +graph_vlabel late packets during last ${graph_period} +graph_category zmq2edi +graph_info This graph shows the number late EDI packets (250 packets = 6 seconds) + +late.info Number of late packets +late.label Number of late packets +late.min 0 +late.max %s +late.warning 0:0 +""" % (NUM_LINES * 250,) + +def parse_logs(): +    # example lines: +    #  Buffering time statistics [milliseconds]: min: 907.799 max: 981.409 mean: 944.335 stdev: 26.827 late: 0 of 250 (0%) +    # Values might also be in scientific form, e.g. -1.80938e+07 +    re_logline = re.compile(r"""Buffering time statistics.* min: (.+) max: (.+) mean: (.+) stdev: (.+) late: (.+) of 250""", flags=re.ASCII) + +    # The following lines are output at startup and during a reset respectively: +    startup_pattern = "starting up" +    backoff_pattern = "Backoff" + +    lines = [] + +    # Check that the file exists and was last written to in the previous 2* 6s, +    # otherwise assume the tool isn't running +    if not os.path.exists(LOGFILE) or (time.time() - os.stat(LOGFILE).st_mtime) > 12: +        num_late = None +        t_min_period = None +        t_max_period = None +    else: +        # Keep only the last NUM_LINES + +        # Read the previously rotated logfile too to make sure we have enough data +        for fname in [LOGFILE+ ".1", LOGFILE]: +            if os.path.exists(fname): +                with open(fname, "r") as fd: +                    for line in fd: +                        lines.append(line) +                        if len(lines) > NUM_LINES: +                            del lines[0] + +        # Calculate min, max over the whole period, and sum the number of late +        num_late = 0 +        t_min_period = None +        t_max_period = None +        num_statistics = 0 + +        for line in lines: +            if startup_pattern in line: +                num_late += 250 +            elif backoff_pattern in line: +                num_late += 250 +            else: +                match = re_logline.search(line) +                if match: +                    num_statistics += 1 +                    t_min = float(match.group(1)) +                    t_max = float(match.group(2)) +                    t_mean = float(match.group(3)) +                    stdev = float(match.group(4)) +                    late = int(match.group(5)) + +                    if t_min_period is None or t_min < t_min_period: +                        t_min_period = t_min + +                    if t_max_period is None or t_max > t_max_period: +                        t_max_period = t_max + +                    if num_late is None: +                        num_late = 0 + +                    num_late += late + +    # The min can take extremely low values, we clamp it here to -6 seconds +    # to keep the graph readable +    if t_min_period is not None and t_min_period < -6000: +        t_min_period = -6000 + +    return num_late, round(t_min_period) if t_min_period is not None else None, round(t_max_period) if t_max_period is not None else None + +def muninify(value): +    """ According to http://guide.munin-monitoring.org/en/latest/develop/plugins/plugin-concise.html#plugin-concise +    "If the plugin - for any reason - has no value to report, then it may send the value U for undefined." +    """ +    return 'U' if value is None else value + +# No arguments means that munin wants values +if len(sys.argv) == 1: +    num_late, t_min, t_max = parse_logs() + +    munin_values = "multigraph wait_time_zmq2edi\n" +    munin_values += "high.value {}\n".format(muninify(t_max)) +    munin_values += "low.value {}\n".format(muninify(t_min)) + +    munin_values += "multigraph late_packets_zmq2edi\n" +    munin_values += "late.value {}\n".format(muninify(num_late)) +    print(munin_values) + +elif len(sys.argv) == 2 and sys.argv[1] == "config": +    print(munin_config) +else: +    sys.exit(1)  | 
