Repository
Munin (contrib)
Last change
2020-08-25
Graph Categories
webserver
Family
contrib
Capabilities
Keywords
Language
Python (3.x)
License
GPL-3.0-only
Authors
Igor Borodikhin
nginx_upstream_multi_

Sadly there is no documentation for this plugin.

#!/usr/bin/env python3
#
# Munin plugin to monitor the number of requests, cache statuses, HTTP status codes and average
# request times of the specified nginx upstreams.
#
# Copyright Igor Borodikhin
#
# License : GPLv3
#
# Configuration parameters:
# env.graphs - which graphs to produce (optional, list of graphs separated by spaces, default -
#              cache http time request)
# env.log - log file path (mandatory, ex.: /var/log/nginx/upstream.log)
# env.upstream - list of upstreams to monitor (mandatory, space-separated, including port
#                numbers, e.g.: 10.0.0.1:80 10.0.0.2:8080)
# env.statuses - list of http status codes to monitor (optional, default - all statuses,
#                e.g.: 200 403 404 410 500 502)
# env.percentiles - which percentiles to draw on time graphs (optional, list of percentiles
#                   separated by spaces, default - 80)
#
# ## Installation
# Copy the file to /usr/share/munin/plugins/ and create a symbolic link for each log file
# you wish to monitor.
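#
# For example (the symlink suffix after "nginx_upstream_multi_" is the site name used in graph
# names, the state file and the plugin-conf section; paths assume a standard Munin layout):
#
#     cp nginx_upstream_multi_ /usr/share/munin/plugins/
#     ln -s /usr/share/munin/plugins/nginx_upstream_multi_ \
#         /etc/munin/plugins/nginx_upstream_multi_upstream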
#
# Specify log_format at /etc/nginx/conf.d/upstream.conf:
# log_format upstream "ua=[$upstream_addr] ut=[$upstream_response_time] "
#                     "us=[$upstream_status] cs=[$upstream_cache_status]";
#
# Use it in your site configuration (/etc/nginx/sites-enabled/anything.conf):
# access_log /var/log/nginx/upstream.log upstream;
#
# Attention! Since the default user (nobody) does not have read permission for the nginx log
# files, the plugin needs to run as root.
#
# And specify some options in /etc/munin/plugin-conf.d/munin-node:
#
#     [nginx_upstream_multi_upstream]
#     user root
#     env.graphs cache http time request
#     env.log /var/log/nginx/upstream.log
#     env.upstream 10.0.0.1:80 10.0.0.2:8080 unix:/tmp/upstream3
#     env.statuses 200 403 404 410 500 502
#     env.percentiles 50 80
#
#  #%# family=contrib

import copy
import math
import os
import re
import sys
import time


# How we've been called
progName = sys.argv[0]
progName = progName[progName.rfind("/") + 1:]


# Where to store plugin state
stateDir = os.environ.get("MUNIN_PLUGSTATE", None)

# Which site configuration we should use
siteName = progName[len("nginx_upstream_multi_"):]

# Log path
logPath = os.environ.get("log", "/var/log/nginx/access.log")

# HTTP status list
httpStatusString = (
    "100:Continue;101:Switching protocols;102:Processing;200:OK;201:Created;202:Accepted;"
    "203:Non-Authoritative Information;204:No content;205:Reset content;206:Partial content;"
    "207:Multi-status;226:IM used;300:Multiple choices;301:Moved permanently;"
    "302:Moved temporarily;303:See other;304:Not modified;305:Use proxy;307:Temporary redirect;"
    "400:Bad request;401:Unauthorized;402:Payment required;403:Forbidden;404:Not found;"
    "405:Method not allowed;406:Not acceptable;407:Proxy Authentication Required;"
    "408:Request timeout;409:Conflict;410:Gone;411:Length required;412:Precondition failed;"
    "413:Request entity too large;414:Request URI too large;415:Unsupported media type;"
    "416:Request range not satisfiable;417:Expectation failed;422:Unprocessable entity;"
    "423:Locked;424:Failed dependency;425:Unordered collection;426:Upgrade required;"
    "449:Retry with;456:Unrecoverable error;500:Internal server error;501:Not implemented;"
    "502:Bad gateway;503:Service unavailable;504:Gateway timeout;505:HTTP version not supported;"
    "506:Variant also negotiates;507:Insufficient storage;508:Loop detected;"
    "509:Bandwidth limit exceeded;510:Not extended")

# an empty list of wanted statuses is interpreted as: all statuses
statuses = os.environ.get("statuses", "").split()

httpStatusList = {}
for statusString in httpStatusString.split(";"):
    [code, title] = statusString.split(":")
    if not statuses or code in statuses:
        httpStatusList[code] = {
            "title": title,
            "requests": 0
        }

cacheStatusList = {"MISS": 0, "BYPASS": 0, "EXPIRED": 0, "UPDATING": 0, "STALE": 0, "HIT": 0}

# Parse upstreams
upstreams = {}
if "upstream" in os.environ:
    upstreamString = os.environ["upstream"]
    upstreamList = upstreamString.split()
    for upstream in upstreamList:
        upstreams[upstream] = {
            "requests": 0,
            "time": 0,
            "times": [],
            "cache": copy.deepcopy(cacheStatusList),
            "http": copy.deepcopy(httpStatusList)
        }
else:
    raise Exception("No upstreams specified")

percentiles = os.environ.get("percentiles", "80").split()

graphs_enabled = os.environ.get("graphs", "cache http time request").split()

now = int(time.time())

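# The state file stores the last read offset; its modification time marks the previous run and
# is used to convert counters into per-second rates.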
lastBytePath = os.path.join(stateDir, "nginx_upstream_multi_{}_lastByte.txt".format(siteName))
try:
    lastRun = os.path.getmtime(lastBytePath)
except OSError:
    lastRun = now


def sanitize(string):
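    # Munin field names must not contain ".", ":", "/" or "-", so map them to underscores.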
    return string.replace(".", "_").replace(":", "_").replace("/", "_").replace("-", "_")


if len(sys.argv) == 2 and sys.argv[1] == "config":
    # Parent graph declaration
    print("multigraph nginx_upstream_multi_%s" % siteName.replace(".", "_"))
    print("graph_title Requests number")
    print("graph_vlabel rps")
    print("graph_category webserver")
    for upstream in upstreams.keys():
        print("us%s_requests.label %s" % (sanitize(upstream), upstream))

    # Requests graph declaration
    if "request" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_requests"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Requests number - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            print("us%s_requests.label %s" % (sanitize(upstream), upstream))
            print()

    # Times graph declaration
    if "time" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_times"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Request time - %s" % upstream)
            print("graph_vlabel sec.")
            print("graph_category webserver")
            print("us%s_times.label average" % (sanitize(upstream)))
            for percentile in percentiles:
                print("us%s_times_percentile_%s.label %s-percentile"
                      % (sanitize(upstream), percentile, percentile))
            print()

    # HTTP Status codes graph declaration
    if "http" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_statuses"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title HTTP - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            for status in sorted(httpStatusList.keys()):
                print("http%s_%s_status.label %s - %s"
                      % (status, sanitize(upstream), status, httpStatusList[status]["title"]))
            print()

    # Cache status graph declaration
    if "cache" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_cache"
                  % (sanitize(siteName), sanitize(upstream)))
            print("graph_title Cache - %s" % upstream)
            print("graph_vlabel rps")
            print("graph_category webserver")
            for status in cacheStatusList:
                print("us%s_%s_cache.label %s" % (sanitize(status), sanitize(upstream), status))
            print()
else:
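    # Fetch mode: parse the log lines added since the last run and print the current values.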
    timeElapsed = now - lastRun

    lastByteHandle = None

    try:
        lastByteHandle = open(lastBytePath, "r")
        lastByte = int(lastByteHandle.read())
    except Exception:
        lastByte = 0

    if lastByteHandle is not None:
        lastByteHandle.close()

    try:
        logHandle = open(logPath, "r")
    except Exception as e:
        print("Log file %s not readable: %s" % (logPath, e.strerror), file=sys.stderr)
        sys.exit(1)

    try:
        logSize = os.path.getsize(logPath)
    except OSError:
        logSize = 0

    if logSize < lastByte:
        lastByte = 0

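    # Field pattern produced by the "upstream" log_format shown in the header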
    regExp = re.compile(r"ua=\[(.*?)\]\s+ut=\[(.*?)\]\s+us=\[(.*?)\]\s+cs=\[(.*?)\]")

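    # Resume reading where the previous run stopped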
    logHandle.seek(lastByte)
    for line in logHandle:
        match = regExp.search(line)
        if match:
            # Extract data
            address = match.group(1)
            request_time = match.group(2)
            status = match.group(3)
            cache = match.group(4)

            # Replace separators by space
            address = address.replace(",", " ")
            address = address.replace(" : ", " ")
            address = re.sub(r"\s+", " ", address)

            request_time = request_time.replace(",", " ")
            request_time = request_time.replace(" : ", " ")
            request_time = re.sub(r"\s+", " ", request_time)

            status = status.replace(",", " ")
            status = status.replace(" : ", " ")
            status = re.sub(r"\s+", " ", status)

            cache = cache.replace(",", " ")
            cache = cache.replace(" : ", " ")
            cache = re.sub(r"\s+", " ", cache)

            addresses = address.split()
            times = request_time.split()
            statuses = status.split()
            caches = cache.split()

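            # A request may have been passed to several upstreams (e.g. after an error or retry);
            # attribute each entry to its upstream.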
            index = 0
            for uAddress in addresses:
                if uAddress in upstreams.keys():
                    # A "-" or missing entry means the value was not recorded for this upstream
                    try:
                        uTime = float(times[index])
                    except (IndexError, ValueError):
                        uTime = None

                    if index < len(statuses):
                        uStatus = statuses[index]
                    else:
                        uStatus = "-"

                    if index < len(caches):
                        uCache = caches[index]
                    else:
                        uCache = "-"

                    if uAddress != "-":
                        upstreams[uAddress]["requests"] += 1
                    if uTime is not None:
                        upstreams[uAddress]["time"] += uTime
                        upstreams[uAddress]["times"].append(uTime)
                    if uStatus != "-" and uStatus in upstreams[uAddress]["http"]:
                        upstreams[uAddress]["http"][uStatus]["requests"] += 1
                    if uCache != "-" and uCache in upstreams[uAddress]["cache"]:
                        upstreams[uAddress]["cache"][uCache] += 1
                index += 1

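    # Remember how far we have read so the next run only parses new lines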
    try:
        lastByteHandle = open(lastBytePath, "w")
        lastByteHandle.write(str(logHandle.tell()))
        lastByteHandle.close()
    except Exception as e:
        print("Failed to write status file (%s): %s" % (lastBytePath, e.strerror), file=sys.stderr)
        sys.exit(1)

    logHandle.close()

    # Parent graph data
    print("multigraph nginx_upstream_multi_%s" % sanitize(siteName))
    for upstream in upstreams.keys():
        value = 0
        if timeElapsed > 0:
            value = upstreams[upstream]["requests"] / timeElapsed

        print("us%s_requests.value %s" % (sanitize(upstream), value))

    # Requests graph data
    if "request" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_requests"
                  % (sanitize(siteName), sanitize(upstream)))
            value = 0
            if timeElapsed > 0:
                value = upstreams[upstream]["requests"] / timeElapsed
            print("us%s_requests.value %s" % (sanitize(upstream), value))
            print()

    # Times graph data
    if "time" in graphs_enabled:
        for upstream in upstreams.keys():
            uTime = 0
            if upstreams[upstream]["requests"] > 0:
                uTime = upstreams[upstream]["time"] / upstreams[upstream]["requests"]
                upstreams[upstream]["times"].sort()
            print()
            print("multigraph nginx_upstream_multi_%s.%s_times"
                  % (sanitize(siteName), sanitize(upstream)))
            print("us%s_times.value %s" % (sanitize(upstream), uTime))
            for percentile in percentiles:
                percentileValue = 0
                sortedTimes = upstreams[upstream]["times"]
                if sortedTimes:
                    # Position of the requested percentile, clamped to the list bounds
                    percentileKey = min(int(percentile) * len(sortedTimes) / 100,
                                        len(sortedTimes) - 1)
                    if len(sortedTimes) % 2 > 0:
                        low = int(math.floor(percentileKey))
                        high = int(math.ceil(percentileKey))
                        percentileValue = (sortedTimes[low] + sortedTimes[high]) / 2
                    else:
                        percentileValue = sortedTimes[int(percentileKey)]
                print("us%s_times_percentile_%s.value %s"
                      % (sanitize(upstream), percentile, percentileValue))
            print()

    # HTTP Status codes graph data
    if "http" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_statuses"
                  % (sanitize(siteName), sanitize(upstream)))
            for status in sorted(httpStatusList.keys()):
                value = 0
                if timeElapsed > 0:
                    value = upstreams[upstream]["http"][status]["requests"] / timeElapsed

                print("http%s_%s_status.value %s" % (status, sanitize(upstream), value))
            print()

    # Cache status graph data
    if "cache" in graphs_enabled:
        for upstream in upstreams.keys():
            print()
            print("multigraph nginx_upstream_multi_%s.%s_cache"
                  % (sanitize(siteName), sanitize(upstream)))
            for status in cacheStatusList:
                value = 0
                if timeElapsed > 0:
                    value = upstreams[upstream]["cache"][status] / timeElapsed

                print("us%s_%s_cache.value %s" % (sanitize(status), sanitize(upstream), value))
            print()