Repository
Munin (contrib)
Last change
2021-01-26
Graph Categories
Family
contrib
Capabilities
Keywords
Language
Perl

raid

Sadly there is no documentation for this plugin.

#!/usr/bin/perl -w
#
# (c) 2007 Nathan Rutman nathan@clusterfs.com
#
# Plugin to monitor RAID status
#
# Results are % of healthy drives in a raid device
# and % rebuilt of devices that are resyncing.
#
#%# family=contrib
#%# capabilities=autoconf

# Munin "autoconf" request: tell the installer whether this plugin can work
# on this host.  The answer is "yes" when /proc/mdstat can be read and at
# least one of its lines contains "md"; otherwise "no" with a reason.
# The file is scanned directly in Perl, so no external grep is required.
if (defined $ARGV[0] and $ARGV[0] eq "autoconf") {
    my $has_md = 0;
    if (open(my $fh, '<', '/proc/mdstat')) {
        while (my $line = <$fh>) {
            if ($line =~ /md/) {
                $has_md = 1;
                last;
            }
        }
        close($fh);
    }
    print $has_md ? "yes\n" : "no (no RAID devices found)\n";
    # autoconf always exits successfully; the answer is carried by the output.
    exit 0;
}

# Munin "config" request: emit the graph-wide settings.  Execution then
# continues below, where the per-array field settings are printed while
# /proc/mdstat is parsed.
if (($ARGV[0] || '') eq "config") {
    my @graph_settings = (
        'graph_title RAID status',
        'graph_category disk',
        'graph_info This graph monitors RAID disk health.  Values are percentage of healthy drives in each raid group.  Degraded devices are marked Critical.',
        'graph_args --base 1000 -l 0',
        'graph_vlabel % healthy/rebuilt',
        'graph_scale  no',
    );
    foreach my $setting (@graph_settings) {
        print $setting, "\n";
    }
}

# Read /proc/mdstat into @text in a single pass: the file content may change
# between reads, so the parser below works on this one snapshot.
# A failure to open the file is reported instead of silently producing an
# empty result.
open(my $mdstat, '<', '/proc/mdstat')
    or die "Cannot open /proc/mdstat: $!\n";
my(@text) = <$mdstat>;
close($mdstat);

# Patterns for the (up to) three /proc/mdstat lines that describe one array.

# Header line: device name, optional read-only marker, RAID type, member list.
# Swap is presented as "active (auto-read-only)", and mdadm's '--readonly'
# option turns an array into "active (read-only)", hence the optional marker.
my $devinfo_re =
    '(md\d+)\s+:\s+active\s+(\(read-only\)\s+|\(auto-read-only\)\s+|)(\w+)\s+(.*)';

# Status line: the two member counts in "[n/m]" and the per-slot flags in "[UU]".
my $devstat_re = '.*\[(\d+)\/(\d+)]\s+\[(\w+)]';

# Progress line: the running action, its percentage (or a word such as
# DELAYED/PENDING) and, when present, the estimated time to finish.
my $action_re =
    '.*(reshape|check|resync|recovery)\s*=\s*(\d+\.\d+%|\w+)(.*finish=(.*min))?';

# Parse the /proc/mdstat snapshot held in @text, one array at a time, and print
# either the per-array Munin field settings (first argument "config") or the
# current values (any other invocation).
#
# An array is described by up to three consecutive lines:
#   1. "md0 : active raid1 sda1[0] sdc1[2] sdb1[1]"          ($devinfo_re)
#   2. "      123456 blocks super 1.2 [2/2] [UU]"            ($devstat_re)
#   3. " [==>....]  check = 10.0% (...) finish=123min ..."   ($action_re)
# The third line is present only while the array is busy.  Arrays whose second
# line carries no "[n/m] [UU]" status are skipped.
my $want_config = (defined $ARGV[0] and $ARGV[0] eq "config");

while (@text) {
    my $line = shift @text;

    # Skip everything until the first line of an array is found.
    next unless $line =~ /$devinfo_re/;
    my $dev     = $1;
    my $ro      = $2 || '';
    my $type    = $3;
    my $members = $4;

    # Failed members carry an "(F)" marker.  Count those markers only, so an
    # upper-case "F" elsewhere in the member list is not taken for a failure.
    my $failed = () = $members =~ /\(F\)/g;

    # Second line: the member counts.
    $line = shift @text;
    last unless defined $line;              # input ended after the first line
    next unless $line =~ /$devstat_re/;     # no status on the second line
    my ($nmem, $nact) = ($1, $2);

    # Third line: progress of a running action.  When it is absent the array
    # is idle and the line just read belongs to whatever follows, so it is
    # pushed back for the next iteration.
    my ($action, $minute, $proc) = ('idle', '', 0);
    $line = shift @text;
    if (defined $line and $line =~ /$action_re/) {
        $action = $1;
        my $percent = $2;
        $minute = $4 || '';
        if ($percent =~ /(\d+\.\d+)%/) {
            $proc = $1;
        }
        else {
            # 'resync=DELAYED' or 'resync=PENDING': no percentage available
            $action .= " ($percent)";
            $proc = -1;
        }
    }
    elsif (defined $line) {
        unshift(@text, $line);
    }

    if ($want_config) {
        print "$dev.label $dev\n";
        print "$dev.info $type $ro$members\n";
        # "98:" sets a lower bound only.  The bound is 98 rather than 100
        # because, due to an unfound bug, the value is sometimes reported as
        # 99.XX even when the OS reports 100.
        print "$dev.critical 98:\n";
        print "${dev}_rebuild.label $dev reshape/recovery\n";
        print "${dev}_rebuild.info $action $minute\n";
        # Same 98 bound, for the same reason as above.
        print "${dev}_rebuild.critical 98:\n";
        print "${dev}_check.label $dev check/resync \n";
        print "${dev}_check.info $action $minute\n";
        print "${dev}_failed.label $dev failed disks \n";
        print "${dev}_failed.info $action $minute\n";
        print "${dev}_failed.critical 0:0\n";
        next;
    }

    # Percentage computed from the two counts of the "[n/m]" status; reported
    # as unknown ("U") rather than dividing by zero when the first count is 0.
    my $pct  = $nmem > 0 ? 100 * $nact / $nmem : "U";
    my $rpct = 100;
    my $cpct = 100;
    if ($action =~ /reshape|recovery/) {
        $rpct = $proc;
        $cpct = 0;  # check/resync is not done
    }
    elsif ($action =~ /check|resync/) {
        if ($proc < 0) {
            # array is DELAYED or PENDING, further info is unknown
            $rpct = "U";
            $cpct = 0;
        }
        else {
            # reshape/recovery is not running, so $rpct stays at 100
            $cpct = $proc;
        }
    }

    print "$dev.value $pct\n";
    print "${dev}_rebuild.value $rpct\n";
    print "${dev}_check.value $cpct\n";
    print "${dev}_failed.value $failed\n";
}

exit 0;