Repository
Munin (contrib)
Last change
2021-05-13
Graph Categories
Family
auto
Capabilities
Keywords
Language
Python (3.x)
License
LGPL-3.0-only
Authors

btrfs_device_stats

Name

btrfs_device_stats - Script to monitor btrfs device statistics

Configuration

Simply create a symlink in your plugins directory like with any other plugin. Must be run as root.

[btrfs_device_stats] user root

You can optionally configure the warning and critical limits. By default warning is set to 1 and critical is not set at all. You can set the limits either for the entire plugin or per individual metric and down to a specific device. The more specific values take precedence over the general ones. See the following example:

[btrfs_device_stats] user root env.warning 2 env.critical 4 env.flags_warning 23 env.read_errs_critical 42 env.generation_errs_a04f3d6b_438c_4b61_979b_e5fda7fb858c_1_warning 187

Default Configuration

Bugs

Author

2019-2021, HaseHarald

Magic Markers

#%# family=auto
#%# capabilities=autoconf

License

LGPLv3

#!/usr/bin/env python3


"""
=pod

=head1 NAME

btrfs_device_stats - Script to monitor btrfs device statistics

=head1 CONFIGURATION

Simply create a symlink in your plugins directory like with any other plugin.
Must be run as root.

[btrfs_device_stats]
user root

You can optionally configure the warning and critical limits. By default warning
is set to 1 and critical is not set at all. You can set the limits either for
the entire plugin or per individual metric and down to a specific device. The
more specific values take precedence over the general ones.
See the following example:

[btrfs_device_stats]
user root
env.warning 2
env.critical 4
env.flags_warning 23
env.read_errs_critical 42
env.generation_errs_a04f3d6b_438c_4b61_979b_e5fda7fb858c_1_warning 187

=head2 DEFAULT CONFIGURATION

=head1 BUGS

=head1 AUTHOR

2019-2021, HaseHarald

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf

=head1 LICENSE

LGPLv3

=cut
"""


# This file contains a munin-plugin to gather btrfs statistics per device.
#
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this plugin.  If not, see <http://www.gnu.org/licenses/>.


import btrfs
import os
import sys


# Metrics reported for every device, in output order:
# (field name, human readable label, whether warning/critical limits apply).
_DEVICE_METRICS = (
    ("corruption_errs", "Corruption Errors", True),
    ("flush_errs", "Flush Errors", True),
    ("generation_errs", "Generation Errors", True),
    ("read_errs", "Read Errors", True),
    ("write_errs", "Write Errors", True),
    ("nr_items", "Nr. of Items", False),
    ("flags", "Nr. of Flags", True),
)

# Earlier versions of this plugin looked up the per-device warning limit for
# read errors under a misspelled variable name ("read_<fsid>_<devid>_warning").
# The misspelled name is still honoured, with lower priority than the correct
# one, so that configurations relying on it keep working.
_LEGACY_DEVICE_PREFIXES = {("read_errs", "warning"): "read"}


def _device_limit(metric, device_key, level, default):
    """Return the most specific limit configured for one metric of a device.

    The environment is searched for ``<metric>_<level>`` and then for
    ``<metric>_<device_key>_<level>``; the per-device value takes precedence
    over the per-metric value, which takes precedence over ``default``.

    metric     -- field name, e.g. ``read_errs``
    device_key -- ``<fsid>_<devid>`` with dashes in the fsid replaced by ``_``
    level      -- ``warning`` or ``critical``
    default    -- value to return if nothing more specific is configured
    """
    limit = os.getenv(metric + "_" + level, default=default)
    legacy_prefix = _LEGACY_DEVICE_PREFIXES.get((metric, level))
    if legacy_prefix is not None:
        limit = os.getenv(legacy_prefix + "_" + device_key + "_" + level,
                          default=limit)
    return os.getenv(metric + "_" + device_key + "_" + level, default=limit)


def munin_config(fs):
    """Print the munin graph configuration for one btrfs filesystem.

    Emits one multigraph section with the totals over all devices of ``fs``
    followed by one sub-graph section per device. Each per-device field
    except ``nr_items`` gets a warning limit and, if configured, a critical
    limit. Limits are read from the environment; the plugin-wide ``warning``
    defaults to "1" and ``critical`` is unset by default.

    fs -- filesystem object providing ``fsid``, ``path``, ``devices()`` and
          ``dev_info(devid)``
    """
    fsid = str(fs.fsid).replace('-', '_')
    graph = "btrfs_device_stats_" + fsid

    print("multigraph " + graph)
    print("graph_args --base 1000 -l 0")
    print("graph_vlabel total btrfs attribute value")
    print("graph_title btrfs total device stats for " + fs.path)
    print("graph_category disk")
    print("graph_info This graph shows the total stats of devices used by btrfs")

    for metric, label, _has_limits in _DEVICE_METRICS:
        print(metric + "_total.label " + label)

    print("")

    # Plugin-wide defaults; they do not depend on the device.
    warning = os.getenv('warning', default="1")
    critical = os.getenv('critical', default=None)

    for this_device in fs.devices():
        devid = str(this_device.devid)
        device_key = fsid + "_" + devid
        this_dev_name = fs.dev_info(this_device.devid).path.replace(
            '/dev/', '')

        print("multigraph " + graph + "." + devid)
        print("graph_args --base 1000 -l 0")
        print("graph_vlabel btrfs attribute value")
        print("graph_title btrfs device stats for " + this_dev_name)
        print("graph_category disk")
        print("graph_info This graph shows stats of devices used by btrfs")

        for metric, label, has_limits in _DEVICE_METRICS:
            print(metric + ".label " + label)
            if not has_limits:
                continue
            print(metric + ".warning " +
                  _device_limit(metric, device_key, "warning", warning))
            this_crit = _device_limit(metric, device_key, "critical",
                                      critical)
            # An unset or empty critical limit is not printed.
            if this_crit:
                print(metric + ".critical " + this_crit)

        print("")


def munin_values(fs):
    """Print the current statistics of one btrfs filesystem.

    For every device of ``fs`` a multigraph section with the device's
    counters is printed, followed by one section holding the sums of each
    counter over all devices.

    fs -- filesystem object providing ``fsid``, ``devices()`` and
          ``dev_stats(devid, flag)``
    """
    counters = (
        "corruption_errs",
        "flush_errs",
        "generation_errs",
        "read_errs",
        "write_errs",
        "nr_items",
        "flags",
    )
    sums = dict.fromkeys(counters, 0)
    graph = "btrfs_device_stats_" + str(fs.fsid).replace('-', '_')

    for dev in fs.devices():
        # NOTE(review): the second argument is presumably the "reset
        # counters" switch of python-btrfs; confirm against its docs.
        stats = fs.dev_stats(dev.devid, False)
        readings = [(name, getattr(stats, name)) for name in counters]

        for name, value in readings:
            sums[name] = sums[name] + value

        print("multigraph " + graph + "." + str(dev.devid))
        for name, value in readings:
            print(name + ".value " + str(value))
        print("")

    print("multigraph " + graph)
    for name in counters:
        print(name + "_total.value " + str(sums[name]))
    print("")


def main():
    """Run the plugin for every mounted btrfs filesystem.

    The first command line argument selects the mode:

    * ``autoconf`` - print ``yes`` if at least one mounted btrfs filesystem
      was found and ``no (<reason>)`` otherwise. The plugin announces this
      mode through its ``capabilities=autoconf`` magic marker.
    * ``config`` - print the graph configuration of every filesystem.
    * anything else, or no argument - print the current values.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    paths = list(btrfs.utils.mounted_filesystem_paths())

    if mode == "autoconf":
        if paths:
            print("yes")
        else:
            print("no (no mounted btrfs filesystem found)")
        return

    # The mode is the same for all filesystems, so pick the reporter once.
    report = munin_config if mode == "config" else munin_values
    for path in paths:
        with btrfs.FileSystem(path) as fs:
            report(fs)


if __name__ == "__main__":
    main()
    # Report success explicitly. The exit call lives inside the guard so that
    # merely importing this module does not terminate the interpreter, and
    # uses sys.exit rather than the interactive helper injected by "site".
    sys.exit(0)