Repository
Munin (contrib)
Last change
2020-03-26
Graph Categories
Capabilities
Keywords
Language
Perl
Authors

nvidia_smi_

Sadly there is no documentation for this plugin.

#!/usr/bin/perl

#
# Copyright and BSD license
#
# Copyright (c) 2011 NVIDIA Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms are permitted
# provided that the above copyright notice and this paragraph are
# duplicated in all such forms and that any documentation,
# advertising materials, and other materials related to such
# distribution and use acknowledge that the software was developed
# by NVIDIA Corporation.  The name of the NVIDIA Corporation may not be
# used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#

#
# This script collects GPU information for use as a munin plugin
# Inspired by Matthew Ritchie and Vadim Bulakh's nvidia_smi_ plugin
#

#
# This requires the NVML bindings and NVIDIA driver >= R270
# $ sudo cpan install nvidia::ml
# http://search.cpan.org/~nvbinding/nvidia-ml-pl/lib/nvidia/ml.pm
#

use strict;
use nvidia::ml qw(:all);

# Work out how the plugin was invoked.  Without exactly one command line
# argument the run type stays "normal"; a single argument must name one of
# the supported run types, otherwise usage is printed and the plugin exits
# with status 1.
my $runType = "normal";
my @runTypes = qw( normal config autoconf );
if (scalar(@ARGV) == 1)
{
    my $requested = $ARGV[0];
    my $isKnown = grep { $_ eq $requested } @runTypes;

    # Reject anything that is not a supported run type.
    unless ($isKnown)
    {
        print "Invalid argument: $requested.\n";
        print "Valid arguments: @runTypes.\n";
        exit(1);
    }

    $runType = $requested;
}

# Leave the plugin when NVML cannot be used.  When munin is probing with
# "autoconf" it gets a "no (reason)" answer and a zero exit status; in
# every other mode the plugin exits with status 1 and no output.
my $giveUp = sub
{
    my ($reason) = @_;
    if ($runType eq "autoconf")
    {
        print "no ($reason)\n";
        exit(0);
    }
    exit(1);
};

my $ret = nvmlInit();
$giveUp->("NVML initialization failed") unless $ret == $NVML_SUCCESS;

($ret, my $gpuCount) = nvmlDeviceGetCount();
$giveUp->("unable to query the GPU count") unless $ret == $NVML_SUCCESS;

# The driver version is only used in the graph description, so a failed
# query is not fatal.
($ret, my $driverVersion) = nvmlSystemGetDriverVersion();
$driverVersion = "Unknown" if $ret != $NVML_SUCCESS;

if ($runType eq "config")
{
    # The graph-level settings are printed exactly once, before the
    # per-GPU fields, so that a failing GPU 0 cannot suppress them.
    print "graph_title GPU\n";
    print "graph_args --upper-limit 120 -l 0\n";
    print "graph_vlabel Percent or Degrees C\n";
    print "graph_category sensors\n";
    print "graph_info Information for NVIDIA GPUs using driver version $driverVersion\n";
}

for my $i (0 .. $gpuCount - 1)
{
    # GPUs whose handle cannot be obtained are skipped entirely.
    ($ret, my $handle) = nvmlDeviceGetHandleByIndex($i);
    next if $ret != $NVML_SUCCESS;

    if ($runType eq "config")
    {
        # The PCI bus id is used as the human readable GPU name.  It is
        # assigned unconditionally ("my ... if ..." is undefined behaviour
        # in Perl) and falls back to "Unknown" when the query fails.
        ($ret, my $pciInfo) = nvmlDeviceGetPciInfo($handle);
        my $gpuName = ($ret == $NVML_SUCCESS) ? $pciInfo->{'busId'} : "Unknown";

        # metrics are collected for all the GPUs to a single graph
        print "GPU_UTIL_$i.label GPU$i - $gpuName : GPU utilization\n";
        print "GPU_FANSPEED_$i.label GPU$i - $gpuName : fan speed\n";
        print "GPU_MEM_UTIL_$i.label GPU$i - $gpuName : GPU memory utilization\n";
        print "GPU_TEMP_$i.label GPU$i - $gpuName : GPU temperature\n";
    }
    elsif ($runType eq "autoconf")
    {
        # One usable GPU is enough for the plugin to be worth enabling.
        print "yes\n";
        exit(0);
    }
    else
    {
        # Values that cannot be read are reported as "U", the marker munin
        # uses for an unknown value.
        ($ret, my $gpuTemp) = nvmlDeviceGetTemperature($handle,
                                                       $NVML_TEMPERATURE_GPU);
        $gpuTemp = "U" if $ret != $NVML_SUCCESS;

        ($ret, my $gpuFanSpeed) = nvmlDeviceGetFanSpeed($handle);
        $gpuFanSpeed = "U" if $ret != $NVML_SUCCESS;

        ($ret, my $utilRates) = nvmlDeviceGetUtilizationRates($handle);
        my $gpuUtil;
        my $memUtil;
        if ($ret == $NVML_SUCCESS)
        {
            $gpuUtil = $utilRates->{'gpu'};
            $memUtil = $utilRates->{'memory'};
        }
        else
        {
            $gpuUtil = "U";

            # Without utilization rates, fall back to the share of memory
            # in use.  A zero total is treated as unknown instead of
            # dividing by zero.
            ($ret, my $memory) = nvmlDeviceGetMemoryInfo($handle);
            if ($ret == $NVML_SUCCESS && $memory->{"total"} > 0)
            {
                $memUtil = $memory->{"used"} / $memory->{"total"} * 100;
            }
            else
            {
                $memUtil = "U";
            }
        }

        print "GPU_TEMP_$i.value $gpuTemp\n";
        print "GPU_FANSPEED_$i.value $gpuFanSpeed\n";
        print "GPU_UTIL_$i.value $gpuUtil\n";
        print "GPU_MEM_UTIL_$i.value $memUtil\n";
    }
}

# An autoconf run only gets here when no GPU handle could be obtained
# (including the case of zero GPUs); answer explicitly instead of staying
# silent.
if ($runType eq "autoconf")
{
    print "no (no usable NVIDIA GPU found)\n";
    exit(0);
}