Repository
Munin (contrib)
Last change
2021-07-26
Graph Categories
Family
auto
Capabilities
Keywords
Language
Bash
License
GPL-2.0-only
Authors

nvidia_gpu_

Name

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, usually bundled with NVIDIA GPU driver, to obtain information.

Configuration

This is a wildcard plugin. The part of the symlink name after the nvidia_gpu_ prefix selects the value to monitor (temp, mem, fan, power, utilization, rx or tx).

This plugin uses the following configuration variables:

[nvidia_gpu_*]
 env.smiexec - Location of nvidia-smi executable.
 env.warning - Warning temperature
 env.critical - Critical temperature

Default Configuration

The default configuration is to set “env.smiexec” to /usr/bin/nvidia-smi and assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.

Example Wildcard Usage

ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp

…will monitor the temperature of available GPUs.

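Todo

- Add support for specific professional GPU features such as number of compute processes, clocks and so on.
- Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).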

Author

Nuno Fachada faken@fakenmc.com

License

GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html

Magic Markers

#%# family=auto
#%# capabilities=autoconf suggest
#!/bin/bash
# -*- sh -*-

: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The part of the symlink name after the
"nvidia_gpu_" prefix selects the value to monitor (temp, mem, fan, power,
utilization, rx or tx).

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of nvidia-smi executable.
  env.warning - Warning temperature
  env.critical - Critical temperature

=head2 DEFAULT CONFIGURATION

The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.

=head1 TODO

=over 4

=item *

Add support for specific professional GPU features such as number of compute processes, clocks and so on.

=item *

Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).

=back

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

 GNU General Public License, version 2
 http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut

# The quantity to monitor is whatever follows the "nvidia_gpu_" prefix in the
# name this plugin was invoked under (normally a symlink to this file).
name=${0##*/}
name=${name#nvidia_gpu_}

# Path of the nvidia-smi executable: env.smiexec when it is set and non-empty,
# otherwise the default location.
if [ -n "$smiexec" ]; then
	nvSmiExec=$smiexec
else
	nvSmiExec='/usr/bin/nvidia-smi'
fi

# Handle the "autoconf" request: answer "yes" only when the nvidia-smi
# executable exists and is executable, "no (...)" otherwise. The exit status
# is 0 in both cases.
case "$1" in
	autoconf)
		if [ ! -x "$nvSmiExec" ]; then
			echo "no (nvidia-smi executable not found)"
		else
			echo yes
		fi
		exit 0
		;;
esac

# Handle the "suggest" request: list every quantity this wildcard plugin can
# monitor, one per line (each one is a valid symlink suffix), then exit.
if [ "$1" = "suggest" ]; then
	printf '%s\n' temp mem fan power utilization rx tx
	exit 0
fi

# Get number of GPUs
#
# "nvidia-smi -L" is expected to print one "GPU <index>: <name> (UUID: ...)"
# line per device. Only those lines are kept, so that error text or any other
# kind of line is never mistaken for a GPU and the n-th line of nGpusOutput
# always describes GPU n-1.
nGpusOutput=$("$nvSmiExec" -L | grep '^GPU [0-9]')
# grep -c reports 0 when nothing matched; "echo ... | wc -l" would report 1
# even for an empty listing, which made the check below unreachable.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^GPU [0-9]')
if [ "$nGpus" -eq 0 ]; then
	# Exit if no GPUs found; the diagnostic goes to stderr so that it is not
	# read as plugin output.
	echo "No NVIDIA GPUs detected. Exiting." >&2
	exit 1
fi

# Get full output from nvidia-smi (parsed by the config and fetch code below)
smiOutput=$("$nvSmiExec" -q)

# Check if config was requested
#
# Prints the Munin graph configuration for the quantity selected by $name
# (graph-level attributes plus .info/.label lines for every GPU) and exits 0.
# An unknown quantity is reported on stderr and the plugin exits 1.
# Reads: name, nGpus, nGpusOutput, smiOutput, and the optional warning and
# critical settings.
if [ "$1" = "config" ]; then

	# Print the display name of the GPU with zero-based index $1: its line of
	# the "nvidia-smi -L" listing, cut at the first "(".
	gpuNameAt() {
		echo "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d \( -f 1
	}

	# Emit one "<name><index>.<attribute> <text><GPU name>" line per GPU.
	#   $1 - field attribute, e.g. "info" or "label"
	#   $2 - text placed in front of the GPU name (may be empty)
	printPerGpu() {
		local idx=0
		while [ "$idx" -lt "$nGpus" ]; do
			echo "${name}${idx}.$1 $2$(gpuNameAt "$idx")"
			idx=$((idx + 1))
		done
	}

	# Get driver version
	driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

	# Configure the graph depending on which quantity will be plotted
	case $name in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel degrees Celsius'
			echo 'graph_category sensors'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			# Temperature is the only quantity with warning/critical thresholds
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				echo "${name}${nGpusCounter}.warning ${warning:-75}"
				echo "${name}${nGpusCounter}.critical ${critical:-95}"
				echo "${name}${nGpusCounter}.info Temperature information for $(gpuNameAt "$nGpusCounter")"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		mem)
			printPerGpu info 'Memory information for '
			# Build "<total> for GPU 0, <total> for GPU 1, ..." for graph_info
			gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
			gpusTotalMem=''
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$((nGpusCounter + 1))p")
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
				nGpusCounter=$((nGpusCounter + 1))
				# Separator between entries, but not after the last one
				if [ "$nGpusCounter" -lt "$nGpus" ]; then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category memory'
			echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category sensors'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			printPerGpu info 'Fan information for '
			;;
		power)
			echo 'graph_title GPU power consumption'
			echo 'graph_vlabel Watt'
			echo 'graph_category sensors'
			echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
			printPerGpu info 'power consumption of '
			;;
		utilization)
			echo 'graph_title GPU utilization'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category system'
			echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
			printPerGpu info 'GPU utilization information for '
			;;
		rx)
			echo 'graph_title GPU Rx Throughput'
			echo 'graph_vlabel Bits/sec'
			echo 'graph_category system'
			echo "graph_info Rx Throughput of NVIDIA GPUs using driver version $driverVersion"
			printPerGpu info 'Rx Throughput of '
			;;
		tx)
			echo 'graph_title GPU Tx Throughput'
			echo 'graph_vlabel Bits/sec'
			echo 'graph_category system'
			echo "graph_info Tx Throughput of NVIDIA GPUs using driver version $driverVersion"
			printPerGpu info 'Tx Throughput of '
			;;
		*)
			# Diagnostics go to stderr: stdout carries the graph configuration
			echo "Can't run without a proper symlink. Exiting." >&2
			echo "Try running munin-node-configure --suggest." >&2
			exit 1
			;;
	esac

	# Common to all quantities: label every data series with its GPU name
	printPerGpu label ''

	exit 0
fi

# Get requested value
#
# Every branch leaves one reading per GPU in valueGpus, one per line, in the
# order the GPUs appear in the nvidia-smi output. Where a reading has to go
# through shell arithmetic (mem, rx, tx) and nvidia-smi did not report a plain
# integer (for example "N/A"), "U" - Munin's marker for an unknown value - is
# recorded instead, because arithmetic on such text raises errors such as
# "division by 0" and cuts the computation short.

# Succeed when $1 consists only of decimal digits.
isUnsignedInteger() {
	[[ $1 =~ ^[0-9]+$ ]]
}

# Print the throughput reading of every GPU multiplied by 1024, one per line.
#   $1 - newline-separated readings extracted from nvidia-smi, one per GPU
# NOTE(review): nvidia-smi appears to report these readings in KB/s, while the
# config section labels the graph "Bits/sec" - confirm the intended unit.
scaleThroughput() {
	local readings=$1 idx=0 reading
	while [ "$idx" -lt "$nGpus" ]; do
		reading=$(echo "$readings" | sed -n "$((idx + 1))p")
		if isUnsignedInteger "$reading"; then
			# 10# forces base 10 so that a leading zero is not read as octal
			echo $((10#$reading * 1024))
		else
			echo U
		fi
		idx=$((idx + 1))
	done
}

case $name in
	temp)
		valueGpus=$(echo "$smiOutput" | grep "GPU Current Temp" | cut -d : -f 2 | cut -d ' ' -f 2)
		;;
	mem)
		# Used framebuffer memory as a percentage of the total, per GPU
		totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
		usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
		valueGpus=''
		nGpusCounter=0
		while [ "$nGpusCounter" -lt "$nGpus" ]
		do
			totalMemGpu=$(echo "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
			usedMemGpu=$(echo "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
			# Only divide when both readings are integers and the total is non-zero
			if isUnsignedInteger "$usedMemGpu" && isUnsignedInteger "$totalMemGpu" && [ "$totalMemGpu" -gt 0 ]; then
				percentMemUsed=$((10#$usedMemGpu * 100 / 10#$totalMemGpu))
			else
				percentMemUsed=U
			fi
			valueGpus="${valueGpus}${percentMemUsed}"$'\n'
			nGpusCounter=$((nGpusCounter + 1))
		done
		;;
	fan)
		valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	power)
		valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	utilization)
		valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	rx)
		rxGpus=$(echo "$smiOutput" | grep "Rx Throughput" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		valueGpus=$(scaleThroughput "$rxGpus")
		;;
	tx)
		txGpus=$(echo "$smiOutput" | grep "Tx Throughput" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		valueGpus=$(scaleThroughput "$txGpus")
		;;
	*)
		# Diagnostics go to stderr: stdout is reserved for field values
		echo "Can't run without a proper symlink. Exiting." >&2
		echo "Try running munin-node-configure --suggest." >&2
		exit 1
		;;
esac


# Print requested value
#
# Emits one "<name><index>.value <reading>" line per GPU, taking the n-th line
# of valueGpus as the reading of GPU n-1. A reading that is missing or is not
# a plain decimal number (for example "N/A") is reported as "U", Munin's
# marker for an unknown value.
numberPattern='^-?[0-9]+([.][0-9]+)?$'
nGpusCounter=0
# ${nGpus:-0}: an unset or empty GPU count means there is nothing to print
while [ "$nGpusCounter" -lt "${nGpus:-0}" ]
do
	value=$(echo "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
	if ! [[ $value =~ $numberPattern ]]; then
		value=U
	fi
	echo "${name}${nGpusCounter}.value $value"
	nGpusCounter=$((nGpusCounter + 1))
done