
#!/bin/sh
# -*- sh -*-

: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses the nvidia-smi
utility, usually bundled with the NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The part of the symlink name after the wildcard
prefix selects the value to monitor.

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of the nvidia-smi executable.
  env.warning - Warning temperature (degrees C)
  env.critical - Critical temperature (degrees C)
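
A minimal configuration example (the file path and values are illustrative),
e.g. placed in /etc/munin/plugin-conf.d/munin-node:

 [nvidia_gpu_*]
  env.smiexec /usr/bin/nvidia-smi
  env.warning 80
  env.critical 95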

=head2 DEFAULT CONFIGURATION

The default configuration sets "env.smiexec" to /usr/bin/nvidia-smi,
"env.warning" to 75 and "env.critical" to 95.

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.
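
Similarly, linking the plugin as nvidia_gpu_mem or nvidia_gpu_fan (the other
values offered by "suggest") will monitor memory usage and fan speed,
respectively.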

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

 GNU General Public License, version 2
 http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut

# Determine name of parameter to monitor
name=`basename "$0" | sed 's/^nvidia_gpu_//g'`
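# (e.g. a symlink named nvidia_gpu_temp yields name="temp")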

# Get location of nvidia-smi executable or use default
nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}
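# (munin-node passes "env.smiexec" from the plugin configuration to the plugin as $smiexec)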

# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
	# Autoconf only returns yes if nvidia-smi exists and is executable
	if [ -x "$nvSmiExec" ]; then
		echo yes
		exit 0
	else
		echo "no (nvidia-smi executable not found)"
		exit 0
	fi
fi

# Check if suggest was requested
if [ "$1" = "suggest" ]; then
	echo "temp"
	echo "mem"
	echo "fan"
	exit 0
fi

# Get number of GPUs
nGpusOutput=`$nvSmiExec -L`
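# ("nvidia-smi -L" typically prints one line per GPU, e.g. "GPU 0: <product name> (UUID: ...)";
# the exact format may vary between driver versions)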
nGpus=`echo "$nGpusOutput" | wc -l`
if [ -z "$nGpusOutput" ]; then
	# Exit if no GPUs found (an empty listing would still give wc -l a count of 1)
	echo "No NVIDIA GPUs detected. Exiting."
	exit 1
fi

# Get full output from nvidia-smi
smiOutput=`$nvSmiExec -q`
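# (the parsing below assumes the "-q" report contains "Temperature", "Memory Usage" and
# "Fan Speed" sections with "key : value" lines; field names may differ between driver versions)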

# Check if config was requested
if [ "$1" = "config" ]; then

	# Get driver version from the already captured output
	driverVersion=`echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' '`

	# Configure graph depending on which quantity will be plotted
	case $name in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel Degrees (C)'
			echo 'graph_category gpu'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "temp${nGpusCounter}.warning ${warning:-75}"
				echo "temp${nGpusCounter}.critical ${critical:-95}"
				echo "temp${nGpusCounter}.info Temperature information for $gpuName"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
			done
			;;
		mem)
			# First determine total memory of each GPU...
			gpusTotalMemOutput=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' '`
			gpusTotalMem=''
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "mem${nGpusCounter}.info Memory information for $gpuName"
				gpuMem=`echo "$gpusTotalMemOutput" | sed -n $(( $nGpusCounter + 1 ))p`
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
				if [ $nGpusCounter -lt $nGpus ]; then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			# ...then output config data.
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "fan${nGpusCounter}.info Fan information for $gpuName"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
			done
			;;
		*)
			echo "Can't run without a proper symlink. Exiting."
			echo "Try running munin-node-configure --suggest."
			exit 1
			;;
	esac

	# Common stuff for all quantities
	nGpusCounter=0
	while [ $nGpusCounter -lt $nGpus ]
	do
		gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
		echo "${name}${nGpusCounter}.label $gpuName"
		: $(( nGpusCounter = $nGpusCounter + 1 ))
		#print_warning $name
		#print_critical $name
	done

	exit 0
fi

# Get requested value
case $name in
	temp)
		valueGpus=`echo "$smiOutput" | grep -A 1 "Temperature" | grep "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2`
		;;
	mem)
		totalMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2`
		usedMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2`
		valueGpus=''
		nGpusCounter=0
		while [ $nGpusCounter -lt $nGpus ]
		do
			totalMemGpu=`echo "$totalMemGpus" | sed -n $(( $nGpusCounter + 1 ))p`
			usedMemGpu=`echo "$usedMemGpus" | sed -n $(( $nGpusCounter + 1 ))p`
			percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu ))
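			# (integer arithmetic: used/total are assumed to be plain integer values in the same
			# unit; the "\n" appended below is expanded to a newline by printf '%b' when printed)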
			valueGpus="${valueGpus}${percentMemUsed}\n"
			: $(( nGpusCounter = $nGpusCounter + 1 ))
		done
		;;
	fan)
		valueGpus=`echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2`
		;;
	*)
		echo "Can't run without a proper symlink. Exiting."
		echo "Try running munin-node-configure --suggest."
		exit 1
		;;
esac

# Print requested value
nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ]
do
	# printf '%b' expands the "\n" separators used for the mem values
	# (echo's handling of backslash escapes varies between shells)
	value=`printf '%b\n' "$valueGpus" | sed -n $(( $nGpusCounter + 1 ))p`
	echo "${name}${nGpusCounter}.value $value"
	: $(( nGpusCounter = $nGpusCounter + 1 ))
done

# TODO Follow multigraph suggestion from Flameeyes and look into multigraph plugins
# (http://munin-monitoring.org/wiki/MultigraphSampleOutput) in order to reduce the
# number of round trips needed to get the data.
# TODO NVIDIA only: add currently unsupported nvidia-smi output options for those
# who have them (how to test?). Test whether they are supported and include them
# in "suggest" accordingly.