Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / gpu / nvidia_gpu_ @ 00c0da18

Historique | Voir | Annoter | Télécharger (6,72 ko)

1 00c0da18 Nils
#!/usr/bin/env bash
2 426bba44 Nuno Fachada
# -*- sh -*-
3
4
: << =cut
5
6
=head1 NAME
7
8
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
9
usually bundled with NVIDIA GPU driver, to obtain information.
10
11
=head1 CONFIGURATION
12
13
This is a wildcard plugin. The suffix of the link name (the part after "nvidia_gpu_") should be the 
14
value to monitor.
15
16
This plugin uses the following configuration variables:
17
18
 [nvidia_gpu_*]
19
  env.smiexec - Location of nvidia-smi executable.
20 10b1de81 Nuno Fachada
  env.warning - Warning temperature
21
  env.critical - Critical temperature
22 426bba44 Nuno Fachada
23
=head2 DEFAULT CONFIGURATION
24
25 c53197ce Nuno Fachada
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
26
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.
27 426bba44 Nuno Fachada
28
=head2 EXAMPLE WILDCARD USAGE
29
30
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
31
32
...will monitor the temperature of available GPUs.
33
34 c53197ce Nuno Fachada
=head1 TODO
35
36
=over 4
37
38
=item *
39
40
Add support for specific professional GPU features such as number of compute 
41
processes, clocks, power draw, utilization, and so on.
42
43
=item *
44
45
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).
46
47
=back
48
49 426bba44 Nuno Fachada
=head1 AUTHOR
50
51
Nuno Fachada
52
faken@fakenmc.com
53
54
=head1 LICENSE
55
56
 GNU General Public License, version 2
57
 http://www.gnu.org/licenses/gpl-2.0.html 
58
59
=head1 MAGIC MARKERS
60
61
 #%# family=auto
62
 #%# capabilities=autoconf suggest
63
64
=cut
65
66
# Determine name of parameter to monitor: the part of the invoked (symlink)
# file name that follows the "nvidia_gpu_" prefix. Parameter expansion is
# used instead of an unquoted `basename $0` so that an install path
# containing whitespace cannot corrupt the result.
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable (from the "smiexec" environment
# variable) or use default
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
71
72
# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
	# Autoconf only returns yes if nvidia-smi exists and is executable.
	# The path is quoted so that a location containing whitespace is tested
	# as a single file name. The exit status is 0 in both cases; the answer
	# is carried by the printed text.
	if [ -x "$nvSmiExec" ]; then
		echo yes
	else
		echo "no (nvidia-smi executable not found)"
	fi
	exit 0
fi
83
84
# Check if suggest was requested: list every quantity this wildcard plugin
# can be linked as, one per line, then stop.
case "$1" in
	suggest)
		printf '%s\n' "temp" "mem" "fan" "power"
		exit 0
		;;
esac
92
93
# Get number of GPUs
# Only the per-GPU lines of the listing are kept (assumed to have the form
# "GPU <n>: <name> (UUID: ...)" -- confirm against the installed nvidia-smi).
# Counting those lines, rather than piping everything through `wc -l`,
# matters for two reasons:
#   - `echo "" | wc -l` yields 1, so an empty listing used to be counted as
#     one GPU and the check below could never trigger;
#   - any other line in the output (error text, sub-device entries) would
#     have been counted as an extra GPU.
nGpusOutput=$("$nvSmiExec" -L | grep '^GPU [0-9]')
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^GPU [0-9]')
if [ "$nGpus" -eq 0 ]; then
	# Exit if no GPUs found
	echo "No NVIDIA GPUs detected. Exiting."
	exit 1
fi
101
102
# Get full output from nvidia-smi (the complete query report); the sections
# below extract their values from this text. The executable path is quoted
# so that a location containing whitespace is run as a single command name.
smiOutput=$("$nvSmiExec" -q)
104
105
# Check if config was requested
if [ "$1" = "config" ]; then

	# Print the listing line of the GPU at zero-based index $1, cut at the
	# first "(" (everything from that parenthesis onwards is dropped).
	gpuNameAt() {
		echo "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
	}

	# Print one "<name><index>.info <text> <GPU name>" line per GPU.
	# $1 - descriptive text placed in front of the GPU name.
	printGpuInfo() {
		local idx
		for (( idx = 0; idx < nGpus; idx++ )); do
			echo "${name}${idx}.info $1 $(gpuNameAt "$idx")"
		done
	}

	# Get driver version from the query output captured earlier. This honours
	# the configured executable location and avoids running a second query
	# through a hard-coded "nvidia-smi" looked up on PATH.
	driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

	# Configure graph depending on which quantity will be plotted
	case "$name" in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel Degrees (C)'
			echo 'graph_category sensors'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			# Thresholds come from the "warning"/"critical" environment
			# variables, defaulting to 75 and 95.
			for (( gpuIdx = 0; gpuIdx < nGpus; gpuIdx++ )); do
				echo "temp${gpuIdx}.warning ${warning:-75}"
				echo "temp${gpuIdx}.critical ${critical:-95}"
				echo "temp${gpuIdx}.info Temperature information for $(gpuNameAt "$gpuIdx")"
			done
			;;
		mem)
			# First determine total memory of each GPU...
			gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
			gpusTotalMem=''
			for (( gpuIdx = 0; gpuIdx < nGpus; gpuIdx++ )); do
				echo "mem${gpuIdx}.info Memory information for $(gpuNameAt "$gpuIdx")"
				gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$(( gpuIdx + 1 ))p")
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${gpuIdx}"
				# Separate the entries with ", " (no separator after the last)
				if (( gpuIdx + 1 < nGpus )); then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			# ...then output config data.
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category memory'
			echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category sensors'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			printGpuInfo 'Fan information for'
			;;
		power)
			echo 'graph_title GPU power consumption'
			echo 'graph_vlabel Watt'
			echo 'graph_category sensors'
			echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
			printGpuInfo 'power consumption of'
			;;
		*)
			echo "Can't run without a proper symlink. Exiting."
			echo "Try running munin-node-configure --suggest."
			exit 1
			;;
	esac

	# Common stuff for all quantities: one label per GPU
	for (( gpuIdx = 0; gpuIdx < nGpus; gpuIdx++ )); do
		echo "${name}${gpuIdx}.label $(gpuNameAt "$gpuIdx")"
		#print_warning $name
		#print_critical $name
	done

	exit 0
fi
199
200
# Get requested value
# Each branch fills $valueGpus with one line per GPU, extracted from the
# query output held in $smiOutput.
case "$name" in
	temp)
		valueGpus=$(echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
		;;
	mem)
		# Total and used figures (BAR1 lines excluded), one line per GPU
		totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
		usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
		valueGpus=''
		nGpusCounter=0
		while (( nGpusCounter < nGpus )); do
			totalMemGpu=$(echo "$totalMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
			usedMemGpu=$(echo "$usedMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
			# Only compute a percentage from two plain integers with a
			# non-zero total. Anything else (missing line, non-numeric text,
			# zero total) would raise an arithmetic error and leave a stale
			# or empty value behind, so report "U" (Munin's marker for an
			# unknown value) instead. The 10# prefix forces base 10 so a
			# leading zero is not read as octal.
			if [[ $usedMemGpu =~ ^[0-9]+$ && $totalMemGpu =~ ^[0-9]+$ ]] && (( 10#$totalMemGpu > 0 )); then
				percentMemUsed=$(( 10#$usedMemGpu * 100 / 10#$totalMemGpu ))
			else
				percentMemUsed='U'
			fi
			valueGpus="${valueGpus}${percentMemUsed}"$'\n'
			nGpusCounter=$(( nGpusCounter + 1 ))
		done
		;;
	fan)
		valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	power)
		valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	*)
		echo "Can't run without a proper symlink. Exiting."
		echo "Try running munin-node-configure --suggest."
		exit 1
		;;
esac
231
232
233
# Print requested value: one "<name><index>.value <value>" line per GPU,
# where <value> is the line of $valueGpus at the same position.
for (( nGpusCounter = 0; nGpusCounter < nGpus; nGpusCounter++ )); do
	value=$(printf '%s\n' "$valueGpus" | sed -n "$(( nGpusCounter + 1 ))p")
	printf '%s\n' "${name}${nGpusCounter}.value $value"
done
241
242