Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / gpu / nvidia_gpu_ @ 17f78427

Historique | Voir | Annoter | Télécharger (7,25 ko)

1 7ef2911e Nils
#!/bin/bash
2 426bba44 Nuno Fachada
# -*- sh -*-
3
4
: << =cut
5
6
=head1 NAME
7
8
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
9
usually bundled with NVIDIA GPU driver, to obtain information.
10
11
=head1 CONFIGURATION
12
13 17f78427 Lars Kruse
This is a wildcard plugin. The part of the symlink name that follows the
C<nvidia_gpu_> prefix selects the value to monitor: C<temp>, C<mem>, C<fan>,
C<power> or C<utilization>.
15
16
This plugin uses the following configuration variables:
17
18
 [nvidia_gpu_*]
19
  env.smiexec - Location of nvidia-smi executable.
20 10b1de81 Nuno Fachada
  env.warning - Warning temperature
21
  env.critical - Critical temperature
22 426bba44 Nuno Fachada
23
=head2 DEFAULT CONFIGURATION
24
25 c53197ce Nuno Fachada
By default, "env.smiexec" is /usr/bin/nvidia-smi, and the warning and critical
temperatures are 75 and 95 degrees Celsius, respectively.
27 426bba44 Nuno Fachada
28
=head2 EXAMPLE WILDCARD USAGE
29
30
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
31
32
...will monitor the temperature of available GPUs.
33
34 c53197ce Nuno Fachada
=head1 TODO
35
36
=over 4
37
38
=item *
39
40 a5ccb70d Nils
Add support for specific professional GPU features such as number of compute processes, clocks and so on.
41 c53197ce Nuno Fachada
42
=item *
43
44
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).
45
46
=back
47
48 426bba44 Nuno Fachada
=head1 AUTHOR
49
50
Nuno Fachada
51
faken@fakenmc.com
52
53
=head1 LICENSE
54
55
 GNU General Public License, version 2
56 17f78427 Lars Kruse
 http://www.gnu.org/licenses/gpl-2.0.html
57 426bba44 Nuno Fachada
58
=head1 MAGIC MARKERS
59
60
 #%# family=auto
61
 #%# capabilities=autoconf suggest
62
63
=cut
64
65
# Determine name of parameter to monitor: the file name this script was
# invoked as (normally a symlink), with the leading "nvidia_gpu_" removed.
# Parameter expansion is used so no external process is needed.
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable or use default.
# "smiexec" is read from the environment (see env.smiexec in the
# documentation above); an unset or empty value selects the default path.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
70
71
# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
	# Autoconf only returns yes if nvidia-smi exists and is executable.
	# The exit status is 0 either way; the answer is the printed word.
	if [ -x "$nvSmiExec" ]; then
		echo yes
	else
		echo "no (nvidia-smi executable not found)"
	fi
	exit 0
fi
82
83
# Check if suggest was requested
if [ "$1" = "suggest" ]; then
	# One supported wildcard suffix per line; these are the same names the
	# "case $name" dispatch further down accepts.
	printf '%s\n' temp mem fan power utilization
	exit 0
fi
92
93
# Get number of GPUs
# The listing printed by "nvidia-smi -L" is kept because later code looks up
# GPU names in it by line number; the script treats each line as one GPU.
nGpusOutput=$("$nvSmiExec" -L)
# An empty listing must count as zero GPUs. Feeding an empty string through
# "echo | wc -l" would report 1 (echo always emits a newline), which made the
# guard below unreachable.
if [ -z "$nGpusOutput" ]; then
	nGpus=0
else
	nGpus=$(echo "$nGpusOutput" | wc -l)
fi
if [ "$nGpus" -eq 0 ]; then
	# Exit if no GPUs found
	echo "No NVIDIA GPUs detected. Exiting."
	exit 1
fi

# Get full output from nvidia-smi; config and value extraction both parse it.
smiOutput=$("$nvSmiExec" -q)
104 426bba44 Nuno Fachada
105
# Check if config was requested
if [ "$1" = "config" ]; then

	# Print the name of the GPU with zero-based index $1: the matching line of
	# the "nvidia-smi -L" listing, cut at the first "(". Any whitespace that
	# precedes the "(" is kept, exactly as the inline pipeline did before.
	gpuNameAt() {
		echo "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d '(' -f 1
	}

	# Get driver version (text after the colon on the "Driver Version" line,
	# with all spaces removed)
	driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

	# Configure graph depending on which quantity will be plotted
	case $name in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel degrees Celsius'
			echo 'graph_category sensors'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(gpuNameAt "$nGpusCounter")
				# Thresholds come from env.warning / env.critical, with
				# defaults of 75 and 95.
				echo "${name}${nGpusCounter}.warning ${warning:-75}"
				echo "${name}${nGpusCounter}.critical ${critical:-95}"
				echo "${name}${nGpusCounter}.info Temperature information for $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		mem)
			# First determine total memory of each GPU...
			gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
			gpusTotalMem=''
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(gpuNameAt "$nGpusCounter")
				echo "${name}${nGpusCounter}.info Memory information for $gpuName"
				gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$((nGpusCounter + 1))p")
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
				nGpusCounter=$((nGpusCounter + 1))
				# Separate entries with ", " but do not append one after the last
				if [ "$nGpusCounter" -lt "$nGpus" ]; then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			# ...then output config data.
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category memory'
			echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category sensors'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(gpuNameAt "$nGpusCounter")
				echo "${name}${nGpusCounter}.info Fan information for $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		power)
			echo 'graph_title GPU power consumption'
			echo 'graph_vlabel Watt'
			echo 'graph_category sensors'
			echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(gpuNameAt "$nGpusCounter")
				echo "${name}${nGpusCounter}.info power consumption of $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		utilization)
			echo 'graph_title GPU utilization'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel %'
			echo 'graph_category system'
			echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(gpuNameAt "$nGpusCounter")
				echo "${name}${nGpusCounter}.info GPU utilization information for $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		*)
			# The script was not invoked under one of the supported
			# nvidia_gpu_<quantity> names.
			echo "Can't run without a proper symlink. Exiting."
			echo "Try running munin-node-configure --suggest."
			exit 1
			;;
	esac

	# Common stuff for all quantities: one label per GPU data series
	nGpusCounter=0
	while [ "$nGpusCounter" -lt "$nGpus" ]; do
		gpuName=$(gpuNameAt "$nGpusCounter")
		echo "${name}${nGpusCounter}.label $gpuName"
		nGpusCounter=$((nGpusCounter + 1))
		#print_warning $name
		#print_critical $name
	done

	exit 0
fi
213
214
# Get requested value
# Each branch fills valueGpus with one reading per line, in the order the GPUs
# appear in the "nvidia-smi -q" output. Readings are taken as the first
# space-separated word after the colon of the matching line.
case $name in
	temp)
		# Line following each "Temperature" match that mentions "gpu"
		# (case-insensitive)
		valueGpus=$(echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
		;;
	mem)
		# "Total" and "Used" are read from the lines following each
		# "Memory Usage" match; BAR1 lines are dropped first.
		totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
		usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
		valueGpus=''
		nGpusCounter=0
		while [ "$nGpusCounter" -lt "$nGpus" ]; do
			totalMemGpu=$(echo "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
			usedMemGpu=$(echo "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
			# Only divide when both readings are plain non-negative integers
			# and the total is non-zero. A missing or non-numeric reading
			# would otherwise trigger an arithmetic error, so "U" is stored
			# instead to keep one line per GPU.
			if [[ "$totalMemGpu" =~ ^[0-9]+$ && "$usedMemGpu" =~ ^[0-9]+$ ]] \
				&& [ "$((10#$totalMemGpu))" -gt 0 ]; then
				# 10# forces base 10 so a leading zero is not read as octal
				percentMemUsed=$((10#$usedMemGpu * 100 / 10#$totalMemGpu))
			else
				percentMemUsed=U
			fi
			valueGpus="${valueGpus}${percentMemUsed}"$'\n'
			nGpusCounter=$((nGpusCounter + 1))
		done
		;;
	fan)
		valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	power)
		valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	utilization)
		# Case-sensitive match on "Gpu"
		valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	*)
		# The script was not invoked under one of the supported
		# nvidia_gpu_<quantity> names.
		echo "Can't run without a proper symlink. Exiting."
		echo "Try running munin-node-configure --suggest."
		exit 1
		;;
	esac
248
249
250
# Print requested value
# Emits one "<name><index>.value <reading>" line per GPU; reading N is line
# N+1 of valueGpus. A reading that is missing or not a plain number is
# reported as "U", Munin's marker for an unknown value, instead of being
# passed through verbatim.
nGpusCounter=0
while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
	value=$(echo "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
	if ! [[ "$value" =~ ^-?[0-9]+([.][0-9]+)?$ ]]; then
		value=U
	fi
	echo "${name}${nGpusCounter}.value $value"
	nGpusCounter=$((nGpusCounter + 1))
done