Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / gpu / nvidia_gpu_ @ 49312192

Historique | Voir | Annoter | Télécharger (6,08 ko)

1
#!/bin/sh
2
# -*- sh -*-
3

    
4
: << =cut
5

    
6
=head1 NAME
7

    
8
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
9
usually bundled with NVIDIA GPU driver, to obtain information.
10

    
11
=head1 CONFIGURATION
12

    
13
This is a wildcard plugin. The suffix of the symlink name selects the
14
value to monitor (one of: temp, mem, fan).
15

    
16
This plugin uses the following configuration variables:
17

    
18
 [nvidia_gpu_*]
19
  env.smiexec - Location of nvidia-smi executable.
20
  env.warning - Warning temperature
21
  env.critical - Critical temperature
22

    
23
=head2 DEFAULT CONFIGURATION
24

    
25
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
26
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.
27

    
28
=head2 EXAMPLE WILDCARD USAGE
29

    
30
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
31

    
32
...will monitor the temperature of available GPUs.
33

    
34
=head1 TODO
35

    
36
=over 4
37

    
38
=item *
39

    
40
Add support for specific professional GPU features such as number of compute 
41
processes, clocks, power draw, utilization, and so on.
42

    
43
=item *
44

    
45
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).
46

    
47
=back
48

    
49
=head1 AUTHOR
50

    
51
Nuno Fachada
52
faken@fakenmc.com
53

    
54
=head1 LICENSE
55

    
56
 GNU General Public License, version 2
57
 http://www.gnu.org/licenses/gpl-2.0.html 
58

    
59
=head1 MAGIC MARKERS
60

    
61
 #%# family=auto
62
 #%# capabilities=autoconf suggest
63

    
64
=cut
65

    
66
# Determine the name of the parameter to monitor from the name this script
# was invoked under: strip the directory part, then the "nvidia_gpu_" prefix
# (e.g. ".../nvidia_gpu_temp" gives "temp"). Parameter expansion is used
# instead of an unquoted `basename $0` so that an invocation path containing
# whitespace is not split into several words.
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable: the "smiexec" variable if it is set
# and non-empty, otherwise the default path.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
71

    
72
# Check if autoconf was requested.
# Prints "yes" when the file named by $nvSmiExec exists and is executable,
# otherwise "no (...)". The exit status is 0 either way.
if [ "$1" = "autoconf" ]; then
	# $nvSmiExec is quoted so that a path containing whitespace is tested as
	# one file name instead of making `[` fail with too many arguments.
	if [ -x "$nvSmiExec" ]; then
		echo yes
	else
		echo "no (nvidia-smi executable not found)"
	fi
	exit 0
fi
83

    
84
# "suggest" request: list the wildcard suffixes this plugin can be linked
# as, one per line, then stop.
case "$1" in
	suggest)
		printf '%s\n' temp mem fan
		exit 0
		;;
esac
91

    
92
# Get number of GPUs. The listing printed by "nvidia-smi -L" is kept in
# $nGpusOutput; the rest of the script treats line N of it as GPU N-1.
nGpusOutput=$("$nvSmiExec" -L)
# Count the non-empty lines of the listing. Piping `echo "$nGpusOutput"`
# through `wc -l` reports 1 even when the listing is empty (echo still emits
# a newline), which made the "no GPUs" check below unreachable.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
	# Exit if no GPUs found
	echo "No NVIDIA GPUs detected. Exiting."
	exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
103

    
104
# Print the name of the GPU at zero-based index $1: the text preceding the
# first "(" on the corresponding line of the GPU listing in $nGpusOutput.
getGpuName() {
	printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
}

# Check if config was requested.
# Emits the graph-level settings and the per-GPU field settings for the
# quantity selected by $name, then exits 0. An unsupported $name prints a
# hint and exits 1.
if [ "$1" = "config" ]; then

	# Get driver version from the nvidia-smi query output captured earlier in
	# $smiOutput. This used to re-run a hard-coded "nvidia-smi -q", which
	# ignored the configured executable and queried the driver a second time.
	driverVersion=$(printf '%s\n' "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

	# Configure graph depending on which quantity will be plotted
	case $name in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel Degrees (C)'
			echo 'graph_category gpu'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(getGpuName "$nGpusCounter")
				# Thresholds come from the "warning"/"critical" variables,
				# defaulting to 75 and 95.
				echo "temp${nGpusCounter}.warning ${warning:-75}"
				echo "temp${nGpusCounter}.critical ${critical:-95}"
				echo "temp${nGpusCounter}.info Temperature information for $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		mem)
			# First determine total memory of each GPU...
			gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
			gpusTotalMem=''
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(getGpuName "$nGpusCounter")
				echo "mem${nGpusCounter}.info Memory information for $gpuName"
				gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$((nGpusCounter + 1))p")
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
				nGpusCounter=$((nGpusCounter + 1))
				# Separate entries with a comma, except after the last one
				if [ "$nGpusCounter" -lt "$nGpus" ]; then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			# ...then output config data.
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ "$nGpusCounter" -lt "$nGpus" ]; do
				gpuName=$(getGpuName "$nGpusCounter")
				echo "fan${nGpusCounter}.info Fan information for $gpuName"
				nGpusCounter=$((nGpusCounter + 1))
			done
			;;
		*)
			echo "Can't run without a proper symlink. Exiting."
			echo "Try running munin-node-configure --suggest."
			exit 1
			;;
	esac

	# Common stuff for all quantities: one label per GPU
	nGpusCounter=0
	while [ "$nGpusCounter" -lt "$nGpus" ]; do
		gpuName=$(getGpuName "$nGpusCounter")
		echo "${name}${nGpusCounter}.label $gpuName"
		nGpusCounter=$((nGpusCounter + 1))
	done

	exit 0
fi
185

    
186
# Get requested value: fill $valueGpus with one reading per line, extracted
# from the nvidia-smi query output held in $smiOutput. An unsupported $name
# prints a hint and exits 1.
case $name in
	temp)
		valueGpus=$(printf '%s\n' "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
		;;
	mem)
		totalMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
		usedMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
		valueGpus=''
		nGpusCounter=0
		while [ "$nGpusCounter" -lt "$nGpus" ]; do
			totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
			usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
			# Only divide when both figures are usable integers and the total
			# is non-zero; otherwise report "U" instead of letting the
			# arithmetic expansion fail on a missing or zero total.
			if [ "$totalMemGpu" -gt 0 ] 2>/dev/null && [ "$usedMemGpu" -ge 0 ] 2>/dev/null; then
				percentMemUsed=$((usedMemGpu * 100 / totalMemGpu))
			else
				percentMemUsed=U
			fi
			# Append with a literal newline. The $'\n' form is not understood
			# by every /bin/sh (e.g. dash), where it corrupts the value list.
			valueGpus="${valueGpus}${percentMemUsed}
"
			nGpusCounter=$((nGpusCounter + 1))
		done
		;;
	fan)
		valueGpus=$(printf '%s\n' "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
		;;
	*)
		echo "Can't run without a proper symlink. Exiting."
		echo "Try running munin-node-configure --suggest."
		exit 1
		;;
esac
214

    
215

    
216
# Print requested value: one "<name><index>.value <reading>" line per GPU,
# taking line N of $valueGpus as the reading for GPU N-1.
nGpusCounter=0
while [ "$nGpusCounter" -lt "$nGpus" ]; do
	value=$(printf '%s\n' "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
	# Munin accepts a number or "U" (unknown). Anything else, such as a
	# missing line or text like "N/A", is reported as "U" rather than being
	# passed through verbatim.
	case $value in
		'' | *[!0-9.-]*) value=U ;;
	esac
	echo "${name}${nGpusCounter}.value $value"
	nGpusCounter=$((nGpusCounter + 1))
done
224

    
225

    
226