#!/bin/sh
# -*- sh -*-

: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses the nvidia-smi
utility, usually bundled with the NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The suffix of the symlink name (the part after the
wildcard prefix) selects the value to monitor (temp, mem or fan).

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of the nvidia-smi executable
  env.warning - Warning temperature
  env.critical - Critical temperature

=head2 DEFAULT CONFIGURATION

The default configuration sets "env.smiexec" to /usr/bin/nvidia-smi.
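
For example, a munin-node plugin configuration overriding these defaults might
look like the following (the path and thresholds are only illustrative):

 [nvidia_gpu_*]
  env.smiexec /usr/local/bin/nvidia-smi
  env.warning 80
  env.critical 90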

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.
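
Similarly,

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_mem>

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_fan>

...will monitor memory usage and fan speed, respectively.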

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

 GNU General Public License, version 2
 http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut

# Determine name of parameter to monitor
name=`basename $0 | sed 's/^nvidia_gpu_//g'`
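# e.g. a symlink named nvidia_gpu_temp yields name="temp"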

# Get location of nvidia-smi executable or use default
nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}

# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
	# Autoconf only returns yes if nvidia-smi exists and is executable
	if [ -x "$nvSmiExec" ]; then
		echo yes
		exit 0
	else
		echo "no (nvidia-smi executable not found)"
		exit 0
	fi
fi

# Check if suggest was requested
if [ "$1" = "suggest" ]; then
	echo "temp"
	echo "mem"
	echo "fan"
	exit 0
fi

# Get number of GPUs
nGpusOutput=`$nvSmiExec -L`
nGpus=`echo "$nGpusOutput" | wc -l`
if [ $nGpus -eq 0 ]; then
	# Exit if no GPUs found
	echo "No NVIDIA GPUs detected. Exiting."
	exit 1
fi

# Get full output from nvidia-smi
smiOutput=`$nvSmiExec -q`
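# Note: the value extraction below assumes "-q" output made of "Key : Value"
# lines (a "Gpu" line under "Temperature", "Total"/"Used" under "Memory Usage",
# and a "Fan Speed" line); the exact layout can vary between driver versions.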

# Check if config was requested
if [ "$1" = "config" ]; then

	# Get driver version (from the previously captured "-q" output)
	driverVersion=`echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' '`

	# Configure graph depending on which quantity will be plotted
	case $name in
		temp)
			echo 'graph_title GPU temperature'
			echo 'graph_args -l 0 -u 120'
			echo 'graph_vlabel Degrees (C)'
			echo 'graph_category gpu'
			echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "temp${nGpusCounter}.warning ${warning:-75}"
				echo "temp${nGpusCounter}.critical ${critical:-95}"
				echo "temp${nGpusCounter}.info Temperature information for $gpuName"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
			done
			;;
		mem)
			# First determine total memory of each GPU...
			gpusTotalMemOutput=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' '`
			gpusTotalMem=''
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "mem${nGpusCounter}.info Memory information for $gpuName"
				gpuMem=`echo "$gpusTotalMemOutput" | sed -n $(( $nGpusCounter + 1 ))p`
				gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
				if [ $nGpusCounter -lt $nGpus ]; then
					gpusTotalMem="${gpusTotalMem}, "
				fi
			done
			# ...then output config data.
			echo 'graph_title GPU memory usage'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
			;;
		fan)
			echo 'graph_title GPU fan speed'
			echo 'graph_args -l 0 -u 100'
			echo 'graph_vlabel Percentage'
			echo 'graph_category gpu'
			echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
			nGpusCounter=0
			while [ $nGpusCounter -lt $nGpus ]
			do
				gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
				echo "fan${nGpusCounter}.info Fan information for $gpuName"
				: $(( nGpusCounter = $nGpusCounter + 1 ))
			done
			;;
		*)
			echo "Can't run without a proper symlink. Exiting."
			echo "Try running munin-node-configure --suggest."
			exit 1
			;;
	esac

	# Common stuff for all quantities
	nGpusCounter=0
	while [ $nGpusCounter -lt $nGpus ]
	do
		gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
		echo "${name}${nGpusCounter}.label $gpuName"
		: $(( nGpusCounter = $nGpusCounter + 1 ))
		#print_warning $name
		#print_critical $name
	done

	exit 0
fi

# Get requested value
case $name in
	temp)
		valueGpus=`echo "$smiOutput" | grep -A 1 "Temperature" | grep "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2`
		;;
	mem)
		totalMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2`
		usedMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2`
		valueGpus=''
		nGpusCounter=0
		while [ $nGpusCounter -lt $nGpus ]
		do
			totalMemGpu=`echo "$totalMemGpus" | sed -n $(( $nGpusCounter + 1 ))p`
			usedMemGpu=`echo "$usedMemGpus" | sed -n $(( $nGpusCounter + 1 ))p`
			percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu ))
			valueGpus="${valueGpus}${percentMemUsed}\n"
			: $(( nGpusCounter = $nGpusCounter + 1 ))
		done
		;;
	fan)
		valueGpus=`echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2`
		;;
	*)
		echo "Can't run without a proper symlink. Exiting."
		echo "Try running munin-node-configure --suggest."
		exit 1
		;;
esac

# Print requested value
nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ]
do
	# printf '%b' portably expands the "\n" separators used in the mem case
	value=`printf '%b\n' "$valueGpus" | sed -n $(( $nGpusCounter + 1 ))p`
	echo "${name}${nGpusCounter}.value $value"
	: $(( nGpusCounter = $nGpusCounter + 1 ))
done
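
# Example fetch output for a "temp" symlink and a single GPU (the value is
# illustrative):
#   temp0.value 51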

# TODO Follow the multigraph suggestion from Flameeyes and look into multigraph
# plugins (http://munin-monitoring.org/wiki/MultigraphSampleOutput) in order to
# reduce the number of round trips needed to get the data.
# TODO Nvidia only: add nvidia-smi output options that are unsupported on some
# GPUs, for users whose hardware provides them (how to test?). Probe whether
# they are supported and include them in the suggest output accordingly.