root / plugins / gpu / nvidia_gpu_ @ 49312192
Historique | Voir | Annoter | Télécharger (6,08 ko)
| 1 |
#!/bin/sh |
|---|---|
| 2 |
# -*- sh -*- |
| 3 |
|
| 4 |
: << =cut |
| 5 |
|
| 6 |
=head1 NAME |
| 7 |
|
| 8 |
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, |
| 9 |
usually bundled with NVIDIA GPU driver, to obtain information. |
| 10 |
|
| 11 |
=head1 CONFIGURATION |
| 12 |
|
| 13 |
This is a wildcard plugin. The suffix appended to the symlink name selects |
| 14 |
the value to monitor (temp, mem or fan). |
| 15 |
|
| 16 |
This plugin uses the following configuration variables: |
| 17 |
|
| 18 |
[nvidia_gpu_*] |
| 19 |
env.smiexec - Location of nvidia-smi executable. |
| 20 |
env.warning - Warning temperature |
| 21 |
env.critical - Critical temperature |
| 22 |
|
| 23 |
=head2 DEFAULT CONFIGURATION |
| 24 |
|
| 25 |
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and |
| 26 |
assume warning and critical temperatures of 75 and 95 degrees celsius, respectively. |
| 27 |
|
| 28 |
=head2 EXAMPLE WILDCARD USAGE |
| 29 |
|
| 30 |
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp> |
| 31 |
|
| 32 |
...will monitor the temperature of available GPUs. |
| 33 |
|
| 34 |
=head1 TODO |
| 35 |
|
| 36 |
=over 4 |
| 37 |
|
| 38 |
=item * |
| 39 |
|
| 40 |
Add support for specific professional GPU features such as number of compute |
| 41 |
processes, clocks, power draw, utilization, and so on. |
| 42 |
|
| 43 |
=item * |
| 44 |
|
| 45 |
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). |
| 46 |
|
| 47 |
=back |
| 48 |
|
| 49 |
=head1 AUTHOR |
| 50 |
|
| 51 |
Nuno Fachada |
| 52 |
faken@fakenmc.com |
| 53 |
|
| 54 |
=head1 LICENSE |
| 55 |
|
| 56 |
GNU General Public License, version 2 |
| 57 |
http://www.gnu.org/licenses/gpl-2.0.html |
| 58 |
|
| 59 |
=head1 MAGIC MARKERS |
| 60 |
|
| 61 |
#%# family=auto |
| 62 |
#%# capabilities=autoconf suggest |
| 63 |
|
| 64 |
=cut |
| 65 |
|
| 66 |
# Determine the name of the parameter to monitor from the name this plugin
# was invoked under: drop the directory part, then the "nvidia_gpu_" prefix
# (e.g. ".../nvidia_gpu_temp" gives "temp"). Parameter expansion is used
# instead of an unquoted "basename $0" so that an invocation path containing
# spaces is handled correctly.
name=${0##*/}
name=${name#nvidia_gpu_}

# Get location of nvidia-smi executable (env.smiexec) or use the default
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
|
| 71 |
|
| 72 |
# Print the autoconf answer: "yes" only if the path in $nvSmiExec exists and
# is executable, otherwise "no" followed by the reason.
autoconf_answer() {
  # The path is quoted so that a value containing spaces is tested as a
  # single operand instead of making the test itself fail.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
}

# Check if autoconf was requested. The plugin exits with status 0 whichever
# answer is printed.
if [ "$1" = "autoconf" ]; then
  autoconf_answer
  exit 0
fi
| 83 |
|
| 84 |
# Handle the "suggest" request: list the supported symlink suffixes, one per
# line, then stop.
case "$1" in
  suggest)
    printf '%s\n' temp mem fan
    exit 0
    ;;
esac
| 91 |
|
| 92 |
# Get the GPU listing; each line of it is treated as one GPU below. If
# nvidia-smi exits with an error its output is discarded, so that an error
# message is not counted as a device.
nGpusOutput=$("$nvSmiExec" -L) || nGpusOutput=''

# Get number of GPUs by counting the non-empty lines of the listing.
# "wc -l" is not used here: echoing an empty listing still yields one line,
# so the "no GPUs" check below could never trigger.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found
  echo "No NVIDIA GPUs detected. Exiting."
  exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
| 103 |
|
| 104 |
# Print the name of GPU number $1 (zero-based): the matching line of the GPU
# listing held in $nGpusOutput, cut before the first "(".
gpu_name() {
  printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
}

# Print the munin configuration for the quantity selected by $name.
# Reads $name, $nGpus, $nGpusOutput and $smiOutput, plus the optional
# $warning / $critical thresholds (defaults 75 and 95) for "temp".
# Returns 1, after printing a hint, when $name is not temp, mem or fan;
# returns 0 otherwise.
print_config() {
  # Get driver version from the query output that was already captured in
  # $smiOutput. Running a bare "nvidia-smi" again here would ignore the
  # configured executable location and repeat the query.
  driverVersion=$(printf '%s\n' "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure the graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "temp${nGpusCounter}.warning ${warning:-75}"
        echo "temp${nGpusCounter}.critical ${critical:-95}"
        echo "temp${nGpusCounter}.info Temperature information for $gpuName"
        nGpusCounter=$(( nGpusCounter + 1 ))
      done
      ;;
    mem)
      # First determine total memory of each GPU...
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "mem${nGpusCounter}.info Memory information for $gpuName"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$(( nGpusCounter + 1 ))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
        nGpusCounter=$(( nGpusCounter + 1 ))
        # Separate the per-GPU entries with commas, but not after the last
        if [ "$nGpusCounter" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpu_name "$nGpusCounter")
        echo "fan${nGpusCounter}.info Fan information for $gpuName"
        nGpusCounter=$(( nGpusCounter + 1 ))
      done
      ;;
    *)
      echo "Can't run without a proper symlink. Exiting."
      echo "Try running munin-node-configure --suggest."
      return 1
      ;;
  esac

  # Common stuff for all quantities: one label per GPU
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "$nGpus" ]; do
    gpuName=$(gpu_name "$nGpusCounter")
    echo "${name}${nGpusCounter}.label $gpuName"
    nGpusCounter=$(( nGpusCounter + 1 ))
  done

  return 0
}

# Check if config was requested; the plugin exits with print_config's status
# (0 on success, 1 for an unsupported symlink name).
if [ "$1" = "config" ]; then
  print_config
  exit $?
fi
| 185 |
|
| 186 |
# Get requested value: fill $valueGpus with one reading per line, in the
# order the GPUs appear in $smiOutput.
case "$name" in
  temp)
    valueGpus=$(printf '%s\n' "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    # NOTE(review): every section whose heading contains "Memory Usage" is
    # matched here - confirm the query output has exactly one per GPU.
    totalMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(printf '%s\n' "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
    # A literal newline. The $'\n' quoting form is not understood by every
    # /bin/sh (it is not a newline in dash, for instance), which would glue
    # all percentages onto a single line.
    newline='
'
    valueGpus=''
    nGpusCounter=0
    while [ "$nGpusCounter" -lt "$nGpus" ]; do
      totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
      usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
      # Compute the percentage only when both figures are plain integers and
      # the total is non-zero; otherwise record "U" (munin's marker for an
      # unknown value) instead of aborting on a bad arithmetic expression.
      percentMemUsed=U
      case "${totalMemGpu}${usedMemGpu}" in
        *[!0-9]*)
          ;;
        *)
          if [ -n "$totalMemGpu" ] && [ -n "$usedMemGpu" ] && [ "$totalMemGpu" -gt 0 ]; then
            percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu ))
          fi
          ;;
      esac
      valueGpus="${valueGpus}${percentMemUsed}${newline}"
      nGpusCounter=$(( nGpusCounter + 1 ))
    done
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    echo "Can't run without a proper symlink. Exiting."
    echo "Try running munin-node-configure --suggest."
    exit 1
    ;;
esac
| 214 |
|
| 215 |
|
| 216 |
# Print one "<name><index>.value <value>" line per GPU, taking the values
# from the lines of $valueGpus in order. Reads $name, $nGpus and $valueGpus.
# A missing or non-numeric reading (for example "N/A") is reported as "U",
# munin's marker for an unknown value, rather than being passed through.
print_gpu_values() {
  nGpusCounter=0
  # ${nGpus:-0} keeps the comparison valid even if no count is available
  while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
    value=$(printf '%s\n' "$valueGpus" | sed -n "$(( nGpusCounter + 1 ))p")
    case "$value" in
      '' | *[!0-9.-]*) value=U ;;
    esac
    echo "${name}${nGpusCounter}.value $value"
    nGpusCounter=$(( nGpusCounter + 1 ))
  done
}

# Print requested value
print_gpu_values
| 224 |
|
| 225 |
|
| 226 |
|
