root / plugins / gpu / nvidia_gpu_ @ 3a4b7fcb
Historique | Voir | Annoter | Télécharger (6,12 ko)
| 1 | aaeaa2e6 | Stig Sandbeck Mathisen | #!/bin/bash |
|---|---|---|---|
| 2 | 426bba44 | Nuno Fachada | # -*- sh -*- |
| 3 | |||
| 4 | : << =cut |
||
| 5 | |||
| 6 | =head1 NAME |
||
| 7 | |||
| 8 | nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, |
||
| 9 | usually bundled with NVIDIA GPU driver, to obtain information. |
||
| 10 | |||
| 11 | =head1 CONFIGURATION |
||
| 12 | |||
| 13 | This is a wildcard plugin. The part of the symlink name that follows the |
||
| 14 | "nvidia_gpu_" prefix selects the value to monitor: temp, mem or fan. |
||
| 15 | |||
| 16 | This plugin uses the following configuration variables: |
||
| 17 | |||
| 18 | [nvidia_gpu_*] |
||
| 19 | env.smiexec - Location of nvidia-smi executable. |
||
| 20 | 10b1de81 | Nuno Fachada | env.warning - Warning temperature |
| 21 | env.critical - Critical temperature |
||
| 22 | 426bba44 | Nuno Fachada | |
| 23 | =head2 DEFAULT CONFIGURATION |
||
| 24 | |||
| 25 | c53197ce | Nuno Fachada | The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and |
| 26 | assume warning and critical temperatures of 75 and 95 degrees celsius, respectively. |
||
| 27 | 426bba44 | Nuno Fachada | |
| 28 | =head2 EXAMPLE WILDCARD USAGE |
||
| 29 | |||
| 30 | C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp> |
||
| 31 | |||
| 32 | ...will monitor the temperature of available GPUs. |
||
| 33 | |||
| 34 | c53197ce | Nuno Fachada | =head1 TODO |
| 35 | |||
| 36 | =over 4 |
||
| 37 | |||
| 38 | =item * |
||
| 39 | |||
| 40 | Add support for specific professional GPU features such as number of compute |
||
| 41 | processes, clocks, power draw, utilization, and so on. |
||
| 42 | |||
| 43 | =item * |
||
| 44 | |||
| 45 | Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). |
||
| 46 | |||
| 47 | =back |
||
| 48 | |||
| 49 | 426bba44 | Nuno Fachada | =head1 AUTHOR |
| 50 | |||
| 51 | Nuno Fachada |
||
| 52 | faken@fakenmc.com |
||
| 53 | |||
| 54 | =head1 LICENSE |
||
| 55 | |||
| 56 | GNU General Public License, version 2 |
||
| 57 | http://www.gnu.org/licenses/gpl-2.0.html |
||
| 58 | |||
| 59 | =head1 MAGIC MARKERS |
||
| 60 | |||
| 61 | #%# family=auto |
||
| 62 | #%# capabilities=autoconf suggest |
||
| 63 | |||
| 64 | =cut |
||
| 65 | |||
# Print the monitored quantity encoded in a plugin path: the file name with
# the "nvidia_gpu_" prefix removed (".../nvidia_gpu_temp" -> "temp").
# Arguments: $1 - path the plugin was invoked as
# Outputs:   the quantity name on stdout
plugin_quantity() {
  local base=${1##*/}
  printf '%s\n' "${base#nvidia_gpu_}"
}

# Determine name of parameter to monitor. The path is passed quoted so that
# a plugin directory containing whitespace cannot split it into several words.
name=$(plugin_quantity "$0")

# Get location of nvidia-smi executable or use default
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
| 71 | |||
# Print the munin autoconf answer: "yes" when the configured nvidia-smi
# path exists and is executable, otherwise "no" followed by the reason.
# Globals: nvSmiExec (read)
# Outputs: the answer on stdout
autoconf_answer() {
  # Quoted so that a path containing spaces is tested as a single operand.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
}

# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
  autoconf_answer
  # The exit status is 0 whichever answer was printed.
  exit 0
fi
| 83 | |||
# Check if suggest was requested: list the quantities this wildcard plugin
# can be linked as, one per line, then stop.
case "$1" in
  suggest)
    printf '%s\n' temp mem fan
    exit 0
    ;;
esac
| 91 | |||
# Get number of GPUs. The rest of the script treats every line of the
# "-L" listing as one GPU, in order.
nGpusOutput=$("$nvSmiExec" -L)
# Count non-empty lines only: piping an empty listing through "wc -l" would
# still report one line, so the zero-GPU check below could never trigger.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found. The diagnostic goes to stderr so it cannot be
  # mistaken for plugin output on stdout.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
| 103 | |||
# Print the name of one GPU: the matching line of the GPU listing, cut at
# the first "(".
# Globals:   nGpusOutput (read)
# Arguments: $1 - 0-based GPU index
gpu_name() {
  printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
}

# Print the munin "config" section for the quantity selected by $name.
# Globals: name, nGpus, nGpusOutput, smiOutput (read);
#          warning, critical (read, optional thresholds for "temp")
# Outputs: graph and field attributes on stdout; diagnostics on stderr
# Returns: 0 on success, 1 when $name is not temp, mem or fan
print_config() {
  local driverVersion gpusTotalMemOutput gpusTotalMem gpuMem i

  # Get driver version from the report that was already captured, so the
  # configured executable is honoured and nvidia-smi is not run a second time.
  driverVersion=$(printf '%s\n' "$smiOutput" \
    | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      for (( i = 0; i < nGpus; i++ )); do
        echo "temp${i}.warning ${warning:-75}"
        echo "temp${i}.critical ${critical:-95}"
        echo "temp${i}.info Temperature information for $(gpu_name "$i")"
      done
      ;;
    mem)
      # First determine total memory of each GPU...
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" \
        | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" \
        | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      for (( i = 0; i < nGpus; i++ )); do
        echo "mem${i}.info Memory information for $(gpu_name "$i")"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$(( i + 1 ))p")
        # Entries are joined with ", "; the separator is only added once
        # the list already holds an entry.
        gpusTotalMem="${gpusTotalMem}${gpusTotalMem:+, }${gpuMem} for GPU ${i}"
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      for (( i = 0; i < nGpus; i++ )); do
        echo "fan${i}.info Fan information for $(gpu_name "$i")"
      done
      ;;
    *)
      # Diagnostics go to stderr so they are not parsed as config lines.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      return 1
      ;;
  esac

  # Common stuff for all quantities
  for (( i = 0; i < nGpus; i++ )); do
    echo "${name}${i}.label $(gpu_name "$i")"
  done
}

# Check if config was requested
if [ "$1" = "config" ]; then
  print_config
  exit $?
fi
| 185 | |||
# Get requested value: $valueGpus receives one value per line, in the order
# the GPUs appear in the nvidia-smi report.
case "$name" in
  temp)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep -A 1 "Temperature" | grep -i "Gpu" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    totalMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    for (( nGpusCounter = 0; nGpusCounter < nGpus; nGpusCounter++ )); do
      totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
      usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
      # Only divide when both fields are plain integers and the total is
      # non-zero. Otherwise the arithmetic would fail and this GPU's line
      # would be missing, shifting the values of every later GPU.
      if [[ "$totalMemGpu" =~ ^[0-9]+$ && "$usedMemGpu" =~ ^[0-9]+$ ]] \
        && (( totalMemGpu > 0 )); then
        percentMemUsed=$(( usedMemGpu * 100 / totalMemGpu ))
      else
        # "U" is munin's marker for an unknown value.
        percentMemUsed=U
      fi
      valueGpus="${valueGpus}${percentMemUsed}"$'\n'
    done
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # Diagnostics go to stderr so they are not parsed as plugin values.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac
| 214 | |||
| 215 | |||
# Print one "<name><index>.value <value>" line per GPU.
# Globals: name (read)      - quantity prefix of the field names
#          nGpus (read)     - number of GPUs to report
#          valueGpus (read) - one value per line, in GPU order
# Outputs: the value lines on stdout
print_values() {
  local i value
  for (( i = 0; i < ${nGpus:-0}; i++ )); do
    value=$(printf '%s\n' "$valueGpus" | sed -n "$(( i + 1 ))p")
    # Report "U" (munin's marker for an unknown value) when the field is
    # missing or not a number, instead of an empty or textual value.
    if ! [[ "$value" =~ ^-?[0-9]+([.][0-9]+)?$ ]]; then
      value=U
    fi
    echo "${name}${i}.value $value"
  done
}

# Print requested value
print_values
| 224 | |||
| 225 |
