root / plugins / gpu / nvidia_gpu_ @ 00c0da18
Historique | Voir | Annoter | Télécharger (6,72 ko)
| 1 | 00c0da18 | Nils | #!/usr/bin/env bash |
|---|---|---|---|
| 2 | 426bba44 | Nuno Fachada | # -*- sh -*- |
| 3 | |||
| 4 | : << =cut |
||
| 5 | |||
| 6 | =head1 NAME |
||
| 7 | |||
| 8 | nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, |
||
| 9 | usually bundled with NVIDIA GPU driver, to obtain information. |
||
| 10 | |||
| 11 | =head1 CONFIGURATION |
||
| 12 | |||
| 13 | This is a wildcard plugin. The suffix of the symlink name (the part after |
||
| 14 | "nvidia_gpu_") selects the value to monitor: temp, mem, fan or power. |
||
| 15 | |||
| 16 | This plugin uses the following configuration variables: |
||
| 17 | |||
| 18 | [nvidia_gpu_*] |
||
| 19 | env.smiexec - Location of nvidia-smi executable. |
||
| 20 | 10b1de81 | Nuno Fachada | env.warning - Warning temperature |
| 21 | env.critical - Critical temperature |
||
| 22 | 426bba44 | Nuno Fachada | |
| 23 | =head2 DEFAULT CONFIGURATION |
||
| 24 | |||
| 25 | c53197ce | Nuno Fachada | The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and |
| 26 | assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively. |
||
| 27 | 426bba44 | Nuno Fachada | |
| 28 | =head2 EXAMPLE WILDCARD USAGE |
||
| 29 | |||
| 30 | C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp> |
||
| 31 | |||
| 32 | ...will monitor the temperature of available GPUs. |
||
| 33 | |||
| 34 | c53197ce | Nuno Fachada | =head1 TODO |
| 35 | |||
| 36 | =over 4 |
||
| 37 | |||
| 38 | =item * |
||
| 39 | |||
| 40 | Add support for specific professional GPU features such as number of compute |
||
| 41 | processes, clocks, power draw, utilization, and so on. |
||
| 42 | |||
| 43 | =item * |
||
| 44 | |||
| 45 | Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). |
||
| 46 | |||
| 47 | =back |
||
| 48 | |||
| 49 | 426bba44 | Nuno Fachada | =head1 AUTHOR |
| 50 | |||
| 51 | Nuno Fachada |
||
| 52 | faken@fakenmc.com |
||
| 53 | |||
| 54 | =head1 LICENSE |
||
| 55 | |||
| 56 | GNU General Public License, version 2 |
||
| 57 | http://www.gnu.org/licenses/gpl-2.0.html |
||
| 58 | |||
| 59 | =head1 MAGIC MARKERS |
||
| 60 | |||
| 61 | #%# family=auto |
||
| 62 | #%# capabilities=autoconf suggest |
||
| 63 | |||
| 64 | =cut |
||
| 65 | |||
# Determine which quantity to monitor from the name this plugin was invoked
# under: the basename of $0 with a leading "nvidia_gpu_" removed
# (e.g. ".../nvidia_gpu_temp" -> "temp"). Parameter expansion is used instead
# of an unquoted `basename $0` so that paths containing whitespace work.
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable: the "smiexec" environment variable
# when it is set and non-empty, otherwise the default path.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
||
| 71 | |||
# Autoconf: when called with "autoconf" as first argument, report whether the
# plugin can run here. Prints "yes" only if the nvidia-smi executable named by
# $nvSmiExec exists and is executable, otherwise "no (...)". The exit status
# is 0 in both cases.
if [ "$1" = "autoconf" ]; then
  # Quoted so that a path containing whitespace is tested as a single word.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
  exit 0
fi
||
| 83 | |||
# Suggest: when called with "suggest" as first argument, list the wildcard
# suffixes this plugin handles, one per line, then exit successfully.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp mem fan power
  exit 0
fi
||
| 92 | |||
# Get the GPU listing from "nvidia-smi -L" and count the GPUs. The rest of the
# script treats line N of $nGpusOutput as the description of GPU N-1, so the
# count is the number of non-empty lines in the listing.
#
# Counting with `wc -l` on the echoed output reported 1 even when the listing
# was empty, which made the "no GPUs" exit below unreachable. A failing
# nvidia-smi call is treated the same as an empty listing.
if ! nGpusOutput=$("$nvSmiExec" -L); then
  nGpusOutput=''
fi
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found
  echo "No NVIDIA GPUs detected. Exiting."
  exit 1
fi
||
| 101 | |||
# Capture the full "nvidia-smi -q" report once; the config and value sections
# below extract their fields from this text. The executable path is quoted so
# that a path containing whitespace is run as a single command word.
smiOutput=$("$nvSmiExec" -q)
||
| 104 | |||
# Config: when called with "config" as first argument, print the graph
# definition for the quantity selected by $name and exit.
#
# Reads:   $name, $nGpus, $nGpusOutput, $smiOutput, and the optional
#          $warning / $critical thresholds (defaults 75 / 95, temp only).
# Outputs: graph_* attributes, per-GPU ".info" (plus ".warning"/".critical"
#          for temp) lines, then one "<name><index>.label" line per GPU.
# Exits:   0 on success, 1 when $name is not one of temp, mem, fan, power.
if [ "$1" = "config" ]; then

  # Print the name of GPU number $1 (0-based): the matching line of the
  # "nvidia-smi -L" listing, cut at the first "(".
  gpu_name() {
    printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
  }

  # Get driver version from the report already captured in $smiOutput, so the
  # configured executable is honoured rather than whichever "nvidia-smi"
  # happens to be on PATH, and the query is not run a second time.
  driverVersion=$(printf '%s\n' "$smiOutput" \
    | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category sensors'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        echo "temp${gpu}.warning ${warning:-75}"
        echo "temp${gpu}.critical ${critical:-95}"
        echo "temp${gpu}.info Temperature information for $(gpu_name "$gpu")"
      done
      ;;
    mem)
      # First determine total memory of each GPU: the "Total" line within the
      # three lines following each "Memory Usage" header, after dropping the
      # lines that mention BAR1...
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" \
        | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" \
        | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        echo "mem${gpu}.info Memory information for $(gpu_name "$gpu")"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$(( gpu + 1 ))p")
        # Build "<total> for GPU 0, <total> for GPU 1, ..." (comma-separated).
        gpusTotalMem="${gpusTotalMem:+${gpusTotalMem}, }${gpuMem} for GPU ${gpu}"
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category memory'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category sensors'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        echo "fan${gpu}.info Fan information for $(gpu_name "$gpu")"
      done
      ;;
    power)
      echo 'graph_title GPU power consumption'
      echo 'graph_vlabel Watt'
      echo 'graph_category sensors'
      echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
      for (( gpu = 0; gpu < nGpus; gpu++ )); do
        echo "power${gpu}.info power consumption of $(gpu_name "$gpu")"
      done
      ;;
    *)
      echo "Can't run without a proper symlink. Exiting."
      echo "Try running munin-node-configure --suggest."
      exit 1
      ;;
  esac

  # Common stuff for all quantities: one label per GPU.
  for (( gpu = 0; gpu < nGpus; gpu++ )); do
    echo "${name}${gpu}.label $(gpu_name "$gpu")"
  done

  exit 0
fi
||
| 199 | |||
# Get requested value: fill $valueGpus with the current readings for the
# quantity selected by $name, one line per GPU, extracted from the
# "nvidia-smi -q" report held in $smiOutput. Exits with status 1 when $name is
# not one of temp, mem, fan, power.
case "$name" in
  temp)
    # The GPU line directly following each "Temperature" header.
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep -A 1 "Temperature" | grep -i "Gpu" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    # Total and used memory per GPU, taken from the lines following each
    # "Memory Usage" header after dropping the lines that mention BAR1.
    totalMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(printf '%s\n' "$smiOutput" \
      | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" \
      | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    for (( gpu = 0; gpu < nGpus; gpu++ )); do
      totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$(( gpu + 1 ))p")
      usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$(( gpu + 1 ))p")
      # Only compute a percentage from two plain decimal numbers with a
      # non-zero total; otherwise the arithmetic would abort with a syntax or
      # division-by-zero error. "U" is reported for such a GPU instead
      # (NOTE(review): "U" is Munin's marker for an unknown value - confirm).
      # The 10# prefix keeps values with leading zeros from being read as octal.
      if [[ $totalMemGpu =~ ^[0-9]+$ && $usedMemGpu =~ ^[0-9]+$ ]] \
        && (( 10#$totalMemGpu > 0 )); then
        percentMemUsed=$(( 10#$usedMemGpu * 100 / 10#$totalMemGpu ))
      else
        percentMemUsed='U'
      fi
      valueGpus="${valueGpus}${percentMemUsed}"$'\n'
    done
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  power)
    valueGpus=$(printf '%s\n' "$smiOutput" \
      | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    echo "Can't run without a proper symlink. Exiting."
    echo "Try running munin-node-configure --suggest."
    exit 1
    ;;
esac
||
| 231 | |||
| 232 | |||
# Print requested value: one "<name><index>.value <reading>" line per GPU,
# where the reading for GPU N is line N+1 of $valueGpus.
#
# A reading that is not a plain (optionally signed, optionally fractional)
# decimal number - an empty line, or a textual placeholder such as "N/A" - is
# printed as "U" rather than passed through verbatim
# (NOTE(review): "U" is Munin's marker for an unknown value - confirm).
numericPattern='^-?[0-9]+([.][0-9]+)?$'
for (( gpu = 0; gpu < nGpus; gpu++ )); do
  value=$(printf '%s\n' "$valueGpus" | sed -n "$(( gpu + 1 ))p")
  if ! [[ $value =~ $numericPattern ]]; then
    value='U'
  fi
  echo "${name}${gpu}.value $value"
done
||
| 241 | |||
| 242 |
