root / plugins / gpu / nvidia_gpu_ @ 17f78427
Historique | Voir | Annoter | Télécharger (7,25 ko)
| 1 | 7ef2911e | Nils | #!/bin/bash |
|---|---|---|---|
| 2 | 426bba44 | Nuno Fachada | # -*- sh -*- |
| 3 | |||
| 4 | : << =cut |
||
| 5 | |||
| 6 | =head1 NAME |
||
| 7 | |||
| 8 | nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, |
||
| 9 | usually bundled with NVIDIA GPU driver, to obtain information. |
||
| 10 | |||
| 11 | =head1 CONFIGURATION |
||
| 12 | |||
| 13 | 17f78427 | Lars Kruse | This is a wildcard plugin. The suffix of the symlink name (the part after |
| 14 | 426bba44 | Nuno Fachada | "nvidia_gpu_") selects the value to monitor: temp, mem, fan, power or utilization. |
| 15 | |||
| 16 | This plugin uses the following configuration variables: |
||
| 17 | |||
| 18 | [nvidia_gpu_*] |
||
| 19 | env.smiexec - Location of nvidia-smi executable. |
||
| 20 | 10b1de81 | Nuno Fachada | env.warning - Warning temperature |
| 21 | env.critical - Critical temperature |
||
| 22 | 426bba44 | Nuno Fachada | |
| 23 | =head2 DEFAULT CONFIGURATION |
||
| 24 | |||
| 25 | c53197ce | Nuno Fachada | The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and |
| 26 | assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively. |
||
| 27 | 426bba44 | Nuno Fachada | |
| 28 | =head2 EXAMPLE WILDCARD USAGE |
||
| 29 | |||
| 30 | C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp> |
||
| 31 | |||
| 32 | ...will monitor the temperature of available GPUs. |
||
| 33 | |||
| 34 | c53197ce | Nuno Fachada | =head1 TODO |
| 35 | |||
| 36 | =over 4 |
||
| 37 | |||
| 38 | =item * |
||
| 39 | |||
| 40 | a5ccb70d | Nils | Add support for specific professional GPU features such as number of compute processes, clocks and so on. |
| 41 | c53197ce | Nuno Fachada | |
| 42 | =item * |
||
| 43 | |||
| 44 | Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). |
||
| 45 | |||
| 46 | =back |
||
| 47 | |||
| 48 | 426bba44 | Nuno Fachada | =head1 AUTHOR |
| 49 | |||
| 50 | Nuno Fachada |
||
| 51 | faken@fakenmc.com |
||
| 52 | |||
| 53 | =head1 LICENSE |
||
| 54 | |||
| 55 | GNU General Public License, version 2 |
||
| 56 | 17f78427 | Lars Kruse | http://www.gnu.org/licenses/gpl-2.0.html |
| 57 | 426bba44 | Nuno Fachada | |
| 58 | =head1 MAGIC MARKERS |
||
| 59 | |||
| 60 | #%# family=auto |
||
| 61 | #%# capabilities=autoconf suggest |
||
| 62 | |||
| 63 | =cut |
||
| 64 | |||
# Determine the quantity to monitor: the name this plugin was invoked as
# (normally a symlink such as nvidia_gpu_temp) with any directory part and
# the leading "nvidia_gpu_" removed. Parameter expansion is used so that no
# external process is needed.
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable: taken from $smiexec (env.smiexec in
# the plugin configuration) when set and non-empty, otherwise the default.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}
| 70 | |||
# Print the autoconf answer for the nvidia-smi path given as $1:
# "yes" if that path exists and is executable, otherwise "no (<reason>)".
autoconf_reply() {
  if [ -x "$1" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
}

# Check if autoconf was requested. The exit status is 0 for both answers;
# the answer itself is carried by the printed text.
if [ "$1" = "autoconf" ]; then
  autoconf_reply "$nvSmiExec"
  exit 0
fi
| 82 | |||
# Print the quantities this plugin can graph, one per line. Each entry is a
# symlink suffix handled by the plugin (nvidia_gpu_<suffix>).
suggested_quantities() {
  printf '%s\n' temp mem fan power utilization
}

# Check if suggest was requested
if [ "$1" = "suggest" ]; then
  suggested_quantities
  exit 0
fi
| 92 | |||
# Get the list of GPUs. `nvidia-smi -L` is expected to print one line per
# device of the form "GPU <index>: <name> (UUID: ...)".
nGpusOutput=$("$nvSmiExec" -L)

# Count only the lines that describe a GPU. Counting all lines with
# `echo ... | wc -l` reports 1 even for empty output (echo always emits a
# newline) and also counts error text, so the check below could never fire.
# grep -c prints 0 when nothing matches.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c '^GPU [0-9][0-9]*:')
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found. The message goes to stderr so it cannot be
  # mistaken for plugin output.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi
smiOutput=$("$nvSmiExec" -q)
| 104 | 426bba44 | Nuno Fachada | |
# Name of the GPU with zero-based index $1: the matching line of the
# `nvidia-smi -L` listing held in $nGpusOutput, cut at the first "(".
gpu_name() {
  echo "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d '(' -f 1
}

# Print "<name><index>.info <description> <GPU name>" for every GPU.
# $1 - description text placed in front of the GPU name.
print_gpu_info() {
  local i=0
  while [ "$i" -lt "$nGpus" ]; do
    echo "${name}${i}.info $1 $(gpu_name "$i")"
    i=$((i + 1))
  done
}

# Print the munin "config" output for the quantity selected by $name.
# Reads:   name, nGpus, nGpusOutput, smiOutput and, for temp only, the
#          optional warning/critical thresholds (defaults 75 and 95).
# Outputs: graph_* attributes and per-GPU <name><index>.* fields on stdout.
# Returns: 0 on success; 1, with a message on stderr, when $name is not a
#          supported quantity.
print_config() {
  local driverVersion gpusTotalMemOutput gpusTotalMem gpuMem i

  # Get driver version
  driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure the graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel degrees Celsius'
      echo 'graph_category sensors'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      i=0
      while [ "$i" -lt "$nGpus" ]; do
        echo "${name}${i}.warning ${warning:-75}"
        echo "${name}${i}.critical ${critical:-95}"
        echo "${name}${i}.info Temperature information for $(gpu_name "$i")"
        i=$((i + 1))
      done
      ;;
    mem)
      print_gpu_info 'Memory information for'
      # Collect the total framebuffer memory of each GPU ("<size><unit>",
      # spaces removed) into one comma-separated description...
      gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      i=0
      while [ "$i" -lt "$nGpus" ]; do
        gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$((i + 1))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${i}"
        i=$((i + 1))
        if [ "$i" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category memory'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category sensors'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'Fan information for'
      ;;
    power)
      echo 'graph_title GPU power consumption'
      echo 'graph_vlabel Watt'
      echo 'graph_category sensors'
      echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'power consumption of'
      ;;
    utilization)
      echo 'graph_title GPU utilization'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category system'
      echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info 'GPU utilization information for'
      ;;
    *)
      # Diagnostics go to stderr so they are not parsed as config output.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      return 1
      ;;
  esac

  # Common stuff for all quantities: one label per GPU
  i=0
  while [ "$i" -lt "$nGpus" ]; do
    echo "${name}${i}.label $(gpu_name "$i")"
    i=$((i + 1))
  done
  return 0
}

# Check if config was requested
if [ "$1" = "config" ]; then
  print_config
  exit $?
fi
| 213 | |||
# Get the requested value for every GPU from the `nvidia-smi -q` report in
# $smiOutput. The result, $valueGpus, holds one reading per line, in GPU
# order.
case "$name" in
  temp)
    valueGpus=$(echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    # Memory is reported as used/total in percent, computed per GPU.
    totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    nGpusCounter=0
    while [ "$nGpusCounter" -lt "$nGpus" ]; do
      totalMemGpu=$(echo "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
      usedMemGpu=$(echo "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
      # Only divide when both operands are plain integers and the total is
      # non-zero. Otherwise the arithmetic expansion fails and the previous
      # GPU's percentage would be reported again; emit "U" (unknown) instead.
      if [[ $totalMemGpu =~ ^[0-9]+$ && $usedMemGpu =~ ^[0-9]+$ ]] \
        && [ "$totalMemGpu" -gt 0 ]; then
        percentMemUsed=$((usedMemGpu * 100 / totalMemGpu))
      else
        percentMemUsed=U
      fi
      valueGpus="${valueGpus}${percentMemUsed}"$'\n'
      nGpusCounter=$((nGpusCounter + 1))
    done
    ;;
  fan)
    valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  power)
    valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  utilization)
    valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # Diagnostics go to stderr so they are not parsed as plugin values.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac
| 248 | |||
| 249 | |||
# Normalise one raw reading for output: a plain decimal number is passed
# through unchanged; anything else (empty string, "N/A", ...) becomes "U",
# the marker munin uses for an unknown value.
# $1 - raw reading
munin_value() {
  local numberRe='^-?[0-9]+(\.[0-9]+)?$'
  if [[ $1 =~ $numberRe ]]; then
    printf '%s\n' "$1"
  else
    echo U
  fi
}

# Print requested value: line N of $valueGpus belongs to GPU N-1.
nGpusCounter=0
while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
  value=$(echo "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
  echo "${name}${nGpusCounter}.value $(munin_value "$value")"
  nGpusCounter=$((nGpusCounter + 1))
done
