root / plugins / gpu / nvidia_gpu_ @ 10b1de81
Historique | Voir | Annoter | Télécharger (6,12 ko)
| 1 | 426bba44 | Nuno Fachada | #!/bin/sh |
|---|---|---|---|
| 2 | # -*- sh -*- |
||
| 3 | |||
| 4 | : << =cut |
||
| 5 | |||
| 6 | =head1 NAME |
||
| 7 | |||
| 8 | nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, |
||
| 9 | usually bundled with NVIDIA GPU driver, to obtain information. |
||
| 10 | |||
| 11 | =head1 CONFIGURATION |
||
| 12 | |||
| 13 | This is a wildcard plugin. The suffix of the symlink name (the part after |
||
| 14 | "nvidia_gpu_") selects the value to monitor: temp, mem or fan. |
||
| 15 | |||
| 16 | This plugin uses the following configuration variables: |
||
| 17 | |||
| 18 | [nvidia_gpu_*] |
||
| 19 | env.smiexec - Location of nvidia-smi executable. |
||
| 20 | 10b1de81 | Nuno Fachada | env.warning - Warning temperature |
| 21 | env.critical - Critical temperature |
||
| 22 | 426bba44 | Nuno Fachada | |
| 23 | =head2 DEFAULT CONFIGURATION |
||
| 24 | |||
| 25 | By default "env.smiexec" is /usr/bin/nvidia-smi, "env.warning" is 75 and "env.critical" is 95 (degrees C). |
||
| 26 | |||
| 27 | =head2 EXAMPLE WILDCARD USAGE |
||
| 28 | |||
| 29 | C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp> |
||
| 30 | |||
| 31 | ...will monitor the temperature of available GPUs. |
||
| 32 | |||
| 33 | =head1 AUTHOR |
||
| 34 | |||
| 35 | Nuno Fachada |
||
| 36 | faken@fakenmc.com |
||
| 37 | |||
| 38 | =head1 LICENSE |
||
| 39 | |||
| 40 | GNU General Public License, version 2 |
||
| 41 | http://www.gnu.org/licenses/gpl-2.0.html |
||
| 42 | |||
| 43 | =head1 MAGIC MARKERS |
||
| 44 | |||
| 45 | #%# family=auto |
||
| 46 | #%# capabilities=autoconf suggest |
||
| 47 | |||
| 48 | =cut |
||
| 49 | |||
# Determine the quantity to monitor from the name this plugin was invoked
# under: the basename of $0 with the leading "nvidia_gpu_" removed (e.g.
# "temp" when run through a symlink called nvidia_gpu_temp).
# Parameter expansion is used instead of `basename $0 | sed` so that a path
# containing whitespace is not word-split.
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable: the value of the "smiexec"
# environment variable when it is set and non-empty, otherwise the default.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}

# Print the answer to the "autoconf" request: "yes" when the file named by
# $nvSmiExec exists and is executable, otherwise "no" followed by the reason
# in parentheses.
printAutoconfAnswer() {
  # Quoted so that a path containing whitespace is tested as one operand.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
}

# Check if autoconf was requested
if [ "$1" = "autoconf" ]; then
  printAutoconfAnswer
  # The exit status is 0 for both answers; the verdict is in the output.
  exit 0
fi

# Check if suggest was requested
if [ "$1" = "suggest" ]; then
  # One supported wildcard suffix per line.
  printf '%s\n' temp mem fan
  exit 0
fi

# Get number of GPUs: "nvidia-smi -L" is expected to list one GPU per line.
nGpusOutput=$("$nvSmiExec" -L)
# Count non-empty lines only. Piping an empty listing through `wc -l` would
# still report 1 (echo emits a lone newline), which made the check below
# unreachable. grep -c prints 0 when nothing matches.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found. The message goes to stderr so that it is not
  # mixed into the plugin's data output.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi; the config and fetch code parse this text.
smiOutput=$("$nvSmiExec" -q)

# Print the name of the GPU at zero-based index $1: the matching line of the
# GPU listing held in $nGpusOutput, cut just before its first "(".
gpuNameAt() {
  printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
}

# Print the Munin "config" section for the quantity selected by $name.
# Reads:   name, nGpus, nGpusOutput, smiOutput, warning, critical
# Outputs: graph_* attributes and per-GPU field attributes on stdout
# Returns: 0 on success; 1 (after a message on stderr) when $name is not one
#          of temp, mem or fan
printGraphConfig() {
  # Driver version, parsed from the report already captured in $smiOutput.
  # This avoids running a second, hard-coded "nvidia-smi" from PATH, which
  # ignored the configured executable location.
  driverVersion=$(printf '%s\n' "$smiOutput" \
    | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure the graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpuNameAt "$nGpusCounter")
        # Thresholds come from the "warning"/"critical" variables, with
        # defaults of 75 and 95.
        echo "temp${nGpusCounter}.warning ${warning:-75}"
        echo "temp${nGpusCounter}.critical ${critical:-95}"
        printf '%s\n' "temp${nGpusCounter}.info Temperature information for $gpuName"
        nGpusCounter=$(( nGpusCounter + 1 ))
      done
      ;;
    mem)
      # First determine total memory of each GPU...
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" \
        | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpuNameAt "$nGpusCounter")
        printf '%s\n' "mem${nGpusCounter}.info Memory information for $gpuName"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" \
          | sed -n "$(( nGpusCounter + 1 ))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
        nGpusCounter=$(( nGpusCounter + 1 ))
        # Comma-separate the entries, with no trailing separator.
        if [ "$nGpusCounter" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuName=$(gpuNameAt "$nGpusCounter")
        printf '%s\n' "fan${nGpusCounter}.info Fan information for $gpuName"
        nGpusCounter=$(( nGpusCounter + 1 ))
      done
      ;;
    *)
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      return 1
      ;;
  esac

  # Common stuff for all quantities: one label per GPU field
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "$nGpus" ]; do
    gpuName=$(gpuNameAt "$nGpusCounter")
    printf '%s\n' "${name}${nGpusCounter}.label $gpuName"
    nGpusCounter=$(( nGpusCounter + 1 ))
  done
  return 0
}

# Check if config was requested
if [ "$1" = "config" ]; then
  printGraphConfig || exit 1
  exit 0
fi

# Succeed when $1 is a non-empty string of decimal digits.
isUnsignedInt() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# Print the current readings for the quantity selected by $name, one line per
# GPU, parsed from the nvidia-smi report held in $smiOutput.
# Reads:   name, nGpus, smiOutput
# Returns: 0 on success; 1 when $name is not one of temp, mem or fan
# NOTE(review): the grep patterns ("Gpu", "Memory Usage", "Fan Speed") are the
# ones this plugin has always used; confirm they still match the report layout
# of the installed driver.
readGpuValues() {
  case "$name" in
    temp)
      printf '%s\n' "$smiOutput" \
        | grep -A 1 "Temperature" | grep "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2
      ;;
    mem)
      totalMemGpus=$(printf '%s\n' "$smiOutput" \
        | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
      usedMemGpus=$(printf '%s\n' "$smiOutput" \
        | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        totalMemGpu=$(printf '%s\n' "$totalMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
        usedMemGpu=$(printf '%s\n' "$usedMemGpus" | sed -n "$(( nGpusCounter + 1 ))p")
        # Only divide when both figures were parsed and the total is non-zero.
        # Otherwise report "U" (Munin's marker for an unknown value) instead
        # of aborting on an arithmetic error.
        if isUnsignedInt "$totalMemGpu" && isUnsignedInt "$usedMemGpu" \
          && [ "$totalMemGpu" -gt 0 ]; then
          echo "$(( usedMemGpu * 100 / totalMemGpu ))"
        else
          echo U
        fi
        nGpusCounter=$(( nGpusCounter + 1 ))
      done
      ;;
    fan)
      printf '%s\n' "$smiOutput" \
        | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2
      ;;
    *)
      return 1
      ;;
  esac
  return 0
}

# Get requested value. Entries in $valueGpus are separated by real newlines,
# so splitting them does not depend on whether echo expands "\n".
if ! valueGpus=$(readGpuValues); then
  echo "Can't run without a proper symlink. Exiting." >&2
  echo "Try running munin-node-configure --suggest." >&2
  exit 1
fi


# Print one "<name><index>.value <reading>" line per GPU.
# Reads: name, nGpus, valueGpus (one reading per GPU, in GPU order)
printGpuValues() {
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
    # printf %b expands backslash escapes, so the list is split correctly
    # whether its entries are separated by real newlines or by literal "\n"
    # sequences, and regardless of how the shell's echo treats backslashes.
    value=$(printf '%b\n' "$valueGpus" | sed -n "$(( nGpusCounter + 1 ))p")
    # A missing reading is reported as "U" (unknown) rather than as an empty
    # value.
    printf '%s\n' "${name}${nGpusCounter}.value ${value:-U}"
    nGpusCounter=$(( nGpusCounter + 1 ))
  done
}

# Print requested value
printGpuValues

| 209 | 758ca724 | Nuno Fachada | # TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. |
| 210 | # TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not) |
||
| 211 | |||
| 212 | |||
| 213 | 426bba44 | Nuno Fachada |
