root / plugins / disk / nvme @ 16d38264
Historique | Voir | Annoter | Télécharger (8,94 ko)
| 1 | a2267c05 | Kjetil Torgrim Homme | #! /usr/bin/perl |
|---|---|---|---|
| 2 | # -*- mode: perl; perl-indent-level: 4 -*- |
||
| 3 | cca33b39 | Kjetil Torgrim Homme | |
| 4 | =head1 NAME |
||
| 5 | |||
| 6 | nvme - Munin plugin to monitor the use of NVMe devices |
||
| 7 | |||
| 8 | a2267c05 | Kjetil Torgrim Homme | =head1 APPLICABLE SYSTEMS |
| 9 | |||
| 10 | Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe |
||
| 11 | bus). |
||
| 12 | |||
| 13 | cca33b39 | Kjetil Torgrim Homme | =head1 CONFIGURATION |
| 14 | |||
| 15 | The plugin uses nvme(1) from the nvme-cli project to read status from |
||
| 16 | the NVMe devices. This requires root access. |
||
| 17 | |||
| 18 | [nvme] |
||
| 19 | c7299aeb | Kjetil Torgrim Homme | user root |
| 20 | cca33b39 | Kjetil Torgrim Homme | |
| 21 | c7299aeb | Kjetil Torgrim Homme | When setting alert levels per device, use graph and basename of device |
| 22 | name, e.g., 'nvme0n1', to make environment variable: |
||
| 23 | |||
| 24 | env.nvme_usage_nvme0n1_warning 5: |
||
| 25 | env.nvme_usage_warning 8: |
||
| 26 | cca33b39 | Kjetil Torgrim Homme | |
| 27 | 64089240 | Andreas Perhab | If your device names change on reboot you can also use the labels |
| 28 | (based on serial numbers) to set the warning and critical labels |
||
| 29 | |||
| 30 | env.nvme_usage_SN_1234567_warning 8:101 |
||
| 31 | env.nvme_usage_SN_1234567_critical 5:101 |
||
| 32 | |||
| 33 | cca33b39 | Kjetil Torgrim Homme | =head1 INTERPRETATION |
| 34 | |||
| 35 | a2267c05 | Kjetil Torgrim Homme | This is a multigraph plugin which makes four graphs. |
| 36 | cca33b39 | Kjetil Torgrim Homme | |
| 37 | =head2 nvme_usage |
||
| 38 | |||
| 39 | This reports how much of capacity is allocated in each NVMe |
||
| 40 | "namespace". The report is in percent. This number may not have much |
||
| 41 | relation to actual use, e.g., if deleted data areas have not been |
||
| 42 | trimmed/discarded. |
||
| 43 | |||
| 44 | c7299aeb | Kjetil Torgrim Homme | Default warning and critical: '95', '98' |
| 45 | |||
| 46 | cca33b39 | Kjetil Torgrim Homme | =head2 nvme_bytes |
| 47 | |||
| 48 | This reports read and write activity on each NVMe device, in bytes per |
||
| 49 | second. Ideally there should be much more read than write. If they |
||
| 50 | are symmetrical, you are using your NVMe as a very expensive FIFO, and |
||
| 51 | if you write more than you read, you should probably look for archival |
||
| 52 | storage instead. |
||
| 53 | |||
| 54 | It is a good idea to compare these numbers to I/O counters from |
||
| 55 | a2267c05 | Kjetil Torgrim Homme | diskstats. If they are much higher, look into whether the write |
| 56 | cca33b39 | Kjetil Torgrim Homme | amplification can be due to suboptimal I/O request sizes. |
| 57 | |||
| 58 | c7299aeb | Kjetil Torgrim Homme | This graph does not support alerting. |
| 59 | |||
| 60 | cca33b39 | Kjetil Torgrim Homme | =head2 nvme_writecycles |
| 61 | |||
| 62 | This graph is intended to give an indication of how much life there |
||
| 63 | is left in your NVMe. It calculates the number of bytes written |
||
| 64 | during each device's lifetime against the capacity of the device, |
||
| 65 | thereby getting an average number of write cycles each cell has |
||
| 66 | experienced. |
||
| 67 | |||
| 68 | A prosumer NVMe will handle a few thousand writes to each cell before |
||
| 69 | the error rate gets out of hand. |
||
| 70 | |||
| 71 | c7299aeb | Kjetil Torgrim Homme | No default values for warning and critical. |
| 72 | |||
| 73 | =head2 nvme_spare |
||
| 74 | |||
| 75 | Every NVMe device has set aside reserve space to remap media errors. This |
||
| 76 | graphs how much is left in percent, taken directly from smart-log |
||
| 77 | output. |
||
| 78 | |||
| 79 | Default warning and critical: '10:', '3:' |
||
| 80 | |||
| 81 | cca33b39 | Kjetil Torgrim Homme | =head1 MAGIC MARKERS |
| 82 | |||
| 83 | #%# family=auto |
||
| 84 | #%# capabilities=autoconf |
||
| 85 | |||
| 86 | =head1 BUGS |
||
| 87 | |||
| 88 | None known. |
||
| 89 | |||
| 90 | =head1 VERSION |
||
| 91 | |||
| 92 | c7299aeb | Kjetil Torgrim Homme | 1.1 |
| 93 | cca33b39 | Kjetil Torgrim Homme | |
| 94 | =head1 AUTHOR |
||
| 95 | |||
| 96 | Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com> |
||
| 97 | |||
| 98 | =head1 LICENSE |
||
| 99 | |||
| 100 | GPLv2 |
||
| 101 | |||
| 102 | =cut |
||
| 103 | |||
| 104 | use strict; |
||
| 105 | use Munin::Plugin; |
||
| 106 | a2267c05 | Kjetil Torgrim Homme | use IPC::Cmd qw(can_run); |
| 107 | c7299aeb | Kjetil Torgrim Homme | use File::Basename; |
| 108 | cca33b39 | Kjetil Torgrim Homme | |
| 109 | # Check that multigraph is supported |
||
| 110 | need_multigraph(); |
||
| 111 | |||
# Diagnose why the plugin cannot work.
#
# Returns nothing (undef in scalar context) when can_run() finds the
# nvme(1) executable.  Otherwise returns a short explanation:
#   "missing nvme(1)" - /proc/modules lists a module whose name starts
#                       with "nvme", so only the command line tool is absent;
#   "missing nvme"    - no such module is listed, or /proc/modules could
#                       not be opened.
sub autoconf_problem {
    return if can_run('nvme');

    # Three-argument open with a lexical line variable: the mode is
    # explicit and the caller's $_ is left untouched.
    if (open(my $mods, '<', '/proc/modules')) {
        while (my $module = <$mods>) {
            if ($module =~ /^nvme[^a-z]/) {
                close($mods);
                return "missing nvme(1)";
            }
        }
        close($mods);
    }
    return "missing nvme"; # vague message for non-Linux
}
||
| 123 | |||
# Run "nvme" with the given arguments and return its output lines.
#
# The command is started in list form (no shell).  An empty list is
# returned when can_run() does not find nvme or the pipe cannot be
# opened.  If the command exits with a non-zero status while the
# effective user is not root, a hint about needing root is written to
# STDERR.
sub run_nvme {
    my @args = @_;
    my @output;

    return @output unless can_run('nvme');

    if (open(my $pipe, '-|', 'nvme', @args)) {
        @output = <$pipe>;
        close($pipe);
        # close() on a pipe stores the child's exit status in $?.
        if ($? && $> != 0) {
            warn "nvme: probably needs to run as user root\n";
        }
    }
    return @output;
}
||
| 134 | |||
# Convert a human-readable size such as "695.50  GB" into a byte count.
#
# Argument: a string holding a decimal number followed by an optional
#           SI prefix (k, K, M, G, T, P or E) and the letter "B".
#           Whitespace between the parts is optional, so "0.00   B"
#           and "2TB" are both accepted.
# Returns:  the size in bytes, truncated to an integer; 0 when the
#           string is undefined or cannot be parsed.
sub human_to_bytes {
    my ($str) = @_;
    my %multiplier_for = (
        ''  => 1,
        k   => 1000,
        K   => 1000,
        M   => 1000_000,
        G   => 1000_000_000,
        T   => 1000_000_000_000,
        P   => 1000_000_000_000_000, # I wish I had need for this
        E   => 1000_000_000_000_000_000,
    );

    # Bail out on a failed match: otherwise $1 and the unit capture
    # would still hold whatever the caller's last successful match
    # captured.
    return 0 unless defined $str;
    return 0 unless $str =~ /(\d+(?:\.\d+)?)\s*([kKMGTPE]?)B/;

    my ($number, $prefix) = ($1, $2);
    return int($number * $multiplier_for{$prefix});
}
||
| 147 | |||
# Parse the table printed by "nvme list".
#
# Returns a reference to a hash keyed by "SN_<serial number>".  Each
# value is a hash with the keys device, sn, model, namespace, usage and
# capacity; usage and capacity are converted with human_to_bytes().
#
# A warning is written to STDERR when there was output but either the
# header line was never seen or a line after the second one could not
# be parsed as a device.
sub nvme_list {
    # Expected layout of the output:
    # Node SN Model Namespace Usage Format FW Rev
    # ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
    # /dev/nvme1n1 S464NB0K601188N Samsung SSD 970 EVO 2TB 1 695.50 GB / 2.00 TB 512 B + 0 B 1B2QEXE7
    my %devices;

    my $recognised_output;
    my $lineno = 0;

    for my $line (run_nvme('list')) {
        ++$lineno;

        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            ++$recognised_output;
            next;
        }

        if ($line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            # Copy the captures before calling anything else.
            my ($node, $serial, $model, $namespace, $used, $size)
                = ($1, $2, $3, $4, $5, $6);
            $devices{'SN_'.$serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $namespace,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($size),
            };
            next;
        }

        # Lines 1 and 2 are allowed to be the header and its ruler;
        # anything unparsable after that invalidates the output.
        $recognised_output = 0 if $lineno > 2;
    }

    if ($lineno && !$recognised_output) {
        warn "Could not recognise output from 'nvme list', please report\n";
    }
    return \%devices;
}
||
| 179 | |||
# Read "nvme smart-log <device>" into a hash.
#
# Argument: the device to query, passed on to nvme unchanged.
# Returns:  a reference to a hash built from the "name : value" lines.
#           Keys are lower-cased with whitespace replaced by
#           underscores.  Values made up solely of comma-grouped digits
#           (e.g. "1,234,567") have the commas removed; all other
#           values are stored as printed.
sub smart_log {
    my ($dev) = @_;
    my %info;

    for my $line (run_nvme('smart-log', $dev)) {
        # Skip the heading line.
        next if $line =~ /^Smart Log/;
        next unless $line =~ /(.*?)\s+:\s+(.*)/;

        my ($key, $value) = ($1, $2);
        $key =~ s/\s/_/g;
        $value =~ s/,//g if $value =~ /^\d+(,\d\d\d)+$/;
        $info{lc $key} = $value;
    }
    return \%info;
}
||
| 196 | |||
# Print the ".warning" and ".critical" config lines for one field.
#
# Arguments:
#   $label        - field name; also used in the label-based setting names
#   $graph        - graph name, the prefix of every setting name
#   $device       - device path; only its basename is used
#   $warn_default - fallback warning threshold (may be undef)
#   $crit_default - fallback critical threshold (may be undef)
#
# get_thresholds() is called twice: first with the
# "<graph>_<label>_warning/critical" names and the given defaults, then
# with the "<graph>_<basename>_warning/critical" names, using the result
# of the first call as defaults.  A line is only printed for a
# threshold that comes back defined.
# NOTE(review): get_thresholds comes from Munin::Plugin; its lookup
# rules are not visible here - confirm against its documentation.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;
    my $short_name = basename($device);

    my ($label_warning, $label_critical) = get_thresholds(
        $graph,
        "${graph}_${label}_warning",
        "${graph}_${label}_critical",
        $warn_default,
        $crit_default,
    );
    my ($warning, $critical) = get_thresholds(
        $graph,
        "${graph}_${short_name}_warning",
        "${graph}_${short_name}_critical",
        $label_warning,
        $label_critical,
    );

    if (defined $warning) {
        print "${label}.warning $warning\n";
    }
    if (defined $critical) {
        print "${label}.critical $critical\n";
    }
}
||
| 207 | |||
| 208 | cca33b39 | Kjetil Torgrim Homme | use Data::Dumper; |
| 209 | |||
# Extract the leading number from a smart-log value.
#
# Tolerates thousands separators and trailing text such as a "%" sign
# or a parenthesised remark.  Returns the number without commas, or
# nothing (undef in scalar context) when the value is undefined or does
# not start with a digit.
sub _smart_number {
    my ($value) = @_;
    return unless defined $value;
    return unless $value =~ /^\s*(\d[\d,]*(?:\.\d+)?)/;
    (my $number = $1) =~ s/,//g;
    return $number;
}

# Main program.  The first command line argument selects the mode:
#   autoconf - print "yes" when at least one device was found, otherwise
#              "no (<reason>)"; always exits with status 0
#   config   - print the definitions of the four graphs
#   other    - print the current values for the four graphs
my $mode = ($ARGV[0] or "print");

# The smart-log data unit counters are in thousands of 512 byte blocks.
my $DATA_UNIT_BYTES = 512_000;

my $problem = autoconf_problem();
my $list = nvme_list();

if ($mode eq 'autoconf') {
    if (keys %{$list}) {
        print "yes\n";
    } else {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    exit 0;
}

my @sn = sort keys %{$list};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);
    # The nvme_bytes graph has two fields per device, so it needs its
    # own ordering list.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The heredocs below must interpolate, otherwise "graph_order"
    # would contain the literal text '$sn_list'.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped on purpose: it has to reach Munin
    # literally, which substitutes it itself.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    # Any value that cannot be computed is reported as "U" (unknown)
    # instead of an empty string or a division by zero.
    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart = $list->{$sn}->{smart};
        my $read_units    = _smart_number($smart->{data_units_read});
        my $written_units = _smart_number($smart->{data_units_written});
        # Convert data units to bytes so the values match the graph's
        # "bytes" label.
        my $rbytes = defined $read_units    ? $read_units * $DATA_UNIT_BYTES    : 'U';
        my $wbytes = defined $written_units ? $written_units * $DATA_UNIT_BYTES : 'U';
        print "${sn}_r.value $rbytes\n";
        print "${sn}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        my $written_units = _smart_number($info->{smart}->{data_units_written});
        my $cycles = (defined $written_units && $info->{capacity})
            ? $written_units * $DATA_UNIT_BYTES / $info->{capacity}
            : 'U';
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        # available_spare is printed as a percentage, e.g. "100%".
        my $spare = _smart_number($list->{$sn}->{smart}->{available_spare});
        $spare = 'U' unless defined $spare;
        print "$sn.value $spare\n";
    }
}
