root / plugins / disk / nvme @ 16d38264
Historique | Voir | Annoter | Télécharger (8,94 ko)
| 1 |
#! /usr/bin/perl |
|---|---|
| 2 |
# -*- mode: perl; perl-indent-level: 4 -*- |
| 3 |
|
| 4 |
=head1 NAME |
| 5 |
|
| 6 |
nvme - Munin plugin to monitor the use of NVMe devices |
| 7 |
|
| 8 |
=head1 APPLICABLE SYSTEMS |
| 9 |
|
| 10 |
Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe |
| 11 |
bus). |
| 12 |
|
| 13 |
=head1 CONFIGURATION |
| 14 |
|
| 15 |
The plugin uses nvme(1) from the nvme-cli project to read status from |
| 16 |
the NVMe devices. This requires root access. |
| 17 |
|
| 18 |
[nvme] |
| 19 |
user root |
| 20 |
|
| 21 |
When setting alert levels per device, use graph and basename of device |
| 22 |
name, e.g., 'nvme0n1', to make environment variable: |
| 23 |
|
| 24 |
env.nvme_usage_nvme0n1_warning 5: |
| 25 |
env.nvme_usage_warning 8: |
| 26 |
|
| 27 |
If your device names change on reboot you can also use the labels |
| 28 |
(based on serial numbers) to set the warning and critical labels |
| 29 |
|
| 30 |
env.nvme_usage_SN_1234567_warning 8:101 |
| 31 |
env.nvme_usage_SN_1234567_critical 5:101 |
| 32 |
|
| 33 |
=head1 INTERPRETATION |
| 34 |
|
| 35 |
This is a multigraph plugin which makes four graphs. |
| 36 |
|
| 37 |
=head2 nvme_usage |
| 38 |
|
| 39 |
This reports how much of capacity is allocated in each NVMe |
| 40 |
"namespace". The report is in percent. This number may not have much |
| 41 |
relation to actual use, e.g., if deleted data areas have not been |
| 42 |
trimmed/discarded. |
| 43 |
|
| 44 |
Default warning and critical: '95', '98' |
| 45 |
|
| 46 |
=head2 nvme_bytes |
| 47 |
|
| 48 |
This reports read and write activity on each NVMe device, in bytes per |
| 49 |
second. Ideally there should be much more read than write. If they |
| 50 |
are symmetrical, you are using your NVMe as a very expensive FIFO, and |
| 51 |
if you write more than you read, you should probably look for archival |
| 52 |
storage instead. |
| 53 |
|
| 54 |
It is a good idea to compare these numbers to I/O counters from |
| 55 |
diskstats. If they are much higher, look into whether the write |
| 56 |
amplification can be due to suboptimal I/O request sizes. |
| 57 |
|
| 58 |
This graph does not support alerting. |
| 59 |
|
| 60 |
=head2 nvme_writecycles |
| 61 |
|
| 62 |
This graph is intended to give an indication of how much life there |
| 63 |
is left in your NVMe. It calculates the number of bytes written |
| 64 |
during each device's lifetime against the capacity of the device, |
| 65 |
thereby getting an average number of write cycles each cell has |
| 66 |
experienced. |
| 67 |
|
| 68 |
A prosumer NVMe will handle a few thousand writes to each cell before |
| 69 |
the error rate gets out of hand. |
| 70 |
|
| 71 |
No default values for warning and critical. |
| 72 |
|
| 73 |
=head2 nvme_spare |
| 74 |
|
| 75 |
All NVMe devices set aside reserve space to remap media errors. This |
| 76 |
graphs how much is left in percent, taken directly from smart-log |
| 77 |
output. |
| 78 |
|
| 79 |
Default warning and critical: '10:', '3:' |
| 80 |
|
| 81 |
=head1 MAGIC MARKERS |
| 82 |
|
| 83 |
#%# family=auto |
| 84 |
#%# capabilities=autoconf |
| 85 |
|
| 86 |
=head1 BUGS |
| 87 |
|
| 88 |
None known. |
| 89 |
|
| 90 |
=head1 VERSION |
| 91 |
|
| 92 |
1.1 |
| 93 |
|
| 94 |
=head1 AUTHOR |
| 95 |
|
| 96 |
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com> |
| 97 |
|
| 98 |
=head1 LICENSE |
| 99 |
|
| 100 |
GPLv2 |
| 101 |
|
| 102 |
=cut |
| 103 |
|
| 104 |
use strict; |
| 105 |
use Munin::Plugin; |
| 106 |
use IPC::Cmd qw(can_run); |
| 107 |
use File::Basename; |
| 108 |
|
| 109 |
# Check that multigraph is supported |
| 110 |
need_multigraph(); |
| 111 |
|
| 112 |
# Return undef if no problem, otherwise explanation |
| 113 |
# Explain why the plugin cannot work on this host.
#
# Returns nothing (undef in scalar context) when the nvme executable is
# found in PATH.  Otherwise returns a short explanation string:
#   "missing nvme(1)" - /proc/modules lists a module whose name starts
#                       with "nvme", so only the CLI tool is missing;
#   "missing nvme"    - no such module line was found, or /proc/modules
#                       could not be opened (e.g. non-Linux systems).
sub autoconf_problem {
    return if can_run('nvme');

    my $module_loaded = 0;
    if (open(my $mods, '<', '/proc/modules')) {
        # Read into a lexical so the caller's $_ is left untouched.
        while (my $line = <$mods>) {
            if ($line =~ /^nvme[^a-z]/) {
                $module_loaded = 1;
                last;
            }
        }
        close($mods);
    }

    return $module_loaded
        ? "missing nvme(1)"
        : "missing nvme";    # vague message for non-Linux
}
| 123 |
|
| 124 |
# Run "nvme" with the given arguments and hand back its output lines.
#
# The command is started without a shell (list-form pipe open).  When
# nvme is not in PATH, or the pipe cannot be opened, the result is an
# empty list.  A non-zero exit status while not running as uid 0
# produces a hint on STDERR that root access is probably required.
sub run_nvme {
    my @args = @_;
    my @output;

    return @output unless can_run('nvme');
    open(my $pipe, '-|', 'nvme', @args) or return @output;

    @output = <$pipe>;
    close($pipe);
    if ($? && $> != 0) {
        warn "nvme: probably needs to run as user root\n";
    }
    return @output;
}
| 134 |
|
| 135 |
# Convert a human-readable size such as "695.50 GB", "2.00 TB" or
# "0.00   B" into a whole number of bytes.
#
# Units are decimal (SI) multiples, matching the original table; a bare
# "B" means bytes.  The first size found anywhere in the string is used.
# Returns 0 when the argument is undefined or contains no recognisable
# size, instead of computing a result from stale regex captures.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        ''  => 1,
        'k' => 1000,
        'K' => 1000,
        'M' => 1000_000,
        'G' => 1000_000_000,
        'T' => 1000_000_000_000,
        'P' => 1000_000_000_000_000,    # I wish I had need for this
    );

    return 0 unless defined $str;
    return 0 unless $str =~ /(\d+(?:\.\d+)?)\s*([kKMGTP]?)B/;

    return int($1 * $units{$2});
}
| 147 |
|
| 148 |
# Parse the table printed by "nvme list" into a hash reference keyed by
# "SN_<serial number>".  Each entry holds the device node, serial
# number, model, namespace id, and the usage and capacity converted to
# bytes.  Expected input looks like:
#
# Node          SN               Model                    Namespace Usage                  Format       FW Rev
# ------------- ---------------- ------------------------ --------- ---------------------- ------------ --------
# /dev/nvme1n1  S464NB0K601188N  Samsung SSD 970 EVO 2TB  1         695.50 GB / 2.00 TB    512 B + 0 B  1B2QEXE7
#
# A warning is written to STDERR when there was output but either the
# header was never seen or a line after the first two could not be
# parsed.
sub nvme_list {
    my %devices;

    my $header_ok;
    my $rows_seen = 0;
    foreach my $row (run_nvme('list')) {
        $rows_seen++;

        if ($row =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            ++$header_ok;
            next;
        }

        if (my @field = $row =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            my ($node, $serial, $model, $nsid, $used, $total) = @field;
            $devices{'SN_' . $serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $nsid,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($total),
            };
            next;
        }

        # Past the header and separator: a line we cannot parse means
        # the output format is not the one we know.
        $header_ok = 0 if $rows_seen > 2;
    }

    warn "Could not recognise output from 'nvme list', please report\n"
        if $rows_seen && !$header_ok;

    return \%devices;
}
| 179 |
|
| 180 |
# Collect the "name : value" pairs printed by "nvme smart-log <dev>".
#
# Returns a hash reference.  Keys are the names with every whitespace
# character turned into "_" and then lower-cased.  Values are kept as
# printed, except that a number written purely as comma-separated
# groups of three digits (e.g. "1,234,567") has its commas removed.
# The "Smart Log ..." banner and lines without " : " are skipped.
sub smart_log {
    my ($dev) = @_;
    my %smart;

    foreach my $line (run_nvme('smart-log', $dev)) {
        next if $line =~ /^Smart Log/;

        my ($name, $reading) = $line =~ /(.*?)\s+:\s+(.*)/
            or next;

        $name =~ s/\s/_/g;
        $reading =~ s/,//g if $reading =~ /^\d+(,\d\d\d)+$/;

        $smart{lc $name} = $reading;
    }

    return \%smart;
}
| 196 |
|
| 197 |
# Print the ".warning" and ".critical" config lines for one field.
#
# Thresholds are resolved in two passes through get_thresholds(): first
# under the field label (e.g. "SN_1234567"), using the supplied
# defaults, then under the basename of the device node (e.g.
# "nvme0n1"), using the first pass's result as its defaults.  A line is
# only printed when the resulting threshold is defined.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my ($warn, $crit) = ($warn_default, $crit_default);
    foreach my $id ($label, basename($device)) {
        ($warn, $crit) = get_thresholds(
            $graph,
            "${graph}_${id}_warning",
            "${graph}_${id}_critical",
            $warn, $crit,
        );
    }

    print "${label}.warning $warn\n" if defined $warn;
    print "${label}.critical $crit\n" if defined $crit;
}
| 207 |
|
| 208 |
use Data::Dumper; |
| 209 |
|
| 210 |
# The first command line argument selects the action; with no argument
# the plugin prints values.
my $mode = $ARGV[0] || "print";

my $problem = autoconf_problem();
my $list    = nvme_list();

# autoconf: answer "yes" when at least one device was listed, otherwise
# "no" together with the most specific reason available.
if ($mode eq 'autoconf') {
    if (%{$list}) {
        print "yes\n";
    }
    else {
        my $reason = $problem || "no devices to monitor";
        printf("no (%s)\n", $reason);
    }
    exit 0;
}
| 223 |
|
| 224 |
# Field names ("SN_<serial>") in a stable order for all graphs.
my @sn = sort keys %{$list};

# The write-cycle calculation below treats one SMART "data unit" as
# 1000 blocks of 512 bytes; the byte graph uses the same factor.
my $DATA_UNIT_BYTES = 512_000;

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # The byte graph publishes two fields per device (<sn>_r, <sn>_w),
    # so its graph_order has to name those rather than the bare <sn>.
    my $bytes_order = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The graph headers use interpolating heredocs so that the field
    # list is actually expanded into graph_order.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
${sn}.label $device used
${sn}.type GAUGE
${sn}.max 100
${sn}.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped: it must reach Munin literally.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $bytes_order
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};

        # The fetched values are raw data units; the cdef scales them
        # to bytes at graph time so the stored counters keep their
        # meaning.
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_r.cdef ${sn}_r,$DATA_UNIT_BYTES,*
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
${sn}_w.cdef ${sn}_w,$DATA_UNIT_BYTES,*
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
${sn}.label $device write cycles
${sn}.type GAUGE
${sn}.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
${sn}.label $device spare capacity
${sn}.type GAUGE
${sn}.min 0
${sn}.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
}
else {
    # Extract the leading integer of a SMART counter, dropping any
    # thousands separators and anything that follows the number.
    # Returns undef when the value is missing or does not start with
    # digits.
    my $as_count = sub {
        my ($raw) = @_;
        return undef
            unless defined $raw && $raw =~ /^(\d+(?:,\d{3})*)(?!\d)/;
        (my $digits = $1) =~ tr/,//d;
        return $digits;
    };

    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    # Wherever a value cannot be computed, "U" (unknown) is reported
    # instead of an empty value or a fatal division by zero.
    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "${sn}.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart   = $list->{$sn}->{smart};
        my $read    = $as_count->($smart->{data_units_read});
        my $written = $as_count->($smart->{data_units_written});
        $read    = 'U' unless defined $read;
        $written = 'U' unless defined $written;
        print "${sn}_r.value $read\n";
        print "${sn}_w.value $written\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info    = $list->{$sn};
        my $written = $as_count->($info->{smart}->{data_units_written});

        # The unit size reported is 1000 blocks.
        my $cycles = (defined $written && $info->{capacity})
            ? $written * $DATA_UNIT_BYTES / $info->{capacity}
            : 'U';
        print "${sn}.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        # available_spare is a percentage such as "100%"; drop the sign.
        my $spare = $list->{$sn}->{smart}->{available_spare};
        if (defined $spare) {
            $spare =~ s/%//;
        }
        $spare = 'U' unless defined $spare && length $spare;
        print "${sn}.value $spare\n";
    }
}
