Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / disk / nvme @ 16d38264

Historique | Voir | Annoter | Télécharger (8,94 ko)

1 a2267c05 Kjetil Torgrim Homme
#! /usr/bin/perl
2
# -*- mode: perl; perl-indent-level: 4 -*-
3 cca33b39 Kjetil Torgrim Homme
4
=head1 NAME
5
6
nvme - Munin plugin to monitor the use of NVMe devices
7
8 a2267c05 Kjetil Torgrim Homme
=head1 APPLICABLE SYSTEMS
9
10
Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
11
bus).
12
13 cca33b39 Kjetil Torgrim Homme
=head1 CONFIGURATION
14
15
The plugin uses nvme(1) from the nvme-cli project to read status from
16
the NVMe devices.  This requires root access.
17
18
  [nvme]
19 c7299aeb Kjetil Torgrim Homme
    user root
20 cca33b39 Kjetil Torgrim Homme
21 c7299aeb Kjetil Torgrim Homme
When setting alert levels per device, use graph and basename of device
22
name, e.g., 'nvme0n1', to make environment variable:
23
24
    env.nvme_usage_nvme0n1_warning 5:
25
    env.nvme_usage_warning 8:
26 cca33b39 Kjetil Torgrim Homme
27 64089240 Andreas Perhab
If your device names change on reboot you can also use the labels
28
(based on serial numbers) to set the warning and critical levels
29
30
    env.nvme_usage_SN_1234567_warning 8:101
31
    env.nvme_usage_SN_1234567_critical 5:101
32
33 cca33b39 Kjetil Torgrim Homme
=head1 INTERPRETATION
34
35 a2267c05 Kjetil Torgrim Homme
This is a multigraph plugin which makes four graphs.
36 cca33b39 Kjetil Torgrim Homme
37
=head2 nvme_usage
38
39
This reports how much of the capacity is allocated in each NVMe
40
"namespace".  The report is in percent.  This number may not have much
41
relation to actual use, e.g., if deleted data areas have not been
42
trimmed/discarded.
43
44 c7299aeb Kjetil Torgrim Homme
Default warning and critical: '95', '98'
45
46 cca33b39 Kjetil Torgrim Homme
=head2 nvme_bytes
47
48
This reports read and write activity on each NVMe device, in bytes per
49
second.  Ideally there should be much more read than write.  If they
50
are symmetrical, you are using your NVMe as a very expensive FIFO, and
51
if you write more than you read, you should probably look for archival
52
storage instead.
53
54
It is a good idea to compare these numbers to I/O counters from
55 a2267c05 Kjetil Torgrim Homme
diskstats.  If they are much higher, look into whether the write
56 cca33b39 Kjetil Torgrim Homme
amplification can be due to suboptimal I/O request sizes.
57
58 c7299aeb Kjetil Torgrim Homme
This graph does not support alerting.
59
60 cca33b39 Kjetil Torgrim Homme
=head2 nvme_writecycles
61
62
This graph is intended to give an indication of how much life there
63
is left in your NVMe.  It calculates the number of bytes written
64
during each device's lifetime against the capacity of the device,
65
thereby getting an average number of write cycles each cell has
66
experienced.
67
68
A prosumer NVMe will handle a few thousand writes to each cell before
69
the error rate gets out of hand.
70
71 c7299aeb Kjetil Torgrim Homme
No default values for warning and critical.
72
73
=head2 nvme_spare
74
75
All NVMe devices set aside reserve space to remap media errors.  This
76
graphs how much is left in percent, taken directly from smart-log
77
output.
78
79
Default warning and critical: '10:', '3:'
80
81 cca33b39 Kjetil Torgrim Homme
=head1 MAGIC MARKERS
82
83
  #%# family=auto
84
  #%# capabilities=autoconf
85
86
=head1 BUGS
87
88
None known.
89
90
=head1 VERSION
91
92 c7299aeb Kjetil Torgrim Homme
  1.1
93 cca33b39 Kjetil Torgrim Homme
94
=head1 AUTHOR
95
96
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
97
98
=head1 LICENSE
99
100
GPLv2
101
102
=cut
103
104
use strict;
105
use Munin::Plugin;
106 a2267c05 Kjetil Torgrim Homme
use IPC::Cmd qw(can_run);
107 c7299aeb Kjetil Torgrim Homme
use File::Basename;
108 cca33b39 Kjetil Torgrim Homme
109
# Check that multigraph is supported
110
# need_multigraph is not defined in this file.
# NOTE(review): presumably exported by Munin::Plugin and stops the plugin
# when the master lacks multigraph capability -- confirm.
need_multigraph();
111
112 a2267c05 Kjetil Torgrim Homme
# Explain why the plugin cannot work on this host.
#
# Returns nothing (undef in scalar context) when can_run() finds the
# nvme binary.  Otherwise returns a short explanation string:
#   "missing nvme(1)" - a line of /proc/modules begins with "nvme"
#                       followed by a character other than a lowercase
#                       letter, so only the command-line tool is absent;
#   "missing nvme"    - no such line was found, or /proc/modules could
#                       not be opened.
sub autoconf_problem {
    return if can_run('nvme');

    # Three-argument open keeps the mode separate from the file name.
    if (open(my $mods, '<', '/proc/modules')) {
        while (my $line = <$mods>) {
            if ($line =~ /^nvme[^a-z]/) {
                close($mods);
                return "missing nvme(1)";
            }
        }
        close($mods);
    }
    return "missing nvme";    # vague message for non-Linux
}
123
124 cca33b39 Kjetil Torgrim Homme
# Invoke the nvme command-line tool with the given arguments and hand
# back everything it wrote to standard output, one element per line.
# The result is empty when can_run() does not find the binary or when
# the pipe cannot be opened.
sub run_nvme {
    my @args = @_;
    my @output;

    if (can_run('nvme')) {
        # List-form pipe open: the command and its arguments are given
        # as separate elements.
        if (open(my $pipe, '-|', 'nvme', @args)) {
            @output = <$pipe>;
            close($pipe);

            # A non-zero status while not running with effective uid 0
            # most likely means a permission problem.
            if ($? && $> != 0) {
                warn "nvme: probably needs to run as user root\n";
            }
        }
    }
    return @output;
}
134
135
# Convert a human-readable size such as "695.50  GB" or "512   B" into a
# whole number of bytes, using decimal (power of 1000) multipliers.
#
# Parameter: $str - text containing a number followed by a unit (B, kB,
#                   MB, GB, TB or PB).
# Returns:   the size in bytes, truncated to an integer, or 0 when no
#            size can be recognised in $str.
sub human_to_bytes {
    my ($str) = @_;
    my %multiplier_for = (
        ''  => 1,
        'k' => 1000,
        'K' => 1000,
        'M' => 1000_000,
        'G' => 1000_000_000,
        'T' => 1000_000_000_000,
        'P' => 1000_000_000_000_000,    # I wish I had need for this
    );

    # Bail out explicitly on a failed match; otherwise $1/$2 would still
    # hold whatever the caller's last successful match captured.
    return 0 unless defined $str;
    return 0 unless $str =~ /(\d+(?:\.\d+)?)\s*([kKMGTP]?)B/;

    my ($number, $prefix) = ($1, $2);
    return int($number * $multiplier_for{$prefix});
}
147
148
# Parse the output of "nvme list" into a hash of devices.
#
# Returns a reference to a hash keyed by "SN_<serial number>" (with any
# character outside [A-Za-z0-9_] replaced by "_", so the key can be used
# as a field name).  Each value is a hash reference with the keys
# device, sn, model, namespace, usage and capacity; usage and capacity
# are converted with human_to_bytes().
#
# Emits a warning when output was produced but the header line was not
# seen, or when a line after the first two could not be parsed.
sub nvme_list {
    # Node             SN                   Model                                    Namespace Usage                      Format           FW Rev
    # ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
    # /dev/nvme1n1     S464NB0K601188N      Samsung SSD 970 EVO 2TB                  1         695.50  GB /   2.00  TB    512   B +  0 B   1B2QEXE7
    my %devices;

    my $recognised_output;
    my $lineno = 0;
    for my $line (run_nvme('list')) {
        ++$lineno;
        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            ++$recognised_output;
        } elsif ($line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            # Copy the captures right away; the helper calls below run
            # their own regex matches.
            my ($device, $sn, $model, $namespace, $usage, $capacity)
                = ($1, $2, $3, $4, $5, $6);

            # Serial numbers may contain characters (e.g. "-") that are
            # not usable in a field name; map them to underscores.
            (my $field = 'SN_' . $sn) =~ tr/A-Za-z0-9_/_/c;

            $devices{$field} = {
                device    => $device,
                sn        => $sn,
                model     => $model,
                namespace => $namespace,
                usage     => human_to_bytes($usage),
                capacity  => human_to_bytes($capacity),
            };
        } elsif ($lineno > 2) {
            # could not parse device information
            $recognised_output = 0;
        }
    }
    if ($lineno && !$recognised_output) {
        warn "Could not recognise output from 'nvme list', please report\n";
    }
    return \%devices;
}
179
180
# Read "nvme smart-log <device>" and return its "name : value" pairs.
#
# Parameter: $dev - device node to pass to nvme smart-log.
# Returns:   a hash reference.  Keys are the attribute names lowercased
#            with whitespace turned into underscores.  Purely numeric
#            values have their thousands separators removed; other
#            values are stored as printed.
sub smart_log {
    my ($dev) = @_;
    my %info;
    for my $line (run_nvme('smart-log', $dev)) {
        next if $line =~ /^Smart Log/;
        next unless $line =~ /(.*?)\s+:\s+(.*)/;

        my ($var, $value) = ($1, $2);
        $var =~ s/\s/_/g;

        # Accept "1,234,567" as well as "1,234,567 (632.10 GB)": some
        # nvme-cli versions appear to append a human-readable size in
        # parentheses, which would otherwise leave the commas in place
        # and make the value numify to its first digit group.
        if ($value =~ /^(\d+(?:,\d\d\d)*)(?:\s+\(.*\))?\s*$/) {
            ($value = $1) =~ tr/,//d;
        }
        $info{lc $var} = $value;
    }
    return \%info;
}
196
197 c7299aeb Kjetil Torgrim Homme
# Print "<label>.warning" / "<label>.critical" lines for one field.
#
# Thresholds are looked up twice through get_thresholds(): first under
# names built from the graph and the field label, with the caller's
# defaults; then under names built from the graph and the basename of
# the device, with the result of the first lookup in the default
# positions.  A line is printed only for a threshold that ends up defined.
# NOTE(review): get_thresholds is not defined in this file (presumably
# Munin::Plugin) -- its exact lookup rules should be confirmed there.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my $short_name = basename($device);

    my ($label_warn, $label_crit) = get_thresholds(
        $graph,
        "${graph}_${label}_warning",
        "${graph}_${label}_critical",
        $warn_default,
        $crit_default,
    );

    my ($final_warn, $final_crit) = get_thresholds(
        $graph,
        "${graph}_${short_name}_warning",
        "${graph}_${short_name}_critical",
        $label_warn,
        $label_crit,
    );

    if (defined $final_warn) {
        print "${label}.warning $final_warn\n";
    }
    if (defined $final_crit) {
        print "${label}.critical $final_crit\n";
    }
}
207
208 cca33b39 Kjetil Torgrim Homme
use Data::Dumper;
209
210
# The first command-line argument selects the action: 'autoconf',
# 'config', or anything else (default "print") to output values.
my $mode = $ARGV[0] || "print";

my $problem = autoconf_problem();
my $list    = nvme_list();

# autoconf: answer "yes" when at least one device was listed, otherwise
# "no" with the most specific reason available, then stop.
if ($mode eq 'autoconf') {
    my $device_count = keys %{$list};
    if ($device_count == 0) {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    else {
        print "yes\n";
    }
    exit 0;
}

# Field names (the keys built by nvme_list), in a stable order.
my @sn = sort keys %{$list};
225
226
# 'config' prints the graph and field definitions for the four graphs;
# any other mode prints the current values.  Values that cannot be
# determined are reported as "U".
if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # nvme_bytes has two fields per device, so it needs its own order.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # Interpolating heredocs: $sn_list must be expanded, not printed
    # literally.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped so that it reaches the output verbatim.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        print <<"EOF";
${sn}_r.label $list->{$sn}->{device}
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_w.label $list->{$sn}->{device}
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    # The unit size reported by smart-log is 1000 blocks of 512 bytes.
    my $bytes_per_data_unit = 512_000;

    # Leading integer of a smart-log value, without thousands separators
    # or trailing text such as "%"; undef when there is none.
    my $leading_number = sub {
        my ($raw) = @_;
        return unless defined $raw && $raw =~ /^(\d[\d,]*)/;
        (my $digits = $1) =~ tr/,//d;
        return $digits;
    };

    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};

        # Guard against a zero/unknown capacity instead of dying on a
        # division by zero.
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart  = $list->{$sn}->{smart};
        my $runits = $leading_number->($smart->{data_units_read});
        my $wunits = $leading_number->($smart->{data_units_written});

        # The graph is labelled in bytes, so convert data units to bytes.
        my $rbytes = defined $runits ? $runits * $bytes_per_data_unit : 'U';
        my $wbytes = defined $wunits ? $wunits * $bytes_per_data_unit : 'U';
        print "${sn}_r.value $rbytes\n";
        print "${sn}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info   = $list->{$sn};
        my $wunits = $leading_number->($info->{smart}->{data_units_written});

        # Lifetime bytes written divided by device capacity.
        my $cycles = (defined $wunits && $info->{capacity})
            ? $wunits * $bytes_per_data_unit / $info->{capacity}
            : 'U';
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};

        # available_spare is printed as a percentage, e.g. "100%".
        my $spare = $leading_number->($info->{smart}->{available_spare});
        $spare = 'U' unless defined $spare;
        print "$sn.value $spare\n";
    }
}