Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / disk / nvme @ 842acaef

Historique | Voir | Annoter | Télécharger (8,54 ko)

1
#! /usr/bin/perl
2
# -*- mode: perl; perl-indent-level: 4 -*-
3

    
4
=head1 NAME
5

    
6
nvme - Munin plugin to monitor the use of NVMe devices
7

    
8
=head1 APPLICABLE SYSTEMS
9

    
10
Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
11
bus).
12

    
13
=head1 CONFIGURATION
14

    
15
The plugin uses nvme(1) from the nvme-cli project to read status from
16
the NVMe devices.  This requires root access.
17

    
18
  [nvme]
19
    user root
20

    
21
When setting alert levels per device, use graph and basename of device
22
name, e.g., 'nvme0n1', to make environment variable:
23

    
24
    env.nvme_usage_nvme0n1_warning 5:
25
    env.nvme_usage_warning 8:
26

    
27
=head1 INTERPRETATION
28

    
29
This is a multigraph plugin which makes four graphs.
30

    
31
=head2 nvme_usage
32

    
33
This reports how much of the capacity is allocated in each NVMe
34
"namespace".  The report is in percent.  This number may not have much
35
relation to actual use, e.g., if deleted data areas have not been
36
trimmed/discarded.
37

    
38
Default warning and critical: '95', '98'
39

    
40
=head2 nvme_bytes
41

    
42
This reports read and write activity on each NVMe device, in bytes per
43
second.  Ideally there should be much more read than write.  If they
44
are symmetrical, you are using your NVMe as a very expensive FIFO, and
45
if you write more than you read, you should probably look for archival
46
storage instead.
47

    
48
It is a good idea to compare these numbers to I/O counters from
49
diskstats.  If they are much higher, look into whether the write
50
amplification can be due to suboptimal I/O request sizes.
51

    
52
This graph does not support alerting.
53

    
54
=head2 nvme_writecycles
55

    
56
This graph is intended to give an indication of how much life there
57
is left in your NVMe.  It calculates the number of bytes written
58
during each device's lifetime against the capacity of the device,
59
thereby getting an average number of write cycles each cell has
60
experienced.
61

    
62
A prosumer NVMe will handle a few thousand writes to each cell before
63
the error rate gets out of hand.
64

    
65
No default values for warning and critical.
66

    
67
=head2 nvme_spare
68

    
69
All NVMe devices set aside reserve space to remap media errors.  This
70
graphs how much is left in percent, taken directly from smart-log
71
output.
72

    
73
Default warning and critical: '10:', '3:'
74

    
75
=head1 MAGIC MARKERS
76

    
77
  #%# family=auto
78
  #%# capabilities=autoconf
79

    
80
=head1 BUGS
81

    
82
None known.
83

    
84
=head1 VERSION
85

    
86
  1.1
87

    
88
=head1 AUTHOR
89

    
90
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
91

    
92
=head1 LICENSE
93

    
94
GPLv2
95

    
96
=cut
97

    
98
use strict;
99
use Munin::Plugin;
100
use IPC::Cmd qw(can_run);
101
use File::Basename;
102

    
103
# Check that multigraph is supported
104
# NOTE(review): need_multigraph() is imported from Munin::Plugin; it
# presumably stops the plugin early when the node cannot handle
# multigraph output -- confirm against the Munin::Plugin documentation.
need_multigraph();
105

    
106
# Explain why the plugin cannot collect data on this host.
#
# Returns nothing (undef in scalar context) when can_run() finds the
# nvme executable.  Otherwise returns a short explanation:
#   "missing nvme(1)" - /proc/modules has a line starting with "nvme"
#                       followed by a non-lowercase-letter character, so
#                       only the command-line tool is missing;
#   "missing nvme"    - no such line was found, or /proc/modules could
#                       not be opened.
sub autoconf_problem {
    return if can_run('nvme');

    # Three-argument open keeps the mode explicit and separate from the path.
    if (open(my $mods, '<', '/proc/modules')) {
        while (my $line = <$mods>) {
            # "[^a-z]" accepts both "nvme " and "nvme_core ..." while
            # rejecting unrelated modules such as "nvmet".
            if ($line =~ /^nvme[^a-z]/) {
                close($mods);
                return "missing nvme(1)";
            }
        }
        close($mods);
    }
    return "missing nvme";    # vague message for non-Linux
}
117

    
118
# Run "nvme" with the given arguments and hand back its output lines.
#
# The command is started without a shell (list-form pipe open).  When the
# executable is not found by can_run(), or the pipe cannot be opened, the
# result is an empty list.  A warning is printed when the command exits
# with a non-zero status while the effective user id is not 0.
sub run_nvme {
    my @args = @_;
    my @output;

    my $pipe;
    my $started = can_run('nvme') && open($pipe, '-|', 'nvme', @args);
    if ($started) {
        @output = <$pipe>;
        close($pipe);    # sets $? to the exit status of nvme
        if ($? && $> != 0) {
            warn "nvme: probably needs to run as user root\n";
        }
    }

    return @output;
}
128

    
129
# Convert a size such as "695.50  GB" or "512   B" into a whole number
# of bytes.
#
# The number and the unit must be separated by whitespace.  Units are
# decimal (powers of 1000): B, kB, MB, GB, TB and PB.  The result is
# truncated to an integer.
#
# Returns 0 (after printing a warning) when the string is undefined or
# contains no recognisable size.  The match result is checked explicitly:
# an unchecked failed match would leave $1/$3 holding the captures of the
# caller's last successful match and produce a bogus size.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000,    # I wish I had need for this
    );

    if (defined $str && $str =~ /(\d+(?:\.\d+)?)\s+([kMGTP]?B)/) {
        my ($number, $unit) = ($1, $2);
        return int($number * $units{$unit}) if exists $units{$unit};
    }

    warn sprintf("nvme: cannot parse size '%s'\n", defined $str ? $str : '');
    return 0;
}
141

    
142
# Parse the output of "nvme list" into a hash reference.
#
# Each key is "SN_" followed by the serial number; each value is a hash
# with the keys device, sn, model, namespace, usage and capacity (the last
# two converted by human_to_bytes()).  A warning is printed when output
# was produced but its layout was not recognised.
sub nvme_list {
    # Expected layout:
    # Node             SN                   Model                                    Namespace Usage                      Format           FW Rev
    # ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
    # /dev/nvme1n1     S464NB0K601188N      Samsung SSD 970 EVO 2TB                  1         695.50  GB /   2.00  TB    512   B +  0 B   1B2QEXE7
    my %by_serial;

    my $layout_ok;         # true once the header is seen, reset by a bad line
    my $line_count = 0;

    for my $line (run_nvme('list')) {
        $line_count++;

        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            $layout_ok++;
            next;
        }

        if ($line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            my ($node, $serial, $model, $namespace, $used, $total)
                = ($1, $2, $3, $4, $5, $6);
            $by_serial{'SN_'.$serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $namespace,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($total),
            };
            next;
        }

        # The first two lines may be the header and its underline; any
        # later line that is not a device row means the parse failed.
        $layout_ok = 0 if $line_count > 2;
    }

    if ($line_count && !$layout_ok) {
        warn "Could not recognise output from 'nvme list', please report\n";
    }
    return \%by_serial;
}
173

    
174
# Read "nvme smart-log <device>" and return a hash reference of its
# "name : value" lines.
#
# Names are lower-cased with every whitespace character replaced by an
# underscore.  Values made up purely of digit groups with thousands
# separators ("1,234,567") have the commas removed; all other values are
# stored as printed.
sub smart_log {
    my ($dev) = @_;
    my %field;

    LINE:
    for my $line (run_nvme('smart-log', $dev)) {
        next LINE if $line =~ /^Smart Log/;
        next LINE unless $line =~ /(.*?)\s+:\s+(.*)/;

        my $name  = $1;
        my $value = $2;
        $name =~ s/\s/_/g;
        $value =~ s/,//g if $value =~ /^\d+(,\d\d\d)+$/;

        $field{lc $name} = $value;
    }

    return \%field;
}
190

    
191
# Print the ".warning" and ".critical" config lines for one field.
#
# $label is the field name, $graph the graph name and $device the device
# path, of which only the basename is used.  get_thresholds() is asked for
# the environment variables "<graph>_<dev>_warning" and
# "<graph>_<dev>_critical", with $graph as its first argument and the
# optional defaults passed through.  Undefined thresholds print nothing.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my $env_prefix = join('_', $graph, basename($device));
    my ($warning, $critical) = get_thresholds(
        $graph,
        "${env_prefix}_warning",
        "${env_prefix}_critical",
        $warn_default,
        $crit_default,
    );

    defined $warning  and print "${label}.warning $warning\n";
    defined $critical and print "${label}.critical $critical\n";
}
199

    
200
use Data::Dumper;
201

    
202
# ----------------------------------------------------------------------
# Main program.
#
# The first command line argument selects the mode:
#   autoconf - print "yes" when at least one device was listed, otherwise
#              "no (<reason>)";
#   config   - print the definition of the four graphs;
#   anything else (or nothing) - print the current values.
# ----------------------------------------------------------------------

# Return the value unchanged when it is a plain non-negative number,
# otherwise 'U', which Munin reads as "unknown".  This keeps missing or
# unparsable smart-log entries from producing malformed value lines.
sub _value_or_unknown {
    my ($value) = @_;
    return (defined $value && $value =~ /^\d+(?:\.\d+)?$/) ? $value : 'U';
}

my $mode = ($ARGV[0] or "print");

my $problem = autoconf_problem();
my $list    = nvme_list();

if ($mode eq 'autoconf') {
    if (keys %{$list}) {
        print "yes\n";
    } else {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    exit 0;
}

my @sn = sort keys %{$list};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # nvme_bytes has two fields per device, so it needs its own order.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The heredocs are double-quoted so that the field lists interpolate.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped: Munin expands it, not Perl.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};

        # The smart-log counters are in data units of 1000 blocks of 512
        # bytes.  The cdef scales them to bytes when graphing, so the
        # collected values themselves stay unchanged.
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_r.cdef ${sn}_r,512000,*
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
${sn}_w.cdef ${sn}_w,512000,*
EOF
    }
    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }
    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};

        # A capacity of 0 means the size could not be parsed.
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart  = $list->{$sn}->{smart};
        my $rbytes = _value_or_unknown($smart->{data_units_read});
        my $wbytes = _value_or_unknown($smart->{data_units_written});
        print "${sn}_r.value $rbytes\n";
        print "${sn}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info    = $list->{$sn};
        my $written = _value_or_unknown($info->{smart}->{data_units_written});

        # The unit size reported is 1000 blocks.
        my $cycles = 'U';
        if ($written ne 'U' && $info->{capacity}) {
            $cycles = $written * 512_000 / $info->{capacity};
        }
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        my $spare = $list->{$sn}->{smart}->{available_spare};

        # smart-log prints the spare capacity as a percentage, e.g. "100%".
        $spare =~ s/%// if defined $spare;
        $spare = _value_or_unknown($spare);
        print "$sn.value $spare\n";
    }
}