Projet

Général

Profil

Paste
Télécharger au format
Statistiques
| Branche: | Révision:

root / plugins / disk / nvme @ 16d38264

Historique | Voir | Annoter | Télécharger (8,94 ko)

1
#! /usr/bin/perl
2
# -*- mode: perl; perl-indent-level: 4 -*-
3

    
4
=head1 NAME
5

    
6
nvme - Munin plugin to monitor the use of NVMe devices
7

    
8
=head1 APPLICABLE SYSTEMS
9

    
10
Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
11
bus).
12

    
13
=head1 CONFIGURATION
14

    
15
The plugin uses nvme(1) from the nvme-cli project to read status from
16
the NVMe devices.  This requires root access.
17

    
18
  [nvme]
19
    user root
20

    
21
When setting alert levels per device, use graph and basename of device
22
name, e.g., 'nvme0n1', to make environment variable:
23

    
24
    env.nvme_usage_nvme0n1_warning 5:
25
    env.nvme_usage_warning 8:
26

    
27
If your device names change on reboot you can also use the labels
28
(based on serial numbers) to set the warning and critical levels:
29

    
30
    env.nvme_usage_SN_1234567_warning 8:101
31
    env.nvme_usage_SN_1234567_critical 5:101
32

    
33
=head1 INTERPRETATION
34

    
35
This is a multigraph plugin which makes four graphs.
36

    
37
=head2 nvme_usage
38

    
39
This reports how much of capacity is allocated in each NVMe
40
"namespace".  The report is in percent.  This number may not have much
41
relation to actual use, e.g., if deleted data areas have not been
42
trimmed/discarded.
43

    
44
Default warning and critical: '95', '98'
45

    
46
=head2 nvme_bytes
47

    
48
This reports read and write activity on each NVMe device, in bytes per
49
second.  Ideally there should be much more read than write.  If they
50
are symmetrical, you are using your NVMe as a very expensive FIFO, and
51
if you write more than you read, you should probably look for archival
52
storage instead.
53

    
54
It is a good idea to compare these numbers to I/O counters from
55
diskstats.  If they are much higher, look into whether the write
56
amplification can be due to suboptimal I/O request sizes.
57

    
58
This graph does not support alerting.
59

    
60
=head2 nvme_writecycles
61

    
62
This graph is intended to give an indication of how much life there
63
is left in your NVMe.  It calculates the number of bytes written
64
during each device's lifetime against the capacity of the device,
65
thereby getting the average number of write cycles each cell has
66
experienced.
67

    
68
A prosumer NVMe will handle a few thousand writes to each cell before
69
the error rate gets out of hand.
70

    
71
No default values for warning and critical.
72

    
73
=head2 nvme_spare
74

    
75
All NVMe devices have set aside reserve space to remap media errors.  This
76
graphs how much is left in percent, taken directly from smart-log
77
output.
78

    
79
Default warning and critical: '10:', '3:'
80

    
81
=head1 MAGIC MARKERS
82

    
83
  #%# family=auto
84
  #%# capabilities=autoconf
85

    
86
=head1 BUGS
87

    
88
None known.
89

    
90
=head1 VERSION
91

    
92
  1.1
93

    
94
=head1 AUTHOR
95

    
96
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
97

    
98
=head1 LICENSE
99

    
100
GPLv2
101

    
102
=cut
103

    
104
use strict;
105
use Munin::Plugin;
106
use IPC::Cmd qw(can_run);
107
use File::Basename;
108

    
109
# Check that multigraph is supported
110
need_multigraph();
111

    
112
# Explain why the plugin cannot work on this host.
#
# Returns nothing (undef in scalar context) when the nvme(1) program can
# be run.  Otherwise returns a short explanation string:
#   "missing nvme(1)"  if a line in /proc/modules starts with "nvme"
#                      followed by a character that is not a lowercase letter
#   "missing nvme"     in every other case
sub autoconf_problem {
    unless (can_run('nvme')) {
        # /proc/modules may not exist (non-Linux); a failed open is
        # simply skipped and the vague message below is used.
        if (open(my $modules_fh, '<', '/proc/modules')) {
            while (my $entry = <$modules_fh>) {
                if ($entry =~ /^nvme[^a-z]/) {
                    return "missing nvme(1)";
                }
            }
            close($modules_fh);
        }
        return "missing nvme";    # vague message for non-Linux
    }
    return;
}
123

    
124
# Run "nvme <args...>" and return the lines it wrote to standard output.
#
# Arguments: the sub-command and its arguments, passed to nvme(1) as a
#            list (no shell is involved).
# Returns:   the output lines (newlines included); an empty list when
#            nvme(1) cannot be found or the pipe cannot be opened.
#
# A warning is printed when the command exits with a non-zero status
# while the effective user id is not 0.
sub run_nvme {
    my (@cmd) = @_;
    my @output;
    if (can_run('nvme')) {
        if (open(my $pipe, '-|', 'nvme', @cmd)) {
            @output = <$pipe>;
            close($pipe);    # sets $? to the exit status of nvme
            if ($? && $> != 0) {
                warn "nvme: probably needs to run as user root\n";
            }
        }
    }
    return @output;
}
134

    
135
# Convert a human-readable size such as "695.50  GB", "2.00  TB" or
# "512   B" into a whole number of bytes.
#
# Argument: a string containing a decimal number followed by a unit
#           (B, kB, MB, GB, TB or PB).  Units are decimal multiples
#           (1 kB = 1000 B).
# Returns:  the size in bytes, rounded to the nearest integer; 0 when no
#           size can be recognised in the string.  Always returns a
#           single scalar, so it is safe to call inside a list or hash
#           constructor.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000,    # I wish I had need for this
    );

    # Only use the captures when the match succeeded; otherwise $1/$2
    # would hold whatever an earlier, unrelated match left behind.
    return 0 unless defined $str;
    return 0 unless $str =~ /(\d+(?:\.\d+)?)\s*([kMGTP]?B)\b/;
    my ($number, $unit) = ($1, $2);

    # Round rather than truncate, so that a product which lands just
    # below a whole number due to floating point representation does
    # not lose a byte.
    return int($number * $units{$unit} + 0.5);
}
147

    
148
# Parse the output of "nvme list" into a table of namespaces.
#
# Returns a hash reference keyed by "SN_<serial number>".  Each value is
# a hash reference with the keys device, sn, model, namespace, usage and
# capacity; usage and capacity are converted with human_to_bytes().  A
# later line carrying the same serial number replaces the earlier entry.
#
# A warning is printed when there was output but it was not recognised:
# either the header line was never seen, or a line after the second one
# could not be parsed as a device.
sub nvme_list {
    # Node             SN                   Model                                    Namespace Usage                      Format           FW Rev
    # ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
    # /dev/nvme1n1     S464NB0K601188N      Samsung SSD 970 EVO 2TB                  1         695.50  GB /   2.00  TB    512   B +  0 B   1B2QEXE7
    my %found;

    my $header_seen;
    my $line_count = 0;
    for my $line (run_nvme('list')) {
        $line_count++;

        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            $header_seen++;
            next;
        }

        if ($line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            # Copy the captures before calling anything else.
            my ($node, $serial, $model, $namespace, $used, $size)
                = ($1, $2, $3, $4, $5, $6);
            $found{'SN_'.$serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $namespace,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($size),
            };
            next;
        }

        # The first two lines may be header and separator; anything
        # unparseable after them means the format is not understood.
        if ($line_count > 2) {
            # could not parse device information
            $header_seen = 0;
        }
    }

    warn "Could not recognise output from 'nvme list', please report\n"
        if $line_count && !$header_seen;

    return \%found;
}
179

    
180
# Read "nvme smart-log <device>" into a hash.
#
# Argument: the device to query, passed on to nvme(1).
# Returns:  a hash reference.  Each "name : value" output line becomes
#           one entry; the name is lowercased with every whitespace
#           character replaced by "_".  A value consisting solely of
#           digits with thousands separators ("1,234,567") has the
#           commas removed; every other value is stored as printed.
sub smart_log {
    my ($dev) = @_;
    my %fields;
    for my $line (run_nvme('smart-log', $dev)) {
        # Skip the title line of the report.
        next if $line =~ /^Smart Log/;
        next unless $line =~ /(.*?)\s+:\s+(.*)/;

        my $name = $1;
        my $text = $2;
        $name =~ s/\s/_/g;
        $text =~ s/,//g if $text =~ /^\d+(,\d\d\d)+$/;
        $fields{lc $name} = $text;
    }
    return \%fields;
}
196

    
197
# Print the ".warning" and ".critical" config lines for one field.
#
# Arguments:
#   $label         field name (used both in the output and in the
#                  label-based variable names)
#   $graph         graph name, e.g. 'nvme_usage'
#   $device        device path; only its basename is used
#   $warn_default  warning level used when nothing is configured
#   $crit_default  critical level used when nothing is configured
#
# Two lookups are chained through get_thresholds(): the label-based
# names "<graph>_<label>_warning/critical" are resolved first with the
# supplied defaults, and their result serves as the default for the
# device-based names "<graph>_<basename>_warning/critical".
# NOTE(review): the exact lookup rules are those of
# Munin::Plugin::get_thresholds, which is not visible here.
#
# A line is only printed when the resulting level is defined.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    # First pass: thresholds configured by label.
    my ($label_warning, $label_critical) = get_thresholds(
        $graph,
        "${graph}_${label}_warning",
        "${graph}_${label}_critical",
        $warn_default,
        $crit_default,
    );

    # Second pass: thresholds configured by device basename, falling
    # back to the result of the first pass.
    my $short_name = basename($device);
    my ($warning, $critical) = get_thresholds(
        $graph,
        "${graph}_${short_name}_warning",
        "${graph}_${short_name}_critical",
        $label_warning,
        $label_critical,
    );

    if (defined $warning) {
        print "${label}.warning $warning\n";
    }
    if (defined $critical) {
        print "${label}.critical $critical\n";
    }
}
207

    
208
use Data::Dumper;
209

    
210
# Main program.  The first command-line argument selects the mode:
#   autoconf  print "yes" if at least one device was listed, otherwise
#             "no (<reason>)"; always exits 0
#   config    print the definitions of the four graphs
#   (other)   print the current values for the four graphs
my $mode = ($ARGV[0] or "print");

my $problem = autoconf_problem();
my $list    = nvme_list();

if ($mode eq 'autoconf') {
    if (keys %{$list}) {
        print "yes\n";
    } else {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    exit 0;
}

my @sn = sort keys %{$list};

# Reduce a raw smart-log value to a plain non-negative number.  Returns
# nothing (undef in scalar context) when the value is missing or is not
# purely numeric, so that callers can report it to Munin as unknown
# instead of printing text Munin cannot parse.
my $as_number = sub {
    my ($raw) = @_;
    return unless defined $raw;
    (my $trimmed = $raw) =~ s/^\s+|\s+$//g;
    return unless $trimmed =~ /^\d+(?:\.\d+)?$/;
    return $trimmed;
};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # The nvme_bytes graph has two fields per device (read and written),
    # so its ordering must name those fields rather than the bare labels.
    my $bytes_order = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The graph headers use interpolating here-documents so that the
    # field order is really substituted.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    # \${graph_period} is escaped: it must reach Munin literally, which
    # substitutes it itself.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $bytes_order
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};

        # The smart-log counters are in units of 1000 blocks of 512
        # bytes; the cdef scales them to bytes when graphing, leaving
        # the stored counter values as they were.
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_r.cdef ${sn}_r,512000,*
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
${sn}_w.cdef ${sn}_w,512000,*
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    # Fetch mode: query the smart log once per device, then report.
    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};

        # A zero or missing capacity cannot be turned into a percentage;
        # report the value as unknown ("U") instead of dividing by zero.
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $info    = $list->{$sn};
        my $r_units = $as_number->($info->{smart}->{data_units_read});
        my $w_units = $as_number->($info->{smart}->{data_units_written});
        $r_units = 'U' unless defined $r_units;
        $w_units = 'U' unless defined $w_units;
        print "${sn}_r.value $r_units\n";
        print "${sn}_w.value $w_units\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info    = $list->{$sn};
        my $written = $as_number->($info->{smart}->{data_units_written});

        # The unit size reported is 1000 blocks.
        my $cycles = (defined $written && $info->{capacity})
            ? $written * 512_000 / $info->{capacity}
            : 'U';
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};

        # The smart log prints the spare capacity with a trailing
        # percent sign, which is removed before reporting.
        my $spare = $info->{smart}->{available_spare};
        $spare =~ s/%// if defined $spare;
        $spare = $as_number->($spare);
        $spare = 'U' unless defined $spare;
        print "$sn.value $spare\n";
    }
}