root / plugins / disk / raid @ 17f78427
Historique | Voir | Annoter | Télécharger (4,47 ko)
#!/usr/bin/perl -w
#
# (c) 2007 Nathan Rutman nathan@clusterfs.com
#
# Plugin to monitor RAID status
#
# Results are % of healthy drives in a raid device
# and % rebuilt of devices that are resyncing.
#
# Usage:
#   raid autoconf   report whether the plugin can run on this host
#   raid config     print the graph and per-array field definitions
#   raid            print the current values for every array
#
# For every array that has a "[total/active] [UU_]" status line, four
# fields are emitted:
#   <md>          percentage of member devices that are active
#   <md>_rebuild  progress of a running reshape/recovery (100 when none)
#   <md>_check    progress of a running check/resync (100 when none)
#   <md>_failed   number of members flagged "(F)"
#
#%# family=contrib
#%# capabilities=autoconf

use strict;

my $MDSTAT = '/proc/mdstat';

# An absent argument is treated like an unknown one: plain value fetch.
my $mode = (@ARGV and $ARGV[0]) ? $ARGV[0] : '';

if ($mode eq 'autoconf') {
    # Same test as the former `grep md /proc/mdstat`, without a subshell:
    # the file must be readable and contain "md" on at least one line.
    my $has_md = 0;
    if (open(my $probe, '<', $MDSTAT)) {
        while (my $probe_line = <$probe>) {
            if ($probe_line =~ /md/) {
                $has_md = 1;
                last;
            }
        }
        close($probe);
    }

    # Munin expects "yes" or "no (reason)", and exit status 0 in both cases.
    print $has_md ? "yes\n" : "no (no RAID devices)\n";
    exit 0;
}

if ($mode eq 'config') {
    print "graph_title RAID status\n";
    print "graph_category disk\n";
    print "graph_info This graph monitors RAID disk health. Values are percentage of healthy drives in each raid group. Degraded devices are marked Critical.\n";
    print "graph_args --base 1000 -l 0\n";
    print "graph_vlabel % healthy/rebuilt\n";
    print "graph_scale no\n";
}

# The contents of /proc/mdstat may change between reads, so fetch the
# whole file at once and work on the snapshot.
open(my $mdstat, '<', $MDSTAT)
    or die "Cannot open $MDSTAT: $!\n";
my @text = <$mdstat>;
close($mdstat);

# Interestingly, swap is presented as "active (auto-read-only)"
# and mdadm has '--readonly' option to make the array 'active (read-only)'
my $devinfo_re = qr{(md\d+)\s+:\s+active\s+(\(read-only\)\s+|\(auto-read-only\)\s+|)(\w+)\s+(.*)};
my $devstat_re = qr{.*\[(\d+)\/(\d+)]\s+\[(\w+)]};
my $action_re  = qr{.*(reshape|check|resync|recovery)\s*=\s*(\d+\.\d+%|\w+)(.*finish=(.*min))?};

while (@text) {
    my $line = shift @text;

    # Skip until a device header is found.
    # It should look like "md0 : active raid1 sda1[0] sdc1[2] sdb1[1]".
    next unless $line =~ $devinfo_re;
    my ($dev, $ro, $type, $members) = ($1, $2 || '', $3, $4);

    # Failed members are flagged "(F)" after their slot number; count the
    # markers only, so a capital F inside a device name is not a failure.
    my $failed = () = $members =~ /\(F\)/g;

    # The second line should look like "123456 blocks super 1.2 [2/2] [UU]".
    # Arrays without that status (or a truncated file) are not reported.
    $line = shift @text;
    next unless defined $line and $line =~ $devstat_re;
    my ($nmem, $nact) = ($1, $2);

    # An optional third line appears only while the array is busy, e.g.
    # " [==>......]  check = 10.0% (12345/123456) finish=123min speed=12345/sec"
    my ($action, $minute, $proc) = ('idle', '', 0);
    $line = shift @text;
    if (defined $line) {
        if ($line =~ $action_re) {
            $action = $1;
            my $percent = $2;
            $minute = $4 || '';
            if ($percent =~ /(\d+\.\d+)%/) {
                $proc = $1;
            }
            else {
                # 'resync=DELAYED' or 'resync=PENDING'
                $action .= " ($percent)";
                $proc = -1;
            }
        }
        else {
            # Array is idle: this line belongs to whatever comes next,
            # so hand it back for the next iteration.
            unshift(@text, $line);
        }
    }

    if ($mode eq 'config') {
        print "$dev.label $dev\n";
        print "$dev.info $type $ro$members\n";
        # "98:" sets only a lower bound: critical when the value is below 98.
        # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
        print "$dev.critical 98:\n";
        print $dev, "_rebuild.label $dev reshape/recovery\n";
        print $dev, "_rebuild.info $action $minute\n";
        # Because of an unfound bug, sometimes reported as 99.XX even when OS reports 100.
        print $dev, "_rebuild.critical 98:\n";
        print $dev, "_check.label $dev check/resync \n";
        print $dev, "_check.info $action $minute\n";
        print $dev, "_failed.label $dev failed disks \n";
        print $dev, "_failed.info $action $minute\n";
        print $dev, "_failed.critical 0:0\n";
    }
    else {
        # Report unknown ("U") instead of dividing by zero when the status
        # line claims zero member devices.
        my $pct  = $nmem ? 100 * $nact / $nmem : 'U';
        my $rpct = 100;
        my $cpct = 100;
        if ($action =~ /reshape|recovery/) {
            $rpct = $proc;
            $cpct = 0;    # check/resync is not done
        }
        elsif ($action =~ /check|resync/) {
            if ($proc < 0) {
                # array is on DELAYED or PENDING, further info is unknown
                $rpct = 0;
                $cpct = 0;
            }
            else {
                # reshape/recovery was done, $rpct stays at 100
                $cpct = $proc;
            }
        }

        print "$dev.value $pct\n";
        print $dev, "_rebuild.value $rpct\n";
        print $dev, "_check.value $cpct\n";
        print $dev, "_failed.value $failed\n";
    }
}

exit 0;
