| 1 |
package Core::Zpool; |
|---|
| 2 |
|
|---|
| 3 |
use strict; |
|---|
| 4 |
use warnings; |
|---|
| 5 |
|
|---|
| 6 |
use base 'Resmon::Module'; |
|---|
| 7 |
|
|---|
| 8 |
use Resmon::ExtComm qw(run_command cache_command); |
|---|
| 9 |
|
|---|
| 10 |
=pod |
|---|
| 11 |
|
|---|
| 12 |
=head1 NAME |
|---|
| 13 |
|
|---|
| 14 |
Core::Zpool - monitor zfs zpool health |
|---|
| 15 |
|
|---|
| 16 |
=head1 SYNOPSIS |
|---|
| 17 |
|
|---|
| 18 |
Core::Zpool { |
|---|
| 19 |
zpools: noop |
|---|
| 20 |
} |
|---|
| 21 |
|
|---|
| 22 |
Core::Zpool { |
|---|
| 23 |
zpools: zpool_path = '/sbin/zpool' |
|---|
| 24 |
} |
|---|
| 25 |
|
|---|
| 26 |
=head1 DESCRIPTION |
|---|
| 27 |
|
|---|
| 28 |
This module checks the status of ZFS pools, reporting any read/write/checksum |
|---|
| 29 |
errors, as well as the status of the pools as a whole. |
|---|
| 30 |
|
|---|
| 31 |
=head1 CONFIGURATION |
|---|
| 32 |
|
|---|
| 33 |
=over |
|---|
| 34 |
|
|---|
| 35 |
=item check_name |
|---|
| 36 |
|
|---|
| 37 |
The check name is descriptive only in this check. It is not used for anything. |
|---|
| 38 |
|
|---|
| 39 |
=item zpool_path |
|---|
| 40 |
|
|---|
| 41 |
Specify an alternative location for the zpool command. Default: /sbin/zpool. |
|---|
| 42 |
|
|---|
| 43 |
=back |
|---|
| 44 |
|
|---|
| 45 |
=head1 METRICS |
|---|
| 46 |
|
|---|
| 47 |
A set of metrics is returned for each pool on the system, with the name of the |
|---|
| 48 |
pool being used as a prefix. For example, if you have rpool and data pools, |
|---|
| 49 |
then you will end up with both rpool_state and data_state (as well as the |
|---|
| 50 |
rest of the metrics for each pool). |
|---|
| 51 |
|
|---|
| 52 |
=over |
|---|
| 53 |
|
|---|
| 54 |
=item poolname_state |
|---|
| 55 |
|
|---|
| 56 |
The state of the pool as a string. Examples: ONLINE, FAULTED, DEGRADED. |
|---|
| 57 |
|
|---|
| 58 |
=item poolname_errors_read |
|---|
| 59 |
|
|---|
| 60 |
A count of read errors in the pool as a whole. This is the sum of the errors |
|---|
| 61 |
for all devices in the pool. |
|---|
| 62 |
|
|---|
| 63 |
=item poolname_errors_write |
|---|
| 64 |
|
|---|
| 65 |
A count of write errors in the pool as a whole. This is the sum of the errors |
|---|
| 66 |
for all devices in the pool. |
|---|
| 67 |
|
|---|
| 68 |
=item poolname_errors_cksum |
|---|
| 69 |
|
|---|
| 70 |
A count of checksum errors in the pool as a whole. This is the sum of the |
|---|
| 71 |
errors for all devices in the pool. |
|---|
| 72 |
|
|---|
| 73 |
=item poolname_device_errors |
|---|
| 74 |
|
|---|
| 75 |
A list of devices that have errors, along with the error count. For example: |
|---|
| 76 |
|
|---|
| 77 |
c0t0d0 3R 2W 1C, c1t0d0 100W |
|---|
| 78 |
|
|---|
| 79 |
=back |
|---|
| 80 |
|
|---|
| 81 |
=cut |
|---|
| 82 |
|
|---|
| 83 |
# Expand an error count that zpool printed with a magnitude suffix.
#
# Arguments:
#   $count - numeric part of the count as captured from the status line
#   $unit  - optional suffix captured after the number: 'K', 'M' or 'G'
#
# Returns $count multiplied by the decimal multiplier for $unit (K = 1e3,
# M = 1e6, G = 1e9). When $unit is false, or is not one of the known
# suffixes, $count is returned unchanged.
sub convert_units {
    my ($self, $count, $unit) = @_;
    my %multiplier_for = (
        'G' => 1000000000,
        'M' => 1000000,
        'K' => 1000,
    );
    # Only scale for a recognised suffix. Multiplying by a missing hash entry
    # would raise an "uninitialized value" warning and silently turn the
    # count into 0, hiding real errors.
    if ($unit && exists $multiplier_for{$unit}) {
        $count = $count * $multiplier_for{$unit};
    }
    return $count;
}
|---|
| 95 |
|
|---|
| 96 |
# Collect per-pool health metrics by parsing the output of "zpool status".
#
# Configuration is read from $self->{config}. The zpool binary is taken from
# the documented zpool_path option, then from the older zpool_command key
# (kept for backward compatibility), and finally defaults to /sbin/zpool.
#
# Returns a hash reference keyed by "<pool>_state", "<pool>_errors_read",
# "<pool>_errors_write", "<pool>_errors_cksum" and "<pool>_device_errors".
# Each value is a two-element array reference [value, type], where type is
# "s" for the state and device list and "i" for the error counts.
sub handler {
    my $self = shift;
    my $config = $self->{config}; # All configuration is in here
    my $zpool_command = $config->{zpool_path}
        || $config->{zpool_command}
        || '/sbin/zpool';

    my $pool = "";
    my $pool_status = {};
    my $status = {};
    my $have_pending = 0; # true while a parsed pool has not been stored yet

    # Copy the accumulated figures for the current pool into $status. Does
    # nothing when there is no pool waiting to be stored.
    my $store_pool = sub {
        return unless $have_pending;
        $status->{"${pool}_state"} = [$pool_status->{state}, "s"];
        $status->{"${pool}_errors_read"} = [$pool_status->{r}, "i"];
        $status->{"${pool}_errors_write"} = [$pool_status->{w}, "i"];
        $status->{"${pool}_errors_cksum"} = [$pool_status->{c}, "i"];
        $status->{"${pool}_device_errors"} =
            [join(', ', @{$pool_status->{deverrs}}), "s"];
        $have_pending = 0;
        return;
    };

    # A device/vdev row: NAME STATE READ WRITE CKSUM, where each count may
    # carry a K/M/G suffix. Anchored at the start of the line and using \S+
    # for the name so that names containing '-', '_', '/' or upper-case
    # characters (e.g. mirror-0) are captured whole rather than truncated.
    my $device_re = qr{
        ^ \s* (\S+)
        \s+ ([A-Z]+)
        \s+ ([\d.]+) ([KMG])?
        \s+ ([\d.]+) ([KMG])?
        \s+ ([\d.]+) ([KMG])?
    }x;

    my $output = run_command("$zpool_command status");
    $output = '' unless defined $output; # treat a failed command as no output

    foreach my $line (split(/\n/, $output)) {
        if ($line =~ /^\s*pool: (.+)$/) {
            # Start of a new pool. If the previous pool never produced an
            # "errors:" line, store what was gathered so it is not lost.
            my $name = $1;
            $store_pool->();
            $pool = $name;
            $pool_status = {
                'state'   => '',
                'r'       => 0,
                'w'       => 0,
                'c'       => 0,
                'deverrs' => []
            };
            $have_pending = 1;
        }
        elsif ($line =~ /^\s*errors: (.+)$/) {
            # This line marks the end of a pool in zpool status. Store the
            # status for a pool.
            $store_pool->();
        }
        elsif ($line =~ /^\s*state: (.+)$/) {
            # Pool state
            $pool_status->{state} = $1;
        }
        elsif ($line =~ $device_re) {
            # A device status line. Copy the captures straight away so later
            # calls cannot disturb them.
            my $device = $1;
            my @columns = (
                [ 'r', 'R', $3, $4 ],
                [ 'w', 'W', $5, $6 ],
                [ 'c', 'C', $7, $8 ],
            );
            my @errs;
            foreach my $column (@columns) {
                my ($key, $tag, $raw, $unit) = @{$column};
                next if $raw == 0;
                my $count = $self->convert_units($raw, $unit);
                $pool_status->{$key} += $count;
                push(@errs, "${count}${tag}");
            }
            if (scalar(@errs)) {
                push(@{$pool_status->{deverrs}}, join(" ", $device, @errs));
            }
        }
    }

    # Store the last pool if its "errors:" line was missing.
    $store_pool->();

    return $status;
}
|---|
| 158 |
|
|---|
| 159 |
1; |
|---|