1 |
package Core::Zpool; |
---|
2 |
|
---|
3 |
use strict; |
---|
4 |
use warnings; |
---|
5 |
|
---|
6 |
use base 'Resmon::Module'; |
---|
7 |
|
---|
8 |
use Resmon::ExtComm qw(run_command cache_command); |
---|
9 |
|
---|
10 |
=pod |
---|
11 |
|
---|
12 |
=head1 NAME |
---|
13 |
|
---|
14 |
Core::Zpool - monitor zfs zpool health |
---|
15 |
|
---|
16 |
=head1 SYNOPSIS |
---|
17 |
|
---|
18 |
Core::Zpool { |
---|
19 |
zpools: noop |
---|
20 |
} |
---|
21 |
|
---|
22 |
Core::Zpool { |
---|
23 |
zpools: zpool_path = '/sbin/zpool' |
---|
24 |
} |
---|
25 |
|
---|
26 |
=head1 DESCRIPTION |
---|
27 |
|
---|
28 |
This module checks the status of ZFS pools, reporting any read/write/checksum |
---|
29 |
errors, as well as the status of the pools as a whole. |
---|
30 |
|
---|
31 |
=head1 CONFIGURATION |
---|
32 |
|
---|
33 |
=over |
---|
34 |
|
---|
35 |
=item check_name |
---|
36 |
|
---|
37 |
The check name is descriptive only in this check. It is not used for anything. |
---|
38 |
|
---|
39 |
=item zpool_path |
---|
40 |
|
---|
41 |
Specify an alternative location for the zpool command. Default: /sbin/zpool. |
---|
42 |
|
---|
43 |
=back |
---|
44 |
|
---|
45 |
=head1 METRICS |
---|
46 |
|
---|
47 |
A set of metrics is returned for each pool on the system, with the name of the |
---|
48 |
pool being used as a prefix. For example, if you have rpool and data pools, |
---|
49 |
then you will end up with both rpool_state and data_state (as well as the |
---|
50 |
rest of the metrics for each pool). |
---|
51 |
|
---|
52 |
=over |
---|
53 |
|
---|
54 |
=item poolname_state |
---|
55 |
|
---|
56 |
The state of the pool as a string. Examples: ONLINE, FAULTED, DEGRADED. |
---|
57 |
|
---|
58 |
=item poolname_errors_read |
---|
59 |
|
---|
60 |
A count of read errors in the pool as a whole. This is the sum of the errors |
---|
61 |
for all devices in the pool. |
---|
62 |
|
---|
63 |
=item poolname_errors_write |
---|
64 |
|
---|
65 |
A count of write errors in the pool as a whole. This is the sum of the errors |
---|
66 |
for all devices in the pool. |
---|
67 |
|
---|
68 |
=item poolname_errors_cksum |
---|
69 |
|
---|
70 |
A count of checksum errors in the pool as a whole. This is the sum of the |
---|
71 |
errors for all devices in the pool. |
---|
72 |
|
---|
73 |
=item poolname_device_errors |
---|
74 |
|
---|
75 |
A list of devices that have errors, along with the error count. For example: |
---|
76 |
|
---|
77 |
c0t0d0 3R 2W 1C, c1t0d0 100W |
---|
78 |
|
---|
79 |
=back |
---|
80 |
|
---|
81 |
=cut |
---|
82 |
|
---|
83 |
# Expand an abbreviated count (a number plus an optional magnitude suffix)
# into a plain number.
#
# Arguments:
#   $count - the numeric part of the value (may be fractional, e.g. "1.2")
#   $unit  - optional suffix: K, M, G, T, P or E; undef/empty means "no suffix"
#
# Returns $count multiplied by the decimal multiplier for $unit. When no
# suffix is given, or the suffix is not one we know, $count is returned
# unscaled: reporting the raw figure is safer than silently turning a
# non-zero error count into 0.
#
# NOTE(review): multipliers are powers of 1000, as in the original code;
# zpool may actually print binary (1024-based) units - confirm before relying
# on the exact magnitude.
sub convert_units {
    my ($self, $count, $unit) = @_;
    my %units = (
        'K' => 1_000,
        'M' => 1_000_000,
        'G' => 1_000_000_000,
        'T' => 1_000_000_000_000,
        'P' => 1_000_000_000_000_000,
        'E' => 1_000_000_000_000_000_000,
    );
    if (defined $unit && exists $units{$unit}) {
        $count = $count * $units{$unit};
    }
    return $count;
}
---|
95 |
|
---|
96 |
# Check entry point: run "zpool status" and turn its output into metrics.
#
# Configuration (read from $self->{config}):
#   zpool_path    - location of the zpool binary (the documented setting)
#   zpool_command - older key for the same thing, still honoured
# If neither is set, /sbin/zpool is used.
#
# Returns a hashref mapping metric names to [value, type] pairs, where type
# is "s" for strings and "i" for integers. For every pool the following
# keys are produced, each prefixed with the pool name:
#   <pool>_state, <pool>_errors_read, <pool>_errors_write,
#   <pool>_errors_cksum, <pool>_device_errors
sub handler {
    my $self = shift;
    my $config = $self->{config}; # All configuration is in here
    # The POD documents 'zpool_path'; 'zpool_command' is kept as a fallback
    # so existing configurations continue to work.
    my $zpool_command = $config->{zpool_path}
        || $config->{zpool_command}
        || '/sbin/zpool';

    my $pool = "";
    my $pool_status = {};
    my $status = {};
    my $output = run_command("$zpool_command status");
    # Treat a missing result as "no output" so split() does not warn.
    $output = '' unless defined $output;

    foreach my $line (split(/\n/, $output)) {
        if ($line =~ /pool: (.+)$/) {
            # Start of a new pool: reset the per-pool accumulators.
            $pool = $1;
            $pool_status = {
                'state'   => '',
                'r'       => 0,
                'w'       => 0,
                'c'       => 0,
                'deverrs' => [],
            };
        }
        elsif ($line =~ /errors: (.+)$/) {
            # This line marks the end of a pool in zpool status. Store the
            # status for a pool.
            $status->{"${pool}_state"} = [$pool_status->{state}, "s"];
            $status->{"${pool}_errors_read"} = [$pool_status->{r}, "i"];
            $status->{"${pool}_errors_write"} = [$pool_status->{w}, "i"];
            $status->{"${pool}_errors_cksum"} = [$pool_status->{c}, "i"];
            $status->{"${pool}_device_errors"} =
                [join(', ', @{$pool_status->{deverrs}}), "s"];
        }
        elsif ($line =~ /state: (.+)$/) {
            # Pool state
            $pool_status->{state} = $1;
        }
        elsif ($line =~ /^\s*(\S+)\s+([A-Z]+)\s+([\d.]+)([KMG])?\s+([\d.]+)([KMG])?\s+([\d.]+)([KMG])?/) {
            # A device status line: NAME STATE READ WRITE CKSUM. The name is
            # the whole first token on the line, so names containing
            # dashes, capitals or slashes are reported in full.
            my $device = $1;
            # Copy the captures straight away: [total key, tag, value, unit].
            my @columns = (
                ['r', 'R', $3, $4],
                ['w', 'W', $5, $6],
                ['c', 'C', $7, $8],
            );
            my @errs;
            foreach my $column (@columns) {
                my ($key, $tag, $raw, $unit) = @{$column};
                next if $raw == 0;
                my $count = $self->convert_units($raw, $unit);
                $pool_status->{$key} += $count;
                push(@errs, "${count}${tag}");
            }
            # Only devices with at least one non-zero counter are listed.
            if (scalar(@errs)) {
                push(@{$pool_status->{deverrs}}, join(" ", $device, @errs));
            }
        }
    }

    return $status;
}
---|
158 |
|
---|
159 |
1; |
---|