| 1 |
#!/usr/bin/perl -w |
|---|
| 2 |
# Remove the following line to disable embedded perl |
|---|
| 3 |
# nagios: +epn |
|---|
| 4 |
|
|---|
| 5 |
=pod |
|---|
| 6 |
|
|---|
| 7 |
=head1 NAME |
|---|
| 8 |
|
|---|
| 9 |
check_resmon_metric - Nagios check to monitor a resmon metric |
|---|
| 10 |
|
|---|
| 11 |
=head1 SYNOPSIS |
|---|
| 12 |
|
|---|
| 13 |
check_resmon_metric -H hostname [-P port] -M module_name -C check_name |
|---|
| 14 |
-m metric_name [ -r regex | [-w warning_threshold] |
|---|
| 15 |
[-c critical_threshold] ] [ -u alternate_url ] |
|---|
| 16 |
|
|---|
| 17 |
=head1 DESCRIPTION |
|---|
| 18 |
|
|---|
| 19 |
This is a nagios check script that will connect to a running resmon instance, |
|---|
| 20 |
fetch information for a single check, and compare a metric against rules |
|---|
| 21 |
provided on the command line, returning the status to nagios based on the |
|---|
| 22 |
result. |
|---|
| 23 |
|
|---|
| 24 |
For numeric metrics, there are options for warning/critical thresholds, and |
|---|
| 25 |
for string metrics, there is an option to match against a regular expression. |
|---|
| 26 |
|
|---|
| 27 |
=head1 OPTIONS |
|---|
| 28 |
|
|---|
| 29 |
=over |
|---|
| 30 |
|
|---|
| 31 |
=item -H hostname |
|---|
| 32 |
|
|---|
| 33 |
The hostname of the resmon instance to connect to. Required. |
|---|
| 34 |
|
|---|
| 35 |
=item -P port |
|---|
| 36 |
|
|---|
| 37 |
The port that resmon is listening on. Defaults to 81. |
|---|
| 38 |
|
|---|
| 39 |
=item -M module_name |
|---|
| 40 |
|
|---|
| 41 |
The module name of the check you wish to fetch. Required. |
|---|
| 42 |
|
|---|
| 43 |
=item -C check_name |
|---|
| 44 |
|
|---|
| 45 |
The check name you wish to fetch. Required. |
|---|
| 46 |
|
|---|
| 47 |
=item -m metric_name |
|---|
| 48 |
|
|---|
| 49 |
The name of the metric you wish to evaluate rules against. Required. |
|---|
| 50 |
|
|---|
| 51 |
=item -w warning_threshold |
|---|
| 52 |
|
|---|
| 53 |
A numeric threshold (see below) to test the metric against. Will return a |
|---|
| 54 |
warning status if the threshold matches. This is only applicable for numeric |
|---|
| 55 |
metrics. An error will be returned for string metrics. Optional. |
|---|
| 56 |
|
|---|
| 57 |
=item -c critical_threshold |
|---|
| 58 |
|
|---|
| 59 |
A numeric threshold (see below) to test the metric against. Will return a |
|---|
| 60 |
critical status if the threshold matches. This is only applicable for numeric |
|---|
| 61 |
metrics. An error will be returned for string metrics. If both the critical |
|---|
| 62 |
and warning thresholds match a given metric, then the status returned is |
|---|
| 63 |
critical. Optional. |
|---|
| 64 |
|
|---|
| 65 |
=item -r regex |
|---|
| 66 |
|
|---|
| 67 |
A regular expression to match the metric against. This is most useful for |
|---|
| 68 |
string based metrics. If the regular expression matches, then an ok status |
|---|
| 69 |
will be returned, otherwise the status will be critical. This cannot be used |
|---|
| 70 |
in conjunction with the warning/critical thresholds. Optional. |
|---|
| 71 |
|
|---|
| 72 |
=item -A age |
|---|
| 73 |
|
|---|
| 74 |
This option will cause a check to be critical if it was updated more than age |
|---|
| 75 |
seconds ago. |
|---|
| 76 |
|
|---|
| 77 |
=item -u url |
|---|
| 78 |
|
|---|
| 79 |
Normally this check will fetch metrics from http://host:port/module/check, but |
|---|
| 80 |
if you are using this to check a different system that exposes metrics in a |
|---|
| 81 |
similar fashion to resmon, you can specify an alternate url here. You only |
|---|
| 82 |
need to specify the part after hostname/port. For example, if you need to hit |
|---|
| 83 |
http://host:port/resmon then pass -u /resmon. |
|---|
| 84 |
|
|---|
| 85 |
=item -a |
|---|
| 86 |
|
|---|
| 87 |
Specify that an absence should be treated as an OK value. By default, if a |
|---|
| 88 |
metric is absent, it is treated as CRITICAL. A metric is considered absent if |
|---|
| 89 |
the check information can be fetched, but the metric itself isn't listed in |
|---|
| 90 |
the XML output. If the check information cannot be fetched, then it is still a |
|---|
| 91 |
critical error. |
|---|
| 92 |
|
|---|
| 93 |
=back |
|---|
| 94 |
|
|---|
| 95 |
=head1 THRESHOLDS |
|---|
| 96 |
|
|---|
| 97 |
The warning and critical thresholds are specified in the same way as for other |
|---|
| 98 |
nagios plugins, as described at |
|---|
| 99 |
http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT |
|---|
| 100 |
|
|---|
| 101 |
Most of the time, you just want to put a number for the threshold. For |
|---|
| 102 |
example, '-w 10 -c 20' would go warning if the value is over 10, and critical |
|---|
| 103 |
if it is over 20. |
|---|
| 104 |
|
|---|
| 105 |
For more complex ranges, use the more generalized format of: [@]start:end |
|---|
| 106 |
|
|---|
| 107 |
=over |
|---|
| 108 |
|
|---|
| 109 |
=item * |
|---|
| 110 |
|
|---|
| 111 |
Start and end are both numbers, and either can be negative |
|---|
| 112 |
|
|---|
| 113 |
=item * |
|---|
| 114 |
|
|---|
| 115 |
If start is missing, then the colon isn't required, and the range is from 0 to |
|---|
| 116 |
end. In other words, if you specify a single number, the range is from 0 to |
|---|
| 117 |
the number you specified. |
|---|
| 118 |
|
|---|
| 119 |
=item * |
|---|
| 120 |
|
|---|
| 121 |
If end is missing (E.g. 10:), then the range is from start to infinity. |
|---|
| 122 |
|
|---|
| 123 |
=item * |
|---|
| 124 |
|
|---|
| 125 |
If start is '~', then the range is from minus infinity to end (as opposed to |
|---|
| 126 |
0-end if start is omitted) |
|---|
| 127 |
|
|---|
| 128 |
=item * |
|---|
| 129 |
|
|---|
| 130 |
An alert is generated if the test value lies outside the specified range. |
|---|
| 131 |
|
|---|
| 132 |
=item * |
|---|
| 133 |
|
|---|
| 134 |
If @ is specified at the beginning, the alerting behavior is flipped. In other |
|---|
| 135 |
words, an alert is generated if the value lies within the range. |
|---|
| 136 |
|
|---|
| 137 |
=back |
|---|
| 138 |
|
|---|
| 139 |
=head2 EXAMPLES |
|---|
| 140 |
|
|---|
| 141 |
The following lists example thresholds, and the conditions under which an |
|---|
| 142 |
alert is generated. |
|---|
| 143 |
|
|---|
| 144 |
=over |
|---|
| 145 |
|
|---|
| 146 |
=item '10' |
|---|
| 147 |
|
|---|
| 148 |
Ok if: 0 <= X <= 10, Alert if: X < 0 or X > 10 |
|---|
| 149 |
|
|---|
| 150 |
=item '10:' |
|---|
| 151 |
|
|---|
| 152 |
Ok if: X >= 10, Alert if: X < 10 |
|---|
| 153 |
|
|---|
| 154 |
=item '~:10' |
|---|
| 155 |
|
|---|
| 156 |
Ok if: X <= 10, Alert if: X > 10 |
|---|
| 157 |
|
|---|
| 158 |
=item '10:20' |
|---|
| 159 |
|
|---|
| 160 |
Ok if: 10 <= X <= 20, Alert if: X < 10 or X > 20 |
|---|
| 161 |
|
|---|
| 162 |
=item '@10:20' |
|---|
| 163 |
|
|---|
| 164 |
Ok if: X < 10 or X > 20, Alert if: 10 <= X <= 20 |
|---|
| 165 |
|
|---|
| 166 |
=back |
|---|
| 167 |
|
|---|
| 168 |
=cut |
|---|
| 169 |
|
|---|
| 170 |
# Work out the bare script name: when $0 contains a directory part (using
# either / or \ as separator), keep only what follows the last separator.
# $PROGNAME is left unset when $0 has no directory component.
use vars qw($PROGNAME);
$PROGNAME = $2
    if $0 =~ m/^(.*?)[\/\\]([^\/\\]+)$/;
|---|
| 174 |
|
|---|
| 175 |
use strict; |
|---|
| 176 |
use warnings; |
|---|
| 177 |
|
|---|
| 178 |
use LWP::UserAgent; |
|---|
| 179 |
use HTTP::Request; |
|---|
| 180 |
use HTTP::Response; |
|---|
| 181 |
use Time::HiRes qw( gettimeofday tv_interval ); |
|---|
| 182 |
use XML::Simple; |
|---|
| 183 |
use Getopt::Long; |
|---|
| 184 |
|
|---|
| 185 |
use utils qw($TIMEOUT %ERRORS &print_revision &support); |
|---|
| 186 |
|
|---|
| 187 |
# Drop environment variables that can change how external commands are
# located or how a shell starts up.
delete @ENV{'PATH', 'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Command-line settings.  Every variable is initialised on its own line so a
# default can never slide onto the wrong variable, which is what happens when
# a single list assignment has fewer values than variables.
my $port         = 81;   # -P: port resmon listens on
my $host;                # -H: host to query
my $module;              # -M: resmon module name
my $check;               # -C: check name within the module
my $age          = 0;    # -A: maximum age in seconds; 0 disables the test
my $warning;             # -w: warning threshold
my $critical;            # -c: critical threshold
my $equals;              # -e: exact value the metric must have
my $regex;               # -r: pattern the metric must match
my $metric;              # -m: metric name; no default
my $url;                 # -u: alternate path; /module/check when unset
my $allow_absent = 0;    # -a: treat a missing metric as OK
|---|
| 192 |
|
|---|
| 193 |
# help()
#
# Print the full option summary to STDOUT and exit with the Nagios UNKNOWN
# status.  Registered as the handler for -h/--help, so any arguments passed
# by the option parser are ignored.  Never returns.
#
# The long name shown for -H is --host, which is the spelling the option
# parser is configured with.
sub help {
    print <<"END_HELP";
Usage: $0 [options]
 -H | --host host to check
 -P | --port port that resmon runs on (default 81)
 -M | --module module to check
 -C | --check name of individual check
 -A | --age how recently should the check have been updated
 -m | --metric metric name to check
 -w | --warning warning threshold (numeric metrics only)
 -c | --critical critical threshold (numeric metrics only)
 -e | --equals metric must equal this value (numerical metrics only)
 -r | --regex regex match against the metric (string metrics only)
 -u | --url specify an alternate URL to fetch
 -a | --allowabsent Treat absences as OK (default: critical)
END_HELP
    exit $ERRORS{'UNKNOWN'};
}
|---|
| 210 |
|
|---|
| 211 |
# short_help()
#
# Print a two-line usage reminder naming the mandatory options, then exit
# with the Nagios UNKNOWN status.  Never returns.
sub short_help {
    my $usage = "Usage: $0 -H host -M module -C check -m metric [options]\n"
              . "run $0 --help for more information\n";
    print $usage;
    exit $ERRORS{'UNKNOWN'};
}
|---|
| 216 |
|
|---|
| 217 |
# check_threshold($value, $warning, $critical)
#
# Evaluate $value against the critical and warning thresholds; either may be
# undef, in which case it is skipped.  The critical threshold is tested first
# so it wins when both would alert.
#
# Returns a two-element list: the status name ("CRITICAL", "WARNING" or "OK")
# and a message describing the comparison(s) that produced it.
sub check_threshold {
    my ($value, $warning, $critical) = @_;
    my $summary = "";

    if (defined $critical) {
        my ($passed, $crit_text) = check_single_threshold($value, $critical);
        return ("CRITICAL", $crit_text) unless $passed;
        $summary = $crit_text;
    }

    if (defined $warning) {
        my ($passed, $warn_text) = check_single_threshold($value, $warning);
        return ("WARNING", $warn_text) unless $passed;
        # Append to the critical message when there is one.
        $summary = $summary ? $summary . " and $warn_text" : $warn_text;
    }

    return ("OK", $summary);
}
|---|
| 240 |
|
|---|
| 241 |
# check_single_threshold($value, $threshold)
#
# Test a numeric $value against one Nagios-style range, "[@]start:end":
#   - "end" alone means 0 .. end
#   - "start:" means start .. +infinity
#   - "~:end" means -infinity .. end
#   - "~:" leaves both ends open, so every value is inside the range
#   - a leading "@" inverts the result (alert when inside the range)
#
# Returns ($state, $message).  $state is 1 when no alert should be raised and
# 0 when one should.  $message describes where the value lies relative to the
# range; it is chosen before the "@" inversion is applied, so it always
# states the literal comparison.
sub check_single_threshold {
    my ($value, $threshold) = @_;
    my ($inclusive, $start, $end) = (
        $threshold =~ /(\@?)(?:(-?[0-9.]+|~):)?(-?[0-9.]+|~)?/);
    $start = "0" unless defined $start;
    $end   = ""  unless defined $end;

    my $message;
    my $goodmessage;
    my $badmessage;

    my $state = 1;
    if ($start eq "~" && $end eq "") {
        # Open on both sides.  Without this case the empty end would be
        # compared numerically, i.e. as 0, and every positive value would
        # be reported as out of range.
        $goodmessage = "$value within unbounded range";
        $badmessage  = $goodmessage;
    } elsif ($start eq "~") {
        $badmessage  = "$value > $end";
        $goodmessage = "$value <= $end";
        if ($value > $end) {
            $state = 0;
        }
    } elsif ($end eq "") {
        $badmessage  = "$value < $start";
        $goodmessage = "$value >= $start";
        if ($value < $start) {
            $state = 0;
        }
    } else {
        $badmessage  = "$value outside range ($start to $end)";
        $goodmessage = "$start <= $value <= $end";
        if ($value < $start || $value > $end) {
            $state = 0;
        }
    }

    $message = $state ? $goodmessage : $badmessage;

    # Negate the result if inclusive
    if ($inclusive) {
        $state = $state ? 0 : 1;
    }
    return ($state, $message);
}
|---|
| 281 |
|
|---|
| 282 |
# ---------------------------------------------------------------------------
# Main program: parse the command line, fetch the check from resmon, apply
# the requested rule to the metric, print one status line and exit with the
# matching Nagios status code.
# ---------------------------------------------------------------------------
use Scalar::Util qw(looks_like_number);

# -m has no meaningful default: clear it so that leaving the option out is
# caught by the mandatory-argument test below.
$metric = undef;

Getopt::Long::Configure('bundling', 'no_ignore_case');
my $options_ok = GetOptions (
    "h|help"            => \&help,
    # --hostname is accepted as an alias for --host.
    "H|host|hostname=s" => \$host,
    "P|port=i"          => \$port,
    "M|module=s"        => \$module,
    "C|check=s"         => \$check,
    "A|age=i"           => \$age,
    "m|metric=s"        => \$metric,
    "w|warning=s"       => \$warning,
    "c|critical=s"      => \$critical,
    "e|equals=i"        => \$equals,
    "r|regex=s"         => \$regex,
    "u|url=s"           => \$url,
    "a|allowabsent"     => \$allow_absent);

# Refuse to run on unknown or malformed options, or when any of the four
# mandatory arguments is missing.
unless ($options_ok && defined $host && defined $module && defined $check
        && defined $metric) {
    short_help();
}

if ((defined $warning || defined $critical || defined $equals ) &&
        defined $regex) {
    print "Cannot specify both numeric thresholds and a string based match\n";
    exit $ERRORS{'UNKNOWN'};
}

if ((defined $warning || defined $critical ) && defined $equals) {
    print "Cannot specify thresholds and equals\n";
    exit $ERRORS{'UNKNOWN'};
}

my $ua = LWP::UserAgent->new;
$url = "/$module/$check" unless $url;
my $t = HTTP::Request->new('GET', "http://$host:$port$url");
my $xs = XML::Simple->new();
my $state = "UNKNOWN";

# Everything that can fail while fetching or evaluating runs inside eval;
# any die() is reported as CRITICAL below.  Messages end in "\n" so Perl does
# not append "at FILE line N." to the plugin output.
eval {
    # Make the HTTP request
    my $res = $ua->request($t);
    die "could not fetch http://$host:$port$url - " . $res->status_line ."\n"
        unless($res && $res->is_success);

    # Parse the xml
    my $ref;
    eval { $ref = $xs->XMLin($res->content, ForceArray => 1); };
    die "error parsing XML\n" if($@);

    # Debugging
    #use Data::Dumper;
    #print Dumper($ref->{ResmonResult});

    # Pick the result for the requested module/check.  Every entry is
    # examined, including the last one.  When nothing matches, a response
    # holding exactly one result is still accepted as-is; otherwise the
    # check is reported as missing rather than evaluating an unrelated one.
    my $results = $ref->{ResmonResult} || [];
    my ($result) = grep {
        defined $_->{module}  && $_->{module}  eq $module &&
        defined $_->{service} && $_->{service} eq $check
    } @$results;
    $result = $results->[0] if !$result && @$results == 1;
    die "check $module/$check not found in response\n" unless $result;

    # If we have stale information, then go critical
    if ($age) {
        my $stamp = $result->{last_update}->[0];
        die "No last_update timestamp in response\n" unless defined $stamp;
        my $last_update = time() - $stamp;
        die "Stale metrics. Last updated $last_update seconds ago\n"
            if ($age < $last_update);
    }

    # Get the metrics
    my $metricval = $result->{metric}->{$metric};

    # Detect absence of a metric
    if (!defined($metricval)) {
        $state = $allow_absent ? "OK" : "CRITICAL";
        print "$state: Metric $metric is absent\n";
        exit $ERRORS{$state};
    }

    my $value = $metricval->{content};
    my $type = $metricval->{type} || "0";

    # Note: if type is auto (0), then we assume it can be treated as a number
    # of some sort. If you're specifying a warning/critical threshold, then
    # you are too.
    die "Numeric threshold specified for a non-numeric metric\n"
        if ((defined $warning || defined $critical) && $type !~ /[0IlLni]/);

    # The three rule kinds are mutually exclusive (enforced above), so at
    # most one branch applies.
    if (defined $equals) {
        $value = '' unless defined $value;
        # Compare as numbers so that e.g. "5.0" equals 5; a value that is
        # not numeric at all can never be equal.
        if (looks_like_number($value) && $value == $equals) {
            $state = "OK";
        } else {
            $state = "CRITICAL";
        }
        print "$state: $metric = $value\n";
    } elsif (defined $regex) {
        $value = '' unless defined $value;
        if ($value =~ /$regex/) {
            $state = "OK";
        } else {
            $state = "CRITICAL";
        }
        print "$state: $metric - $value\n";
    } elsif (defined $warning || defined $critical) {
        my $message;
        ($state, $message) = check_threshold($value, $warning, $critical);
        print "$state: $metric - $message\n";
    } else {
        # Nothing to evaluate: say so instead of exiting silently.
        print "$state: $metric - no rule (-w, -c, -e or -r) specified\n";
    }
};

if($@) {
    chomp($@);
    print "CRITICAL: $@\n";
    exit $ERRORS{'CRITICAL'};
} else {
    exit $ERRORS{$state};
}
|---|