root/resources/check_resmon_metric

Revision eb4ac990d1c5ec58d6843a8b629dd7637295c789, 9.0 kB (checked in by Mark Harrison <mark@omniti.com>, 4 years ago)

Documentation for check_resmon_metric

git-svn-id: https://labs.omniti.com/resmon/branches/resmon2@285 8c0face9-b7db-6ec6-c4b3-d5f7145c7d55

  • Property mode set to 100755
Line 
1 #!/usr/bin/perl -w
2 # Remove the following line to disable embedded perl
3 # nagios: +epn
4
5 =pod
6
7 =head1 NAME
8
9 check_resmon_metric - Nagios check to monitor a resmon metric
10
11 =head1 SYNOPSIS
12
13 check_resmon_metric -H hostname [-P port] -M module_name -C check_name
14                     -m metric_name [ -r regex | [-w warning_threshold]
15                     [-c critical_threshold] ] [-A age]
16
17 =head1 DESCRIPTION
18
19 This is a nagios check script that will connect to a running resmon instance,
20 fetch information for a single check, and compare a metric against rules
21 provided on the command line, returning the status to nagios based on the
22 result.
23
24 For numeric metrics, there are options for warning/critical thresholds, and
25 for string metrics, there is an option to match against a regular expression.
26
27 =head1 OPTIONS
28
29 =over
30
31 =item -H hostname
32
33 The hostname of the resmon instance to connect to. Required.
34
35 =item -P port
36
37 The port that resmon is listening on. Defaults to 81.
38
39 =item -M module_name
40
41 The module name of the check you wish to fetch. Required.
42
43 =item -C check_name
44
45 The check name you wish to fetch. Required.
46
47 =item -m metric_name
48
49 The name of the metric you wish to evaluate rules against. Required.
50
51 =item -w warning_threshold
52
53 A numeric threshold (see below) to test the metric against. Will return a
54 warning status if the threshold matches. This is only applicable for numeric
55 metrics. An error will be returned for string metrics. Optional.
56
57 =item -c critical_threshold
58
59 A numeric threshold (see below) to test the metric against. Will return a
60 critical status if the threshold matches. This is only applicable for numeric
61 metrics. An error will be returned for string metrics. If both the critical
62 and warning thresholds match a given metric, then the status returned is
63 critical.  Optional.
64
65 =item -r regex
66
67 A regular expression to match the metric against. This is most useful for
68 string based metrics. If the regular expression matches, then an ok status
69 will be returned, otherwise the status will be critical. This cannot be used
70 in conjunction with the warning/critical thresholds. Optional.
71
72 =item -A age
73
74 This option will cause a check to be critical if it was updated more than age
75 seconds ago. Optional; if omitted, the age of the check is not tested.
76
77 =back
78
79 =head1 THRESHOLDS
80
81 The warning and critical thresholds are specified in the same way as for other
82 nagios plugins, as described at
83 http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
84
85 Most of the time, you just want to put a number for the threshold. For
86 example, '-w 10 -c 20' would go warning if the value is over 10, and critical
87 if it is over 20.
88
89 For more complex ranges, use the more generalized format of: [@]start:end
90
91 =over
92
93 =item *
94
95 Start and end are both numbers, and either can be negative
96
97 =item *
98
99 If start is missing, then the colon isn't required, and the range is from 0 to
100 end. In other words, if you specify a single number, the range is from 0 to
101 the number you specified.
102
103 =item *
104
105 If end is missing (E.g. 10:), then the range is from start to infinity.
106
107 =item *
108
109 If start is '~', then the range is from minus infinity to end (as opposed to
110 0-end if start is omitted)
111
112 =item *
113
114 An alert is generated if the test value lies outside the specified range.
115
116 =item *
117
118 If @ is specified at the beginning, the alerting behavior is flipped. In other
119 words, an alert is generated if the value lies within the range.
120
121 =back
122
123 =head2 EXAMPLES
124
125 The following lists example thresholds, and the conditions under which an
126 alert is generated.
127
128 =over
129
130 =item '10'
131
132 Ok if: 0 <= X <= 10, Alert if: X < 0 or X > 10
133
134 =item '10:'
135
136 Ok if: X >= 10, Alert if: X < 10
137
138 =item '~:10'
139
140 Ok if: X <= 10, Alert if: X > 10
141
142 =item '10:20'
143
144 Ok if: 10 <= X <= 20, Alert if: X < 10 or X > 20
145
146 =item '@10:20'
147
148 Ok if: X < 10 or X > 20, Alert if: 10 <= X <= 20
149
150 =back
151
152 =cut
153
use vars qw($PROGNAME);

# Record the script's base name: the part of $0 after the last '/' or '\'.
# When $0 has no such trailing component to extract, fall back to $0 itself
# so that $PROGNAME is never left undefined.
$PROGNAME = $0 =~ m{([^/\\]+)\z} ? $1 : $0;
158
159 use strict;
160 use warnings;
161 use LWP::UserAgent;
162 use HTTP::Request;
163 use HTTP::Response;
164 use Time::HiRes qw( gettimeofday tv_interval );
165 use XML::Simple;
166 use Getopt::Long;
167
168 use utils qw($TIMEOUT %ERRORS &print_revision &support);
169
# Remove these variables from the environment before doing any work (the
# same set perlsec suggests clearing, plus PATH).
delete @ENV{'PATH', 'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Command line settings and their defaults. Each variable is initialised on
# its own line so that none can be left undefined by a miscounted list; in
# particular $metric now starts out as an empty string like the other
# string options.
my $port     = 81;    # -P: port resmon listens on
my $host     = "";    # -H: host to query
my $module   = "";    # -M: resmon module name
my $check    = "";    # -C: check name within the module
my $age      = 0;     # -A: maximum age in seconds (0 disables the test)
my $warning  = "";    # -w: warning threshold
my $critical = "";    # -c: critical threshold
my $regex    = "";    # -r: regular expression to match the metric against
my $metric   = "";    # -m: metric name
174
# help()
#
# Prints the full option summary to STDOUT and then exits with the UNKNOWN
# status code taken from %ERRORS. Never returns. Any arguments (such as the
# ones Getopt::Long passes to an option handler) are ignored.
sub help {
    # The usage line needs its own newline, otherwise the first option line
    # is printed directly after it on the same line.
    print "Usage: $0 [options]\n";
    # The long option is declared as "host", so that is the name shown here.
    print " -H | --host         host to check\n";
    print " -P | --port         port that resmon runs on (default 81)\n";
    print " -M | --module       module to check\n";
    print " -C | --check        name of individual check\n";
    print " -A | --age          how recently should the check have been";
    print " updated\n";
    print " -m | --metric       metric name to check\n";
    print " -w | --warning      warning threshold (numeric metrics only)\n";
    print " -c | --critical     critical threshold (numeric metrics only)\n";
    print " -r | --regex        regex match against the metric (string";
    print " metrics only)\n";
    exit $ERRORS{'UNKNOWN'};
}
190
# short_help()
#
# Prints a two-line usage reminder to STDOUT and exits with the UNKNOWN
# status code taken from %ERRORS. Never returns.
sub short_help {
    print "Usage: $0 -H host -M module -C check -m metric [options]\n",
          "run $0 --help for more information\n";
    exit $ERRORS{UNKNOWN};
}
196
# check_threshold($value, $warning, $critical)
#
# Evaluates $value against the critical and warning thresholds, in that
# order, using check_single_threshold() for each one.
#
# Parameters:
#   $value    - the metric value to test
#   $warning  - warning threshold string; undef or "" means "not given"
#   $critical - critical threshold string; undef or "" means "not given"
#
# Returns the list ($status, $message):
#   ("CRITICAL", msg) if the critical threshold raises an alert,
#   ("WARNING", msg)  if only the warning threshold raises an alert,
#   ("OK", msg)       otherwise, where msg joins the messages of every
#                     threshold that was evaluated with " and ".
sub check_threshold {
    my ($value, $warning, $critical) = @_;
    my @ok_messages;

    # A threshold counts as given when it is a non-empty string. A plain
    # truth test would silently skip the valid threshold "0".
    if (defined $critical && length $critical) {
        my ($ok, $text) = check_single_threshold($value, $critical);
        return ("CRITICAL", $text) unless $ok;
        push @ok_messages, $text;
    }

    if (defined $warning && length $warning) {
        my ($ok, $text) = check_single_threshold($value, $warning);
        return ("WARNING", $text) unless $ok;
        push @ok_messages, $text;
    }

    return ("OK", join(" and ", @ok_messages));
}
220
# check_single_threshold($value, $threshold)
#
# Tests a numeric value against one threshold of the form [@]start:end.
#
# Parameters:
#   $value     - the number to test
#   $threshold - the range specification:
#                  "end"        range is 0 to end
#                  "start:"     range is start to infinity
#                  "~:end"      range is minus infinity to end
#                  "start:end"  range is start to end
#                a leading "@" flips the result, so that values inside the
#                range raise the alert
#
# Returns the list ($state, $message). $state is 1 when no alert should be
# raised and 0 when one should. $message always describes where the value
# lies relative to the range, regardless of "@".
#
# Dies with "Invalid threshold ..." when $threshold does not follow the
# format above.
sub check_single_threshold {
    my ($value, $threshold) = @_;

    # Anchored at both ends so that malformed input is rejected instead of
    # being read as an empty (0 to infinity) range.
    my ($inclusive, $start, $end) = $threshold =~ m{
        \A
        (\@?)                          # optional flag: alert inside the range
        (?: ( -?[0-9.]+ | ~ ) : )?     # optional start followed by a colon
        ( -?[0-9.]+ )?                 # optional end
        \z
    }x or die "Invalid threshold '$threshold'\n";

    # Only a start that was not given defaults to 0.
    $start = 0 unless defined $start;
    # An end that was not given means "no upper bound". This has to be a
    # definedness test: an explicit end of 0 is a real bound.
    my $open_ended = !defined $end;

    my $state = 1;
    my ($goodmessage, $badmessage);
    if ($start eq "~") {
        if ($open_ended) {
            # "~:" covers every number, so the value is never outside it.
            $goodmessage = $badmessage = "$value within unbounded range";
        } else {
            $badmessage  = "$value > $end";
            $goodmessage = "$value <= $end";
            $state = 0 if $value > $end;
        }
    } elsif ($open_ended) {
        $badmessage  = "$value < $start";
        $goodmessage = "$value >= $start";
        $state = 0 if $value < $start;
    } else {
        $badmessage  = "$value outside range ($start to $end)";
        $goodmessage = "$start <= $value <= $end";
        $state = 0 if $value < $start || $value > $end;
    }

    # The message is chosen before the "@" flip, so it states the plain
    # relation between the value and the range.
    my $message = $state ? $goodmessage : $badmessage;

    # Negate the result if inclusive
    if ($inclusive) {
        $state = $state ? 0 : 1;
    }
    return ($state, $message);
}
260
# ---------------------------------------------------------------------------
# Main program: parse the command line, fetch one check from resmon over
# HTTP, evaluate the requested metric and exit with the matching status code
# from %ERRORS.
# ---------------------------------------------------------------------------

Getopt::Long::Configure('bundling', 'no_ignore_case');
# "hostname" is accepted as an alias for "host". An unrecognised option makes
# GetOptions return false, in which case the short usage text is shown.
GetOptions (
    "h|help"            => \&help,
    "H|host|hostname=s" => \$host,
    "P|port=i"          => \$port,
    "M|module=s"        => \$module,
    "C|check=s"         => \$check,
    "A|age=i"           => \$age,
    "m|metric=s"        => \$metric,
    "w|warning=s"       => \$warning,
    "c|critical=s"      => \$critical,
    "r|regex=s"         => \$regex) or short_help();

# -H, -M, -C and -m are all documented as required.
for my $required ($host, $module, $check, $metric) {
    short_help() unless defined $required && length $required;
}

# An option counts as given when it is a non-empty string; a plain truth
# test would ignore a threshold or regex of "0".
my $have_regex      = defined $regex && length $regex;
my $have_thresholds = grep { defined $_ && length $_ } ($warning, $critical);

if ($have_thresholds && $have_regex) {
    print "Cannot specify both numeric thresholds and a string based match\n";
    exit $ERRORS{'UNKNOWN'};
}

my $ua = LWP::UserAgent->new;
my $t = HTTP::Request->new('GET', "http://$host:$port/$module/$check");
my $xs = XML::Simple->new();
my $state = "UNKNOWN";

# Every die inside this block is reported as CRITICAL by the handler below.
# The messages therefore carry no "CRITICAL:" prefix of their own, and they
# end in a newline so that perl does not append "at FILE line N.".
eval {
    my $ref;
    # Make the HTTP request
    my $res = $ua->request($t);
    die "could not fetch\n" unless ($res && $res->is_success);
    # Parse the xml
    eval { $ref = $xs->XMLin($res->content, ForceArray => 1); };
    die "error parsing XML\n" if ($@ || ref($ref) ne 'HASH');

    my $result = $ref->{ResmonResult}->[0];
    die "no result found for $module/$check\n" unless ref($result) eq 'HASH';

    # If we have stale information, then go critical. A result without a
    # last_update value is treated as having been updated at time 0.
    if ($age) {
        my $last_update = time() - ($result->{last_update}->[0] || 0);
        die "Stale metrics. Last updated $last_update seconds ago\n"
            if $age < $last_update;
    }

    # Get the metric, reporting a missing one explicitly instead of going on
    # to evaluate an undefined value.
    my $metrics = $result->{metric};
    die "metric $metric not found\n"
        unless ref($metrics) eq 'HASH' && ref($metrics->{$metric}) eq 'HASH';
    my $metricval = $metrics->{$metric};
    my $value = $metricval->{content};
    $value = "" unless defined $value;
    my $type = $metricval->{type} || "0";

    # Note: if type is auto (0), then we assume it can be treated as a number
    # of some sort. If you're specifying a warning/critical threshold, then
    # you are too.
    # NOTE(review): "i" was added to the accepted type codes on the
    # assumption that resmon reports signed integers with it - confirm
    # against the resmon metric type list.
    die "Numeric threshold specified for a non-numeric metric\n"
        if ($have_thresholds && $type !~ /[0iIlLn]/);

    if ($have_regex) {
        $state = ($value =~ /$regex/) ? "OK" : "CRITICAL";
        print "$state: $value\n";
    } elsif ($have_thresholds) {
        my $message;
        ($state, $message) = check_threshold($value, $warning, $critical);
        print "$state: $metric - $message\n";
    } else {
        # No rule was given: the status stays UNKNOWN, but say why instead
        # of exiting without any output.
        print "UNKNOWN: no regex or threshold given for $metric\n";
    }
};

if ($@) {
    chomp(my $error = $@);
    print "CRITICAL: $error\n";
    exit $ERRORS{'CRITICAL'};
}
exit $ERRORS{$state};
Note: See TracBrowser for help on using the browser.