root/trunk/tools/system_monitoring.pl

Revision 200, 11.3 kB (checked in by depesz, 3 years ago)

1. add ability to ignore output of checks (to implement cleanup pseudo-checks)
2. add ability to read files directly by system_monitoring.pl, and not by using external 'cat' program

  • Property svn:executable set to *
Line 
#!/usr/bin/perl -w

# Strictures for the script's main scope; they are lexically scoped, so they
# also cover the rest of this file.
use strict;
use warnings;

package main;

# Entry point: build the monitor object and hand control over to it.
# All of the real work (config loading, starting checks, main loop) is done
# inside Monitoring::run().
my $program = Monitoring->new();
$program->run();

exit;
8
9 package Monitoring;
10 use strict;
11 use English qw( -no_match_vars );
12 use Time::HiRes qw( time );
13 use POSIX qw( strftime );
14 use File::Spec;
15 use File::Path qw( mkpath );
16 use Data::Dumper;
17 use IO::Select;
18 use IO::Handle;
19
# Constructor.
#
# Returns an empty hash reference blessed into the class it was invoked on.
# No state is set up here; the object is populated later (see run()).
sub new {
    my ( $class ) = @_;

    my $self = {};
    bless $self, $class;

    return $self;
}
24
# Top level driver of the monitor.
#
# Steps, in order:
#   1. load and validate the configuration,
#   2. create the IO::Select set used to wait for output of checks,
#   3. start all persistent checks,
#   4. enter the main loop.
#
# Returns nothing.
sub run {
    my ( $self ) = @_;

    # Configuration has to be loaded and sanity-checked before anything starts.
    for my $step ( qw( read_config validate_config ) ) {
        $self->$step();
    }

    # The select set must exist before any check is started, because starting
    # a check registers its handle in it.
    $self->{ 'select' } = IO::Select->new();

    for my $step ( qw( start_persistent_processes main_loop ) ) {
        $self->$step();
    }

    return;
}
37
# Main event loop of the monitor. Never returns normally (it can only be left
# by an exception thrown from one of the called methods).
#
# Every iteration:
#   1. refreshes current_time and makes sure log filehandles match it,
#   2. waits (up to the calculated timeout) for output from running checks,
#   3. refreshes current_time again, because the wait may have taken a long time,
#   4. processes output of every readable handle,
#   5. starts periodic checks that are due.
sub main_loop {
    my $self = shift;
    while ( 1 ) {
        $self->{ 'current_time' } = time();
        $self->update_logger_filehandles();

        my $timeout = $self->calculate_timeout();
        my @ready   = $self->{ 'select' }->can_read( $timeout );

        # can_read() can block for the whole $timeout, so the time sampled above
        # is stale by now. Without refreshing it, log lines would get an old
        # timestamp, could be written to the previous hour's file, and periodic
        # checks that became due during the wait would be skipped until the
        # next iteration.
        $self->{ 'current_time' } = time();
        $self->update_logger_filehandles();

        for my $fh ( @ready ) {
            $self->handle_read( $fh );
        }
        $self->start_periodic_processes();
    }
}
52
# Reads whatever is currently available on given input handle, and logs it.
#
# Arguments:
#   $fh - handle reported as readable; it has to be the 'input' handle of one
#         of the configured checks.
#
# Behaviour:
#   - dies if the handle doesn't belong to any check,
#   - if nothing could be read (end of file, or read error) the handle is
#     removed from the select set, closed and forgotten; for periodic checks
#     next_call is moved to current_time + interval, if it was still in the future,
#   - output of checks marked with 'ignore' is read, but not logged,
#   - otherwise every line is prefixed with a timestamp built from
#     current_time, a trailing newline is added if missing, and the data is
#     printed to the log handle of the check and flushed.
#
# Returns nothing.
sub handle_read {
    my $self = shift;
    my $fh   = shift;

    # Find the check that owns this handle.
    my $C;
    for my $check ( keys %{ $self->{ 'checks' } } ) {
        my $tmp = $self->{ 'checks' }->{ $check };
        next unless $tmp->{ 'input' };
        my $tmp_fh = $tmp->{ 'input' };
        next if "$tmp_fh" ne "$fh";    # Stringified reference to io handle
        $C = $tmp;
        last;
    }
    die "Data from unknown input?! It shouldn't *ever* happen\n" unless $C;

    my $read_data = '';
    my $probe     = IO::Select->new( $fh );
    while ( 1 ) {
        my $buffer;
        my $read_bytes = sysread( $fh, $buffer, 8192 );

        # undef means read error, 0 means end of file - either way there is
        # nothing more to get from this handle now.
        last unless $read_bytes;

        $read_data .= $buffer;
        last if 8192 > $read_bytes;

        # Full chunk was read, so there might be more data waiting. Ask before
        # reading again: a blind sysread() would block the whole monitor if
        # the writer produced exactly 8192 bytes and went quiet.
        my @more = $probe->can_read( 0 );
        last unless @more;
    }

    if ( '' eq $read_data ) {
        $self->{ 'select' }->remove( $fh );
        close $fh;
        delete $C->{ 'input' };
        return unless 'periodic' eq $C->{ 'type' };
        $C->{ 'next_call' } = $self->{ 'current_time' } + $C->{ 'interval' } if $self->{ 'current_time' } < $C->{ 'next_call' };
        return;
    }
    return if $C->{ 'ignore' };

    my $line_prefix = strftime( '%Y-%m-%d %H:%M:%S %Z | ', localtime( $self->{ 'current_time' } ) );
    $read_data =~ s/^/$line_prefix/gm;
    $read_data =~ s/([^\n])\z/$1\n/;    # make sure logged chunk ends with new line
    print { $C->{ 'fh' } } $read_data;
    $C->{ 'fh' }->flush();
    return;
}
92
# Starts given check, and registers its output handle.
#
# Arguments:
#   $C - hashref describing the check; its 'exec' value is used.
#
# If 'exec' starts with '<' (with optional whitespace around it), the rest
# is opened as a file for reading. Otherwise the whole value is opened as
# a command ('-|' mode) whose output will be read.
#
# The opened handle is stored in $C->{'input'}, and added to the select set.
# Dies if the open fails. Returns nothing.
sub run_check {
    my ( $self, $C ) = @_;

    # Work on a copy, so the configured value is left untouched.
    my $command = $C->{ 'exec' };
    my $mode    = ( $command =~ s/\A\s*<\s*// ) ? '<' : '-|';

    open( my $fh, $mode, $command )
        or die "Cannot open [$command] in mode [$mode]: $OS_ERROR\n";

    $C->{ 'input' } = $fh;
    $self->{ 'select' }->add( $fh );

    return;
}
107
# Starts every periodic check that is due.
#
# A check is started when all of these hold:
#   - its type is 'periodic',
#   - it is not running now (it has no 'input' handle),
#   - it was never scheduled, or its next_call is not later than current_time.
#
# After starting, next_call of the check is set to current_time + interval.
# Returns nothing.
sub start_periodic_processes {
    my ( $self ) = @_;

    for my $check ( values %{ $self->{ 'checks' } } ) {
        next if 'periodic' ne $check->{ 'type' };
        next if defined $check->{ 'input' };

        my $due_at = $check->{ 'next_call' };
        next if defined $due_at && $due_at > $self->{ 'current_time' };

        $self->run_check( $check );
        $check->{ 'next_call' } = $self->{ 'current_time' } + $check->{ 'interval' };
    }

    return;
}
119
# Starts all checks of type 'persistent'.
#
# Checks of any other type are left alone. Returns nothing.
sub start_persistent_processes {
    my ( $self ) = @_;

    my @all_checks = values %{ $self->{ 'checks' } };
    for my $check ( @all_checks ) {
        if ( 'persistent' eq $check->{ 'type' } ) {
            $self->run_check( $check );
        }
    }

    return;
}
128
# Calculates how long the main loop can wait for output of checks.
#
# Only checks that are not persistent, and are not running now (have no
# 'input' handle) are taken into account:
#   - if any of them was never scheduled (no next_call), returns 0,
#   - otherwise returns time left to the earliest next_call, but not less
#     than 0.5 second,
#   - if there are no such checks, returns 0.5.
sub calculate_timeout {
    my ( $self ) = @_;

    my @waiting = grep { ( 'persistent' ne $_->{ 'type' } ) && ( !defined $_->{ 'input' } ) } values %{ $self->{ 'checks' } };

    # Check that was never run has to be started right away.
    for my $check ( @waiting ) {
        return 0 unless defined $check->{ 'next_call' };
    }

    my ( $nearest ) = sort { $a <=> $b } map { $_->{ 'next_call' } } @waiting;
    $nearest = $self->{ 'current_time' } unless defined $nearest;

    my $sleep_time = $nearest - $self->{ 'current_time' };

    # limit sleep time to 0.5s to avoid too aggressive calls.
    return 0.5 if $sleep_time < 0.5;
    return $sleep_time;
}
148
# Makes sure that log filehandles of all checks point to files matching
# current_time.
#
# Log file names contain date and hour, so the handles are (re)opened only
# when the date/hour based suffix differs from the one used previously
# (which also covers the very first call). In such case:
#   - directory logdir/YYYY/MM/DD is created (mode 750) if it doesn't exist,
#   - for every check that is not marked with 'ignore', previously opened
#     log handle (if any) is closed, and file named <check><suffix> in that
#     directory is opened for appending; the handle is stored in $C->{'fh'}.
#
# Dies if a log file cannot be opened. Returns nothing.
sub update_logger_filehandles {
    my ( $self ) = @_;

    my @now = localtime( $self->{ 'current_time' } );

    my $file_suffix = strftime( '-%Y-%m-%d-%H.log', @now );
    my $previous    = $self->{ 'previous-suffix' };
    return if defined $previous && $previous eq $file_suffix;
    $self->{ 'previous-suffix' } = $file_suffix;

    my $full_directory = File::Spec->catfile( $self->{ 'logdir' }, strftime( '%Y/%m/%d', @now ) );
    if ( !-e $full_directory ) {
        mkpath( [ $full_directory ], 0, oct( "750" ) );
    }

    for my $name ( keys %{ $self->{ 'checks' } } ) {
        my $C = $self->{ 'checks' }->{ $name };

        # Ignored checks are never logged, so they don't need a log file.
        next if $C->{ 'ignore' };

        if ( $C->{ 'fh' } ) {
            close $C->{ 'fh' };
            delete $C->{ 'fh' };
        }

        my $full_name = File::Spec->catfile( $full_directory, $name . $file_suffix );
        open( my $fh, '>>', $full_name ) or die "Cannot write to $full_name: $OS_ERROR\n";
        $C->{ 'fh' } = $fh;
    }

    return;
}
177
# Checks that loaded configuration is complete and sane. Dies with
# a descriptive message on the first problem found.
#
# Requirements:
#   - logdir has to be defined,
#   - there have to be some checks defined,
#   - every check has to have type (persistent or periodic), and exec,
#   - every periodic check has to have interval, being a positive integer.
#
# As a side effect, the name of each check is stored in its hash, under
# the 'name' key. Returns nothing.
sub validate_config {
    my $self = shift;

    die "GLOBAL.logdir was not provided in config!\n" unless defined $self->{ 'logdir' };
    die "There are no checks to be run!\n"            unless defined $self->{ 'checks' };

    for my $check ( sort keys %{ $self->{ 'checks' } } ) {
        my $C = $self->{ 'checks' }->{ $check };
        $C->{ 'name' } = $check;

        # Without this, missing type would end up as "uninitialized value"
        # warnings, and an error message without the offending value.
        die "Undefined type for check $check!\n" unless defined $C->{ 'type' };
        die "Bad type " . $C->{ 'type' } . " in check $check!\n" unless $C->{ 'type' } =~ m{\A(?:persistent|periodic)\z};

        # Check without exec can't be run; report it now, and not when the
        # check is about to be started.
        die "Undefined exec for check $check!\n" unless defined $C->{ 'exec' };

        next unless $C->{ 'type' } eq 'periodic';

        die "Undefined interval for check $check!\n" unless defined $C->{ 'interval' };
        die "Bad interval (" . $C->{ 'interval' } . ") in check $check!\n" unless $C->{ 'interval' } =~ m{\A[1-9]\d*\z};
    }
    return;
}
196
# Loads configuration from the file named by the first command line argument
# (which is removed from @ARGV).
#
# Recognized lines (parameter names are matched case-insensitively):
#   GLOBAL.logdir = VALUE
#       stored in $self->{'logdir'}
#   check.NAME.(type|exec|interval|ignore) = VALUE
#       stored in $self->{'checks'}->{NAME}->{parameter}
# Comments (lines starting with #) and empty lines are skipped, leading and
# trailing whitespace is removed.
#
# Dies if no file name was given, if the file cannot be opened, or on the
# first line that is not recognized. Returns nothing.
sub read_config {
    my $self = shift;

    die "You have to provide name of config file! Check: perldoc $PROGRAM_NAME\n" if 0 == scalar @ARGV;
    my $config_file_name = shift @ARGV;

    open my $fh, '<', $config_file_name or die "Cannot open config file ($config_file_name) : $OS_ERROR\n";
    while ( my $line = <$fh> ) {
        next if $line =~ m{^\s*#};     # comment
        next if $line =~ m{^\s*\z};    # empty line
        $line =~ s{\A\s*}{};           # removing leading spaces
        $line =~ s{\s*\z}{};           # removing trailing spaces
        if ( $line =~ m{ \A GLOBAL\.logdir \s* = \s* (\S.*) \z }xmsi ) {
            $self->{ 'logdir' } = $1;
            next;
        }
        elsif ( $line =~ m{ \A check\.([A-Za-z0-9_]+)\.(type|exec|interval|ignore) \s* = \s* (\S.*) \z }xmsi ) {
            my ( $check, $param, $value ) = ( $1, $2, $3 );

            # Parameter name is matched case-insensitively, but the rest of
            # the code looks only for lowercase keys - normalize it, so that
            # "check.x.TYPE" is not silently lost. Check name stays as given.
            $self->{ 'checks' }->{ $check }->{ lc $param } = $value;
            next;
        }
        die "Unknown line: [ $line ]\n";
    }
    close $fh;
    return;
}
222
223 1;
224
225 =head1 system_monitoring.pl
226
227 =head2 USAGE
228
229 system_monitoring.pl <config_file>
230
231 =head2 DESCRIPTION
232
233 system_monitoring.pl script is meant to provide a single, simple solution for logging system data which changes more often than is practical for systems like cacti/nagios.
234
235 It is meant to be run on some low-privilege account, and gather the data, which are partitioned automatically by source, and time, and stored in simple text files.
236
237 After running, system_monitoring.pl will check config, and if there are no errors - will start processing checks.
238
239 All checks work in parallel, so there is no chance that a single check could block the whole system_monitoring.pl.
240
241 =head2 Configuration file
242
243 Format of the configuration file is kept as simple as possible, to make this script very portable - which in this particular case means: no external (aside from core perl) dependencies.
244
245 Each line should be one of:
246
247 =over
248
249 =item * Comment (starts with #)
250
251 =item * Empty line (just white space characters)
252
253 =item * Setting
254
255 =back
256
257 Where setting line looks like:
258
259     PARAM=value
260
261 with optional leading, trailing or around "=" whitespace.
262
263 Recognized parameters are:
264
265 =over
266
267 =item * GLOBAL.logdir - full path to log directory
268
269 =item * check.XXX.type - type of check with name XXX
270
271 =item * check.XXX.exec - what should be executed to get data for check XXX
272
273 =item * check.XXX.interval - how often to run check XXX
274
275 =item * check.XXX.ignore - should output be ignored?
276
277 =back
278
279 There are only two supported types:
280
281 =over
282
283 =item * persistent - which means given program is to be run in background, and whatever it will return should be logged. Such program "interval" will be ignored.
284
285 =item * periodic - which means that given program is to be run periodically as it will exit after returning data
286
287 =back
288
289 "exec" parameter is simply command line, to be run via shell, that will run the program.
290
291 If exec parameter starts with '<' character (with optional whitespace characters after), it is treated as filename to be read, and logged.
292
293 Due to the way it is internally processed - using "<" approach makes sense only for periodic checks - in case of persistent checks it would simply copy the file at start of system_monitoring.pl, and
294 ignore any changes to it afterwards. If you'd like to have something like 'tail -f' - use tail -f.
295
296 interval is time (in seconds) how often given program (of periodic type) should be run.
297
298 ignore is an optional parameter which is checked using Perl boolean logic (any value other than empty string or 0 is treated as true). Since system_monitoring doesn't allow setting an empty string as value
299 for option - it's best to not include ignore option for checks you want to log, and just add '...ignore=1' for those that you want to ignore.
300
301 If ignore is set, system_monitoring will not log output from such check.
302
303 This is helpful for building in compression of older logs, using for example:
304
305     check.cleanup.type=periodic
306     check.cleanup.interval=300
307     check.cleanup.exec=find /var/log/monitoring -type f -name '*.log' -mmin +120 -print0 | xargs -0 gzip
308     check.cleanup.ignore=1
309
310 "XXX" (name of check) can consist only of upper and lower case letters, digits, and character _. That is it has to match regular expression:
311
312     /\A[A-Za-z0-9_]+\z/
313
314 Output from all programs will be logged in files named:
315
316     /logdir/YYYY/MM/DD/XXX-YYYY-MM-DD-HH.log
317
318 where YYYY, MM, DD and HH are date and time parts of current (as of logging moment) time.
319
320 HH is 0 padded 24-hour style hour.
321
322 Example configuration:
323
324     # Global configuration, log directory
325     GLOBAL.logdir=/var/tmp/monitoring
326
327     # Logging iostat output in 10 second intervals
328     check.iostat.type=persistent
329     check.iostat.exec=iostat -kx 10
330
331     # Logging "ps auxwwn" every 30 seconds.
332     check.ps.type=periodic
333     check.ps.exec=ps auxwwn
334     check.ps.interval=30
335
336 =head2 LICENSE
337
338 Copyright (c) 2010, OmniTI, Inc.
339
340 Permission to use, copy, modify, and distribute this software and its
341 documentation for any purpose, without fee, and without a written agreement
342 is hereby granted, provided that the above copyright notice and this
343 paragraph and the following two paragraphs appear in all copies.
344
345 IN NO EVENT SHALL OmniTI, Inc. BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
346 SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
347 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
348 OmniTI, Inc. HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
349
350 OmniTI, Inc. SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
351 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
352 PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
353 AND OmniTI, Inc. HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
354 UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
355
356 =head2 COPYRIGHT
357
358 The OmniPITR project is Copyright (c) 2010 OmniTI. All rights reserved.
359
Note: See TracBrowser for help on using the browser.