root/trunk/tools/system_monitoring.pl

Revision 201, 14.0 kB (checked in by depesz, 3 years ago)

- some cleanup to make the code a bit more readable
- added documentation for methods in case anyone ever would need to modify it

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl -w
2
package main;

# Script entry point: build the Monitoring object and immediately hand
# control over to its run() method.
Monitoring->new()->run();

# Explicit exit so execution never continues into the statements of
# package Monitoring that follow in this file.
exit;
8
9 package Monitoring;
10 use strict;
11 use English qw( -no_match_vars );
12 use Time::HiRes qw( time );
13 use POSIX qw( strftime );
14 use File::Spec;
15 use File::Path qw( mkpath );
16 use IO::Select;
17 use IO::Handle;
18
# Constructor.  Returns a fresh object backed by an empty hash, blessed
# into the class the method was invoked on.
sub new {
    my ( $class ) = @_;
    my %state = ();
    return bless \%state, $class;
}
23
# Top-level driver of the object: loads and validates the configuration,
# creates the IO::Select set stored under 'select', starts the persistent
# checks and finally enters main_loop().
sub run {
    my ( $self ) = @_;

    # The configuration must be read before it can be validated.
    for my $step ( qw( read_config validate_config ) ) {
        $self->$step();
    }

    # The select set has to exist before any check is started, because
    # starting a check registers its output handle in it.
    my $watched = IO::Select->new();
    $self->{ 'select' } = $watched;
    $self->start_persistent_processes();

    $self->main_loop();
    return;
}
36
# Endless processing loop.  Every iteration:
#   - records the current time in $self->{'current_time'},
#   - lets update_logger_filehandles() switch log files if needed,
#   - waits (at most calculate_timeout() seconds) for readable check outputs
#     and passes each readable handle to handle_read(),
#   - starts periodic checks via start_periodic_processes().
sub main_loop {
    my ( $self ) = @_;

    for ( ; ; ) {
        $self->{ 'current_time' } = time();
        $self->update_logger_filehandles();

        my $wait_for = $self->calculate_timeout();
        for my $readable ( $self->{ 'select' }->can_read( $wait_for ) ) {
            $self->handle_read( $readable );
        }

        $self->start_periodic_processes();
    }
}
51
# Reads whatever is currently available on one check's output handle and
# appends it, with a timestamp prefix on every line, to the check's log.
#
# Arguments:
#   $fh - input filehandle that IO::Select reported as readable
#
# Behaviour:
#   - dies if $fh does not belong to any known check,
#   - when nothing could be read (end of file or read error) the handle is
#     removed from the select set, closed and dropped from the check; for
#     periodic checks the next run may be rescheduled (see below),
#   - output of checks marked with 'ignore' is read but not logged.
#
# Returns nothing.
sub handle_read {
    my $self = shift;
    my $fh   = shift;

    # IO::Select gives us only the handle, so locate the owning check by
    # comparing stringified references.
    my $C;
    for my $tmp ( $self->checks ) {
        next unless $tmp->{ 'input' };
        my $tmp_fh = $tmp->{ 'input' };
        next if "$tmp_fh" ne "$fh";    # Stringified reference to io handle
        $C = $tmp;
        last;
    }
    die "Data from unknown input?! It shouldn't *ever* happen\n" unless $C;

    my $chunk_size = 8192;
    my $probe      = IO::Select->new( $fh );
    my $read_data  = '';
    while ( 1 ) {
        my $buffer;
        my $read_bytes = sysread( $fh, $buffer, $chunk_size );

        # undef is a read error, 0 is end of file - either way there is
        # nothing to append (and nothing to compare numerically).
        last unless $read_bytes;
        $read_data .= $buffer;

        # Short read: everything that was available has been consumed.
        last if $read_bytes < $chunk_size;

        # Full chunk: only read again when more data is available right
        # now, otherwise sysread would block and stall all other checks.
        last unless $probe->can_read( 0 );
    }

    if ( '' eq $read_data ) {

        # Output of the check has ended - stop watching the handle.
        $self->{ 'select' }->remove( $fh );
        close $fh;
        delete $C->{ 'input' };
        return unless 'periodic' eq $C->{ 'type' };

        # If the check finished before its already scheduled next run, move
        # that run to a full interval from now; a next_call that has already
        # been reached is left untouched.
        $C->{ 'next_call' } = $self->{ 'current_time' } + $C->{ 'interval' } if $self->{ 'current_time' } < $C->{ 'next_call' };
        return;
    }
    return if $C->{ 'ignore' };

    # Prefix every line with the timestamp and make sure that the logged
    # chunk ends with a newline.
    my $line_prefix = strftime( '%Y-%m-%d %H:%M:%S %Z | ', localtime( $self->{ 'current_time' } ) );
    $read_data =~ s/^/$line_prefix/gm;
    $read_data =~ s/([^\n])\z/$1\n/;
    print { $C->{ 'fh' } } $read_data;
    $C->{ 'fh' }->flush();
    return;
}
90
# Starts a single check and registers its output for reading.
#
# Arguments:
#   $C - check hashref; its 'exec' value is either a command line (opened
#        as a pipe to read from) or, when it starts with '<', the name of
#        a file to read.
#
# The opened handle is added to $self->{'select'} and stored in
# $C->{'input'}.  Dies when the open fails.  Returns nothing.
sub run_check {
    my ( $self, $C ) = @_;

    # Work on a copy, so that the configured 'exec' value stays intact.
    my $command = $C->{ 'exec' };

    # A leading '<' (with optional surrounding whitespace) is stripped and
    # switches from running a command to plain file reading.
    my $from_file = $command =~ s/\A\s*<\s*//;
    my $mode = $from_file ? '<' : '-|';

    open my $fh, $mode, $command or die "Cannot open [$command] in mode [$mode]: $OS_ERROR\n";

    $C->{ 'input' } = $fh;
    $self->{ 'select' }->add( $fh );

    return;
}
105
# Starts every periodic check that is due.
#
# A periodic check is started when it is not currently running (it has no
# 'input' handle) and its 'next_call' is either not set yet or not later
# than $self->{'current_time'}.  After starting, 'next_call' is set to
# current time plus the check's interval.  Returns nothing.
sub start_periodic_processes {
    my ( $self ) = @_;
    my $now = $self->{ 'current_time' };

  CHECK:
    for my $C ( $self->checks ) {
        next CHECK if 'periodic' ne $C->{ 'type' };

        # still running from the previous start
        next CHECK if defined $C->{ 'input' };

        my $due_at = $C->{ 'next_call' };
        next CHECK if defined $due_at && $due_at > $now;

        $self->run_check( $C );
        $C->{ 'next_call' } = $now + $C->{ 'interval' };
    }

    return;
}
117
# Starts (through run_check) every check whose type is 'persistent', in
# the order in which checks() returns them.  Returns nothing.
sub start_persistent_processes {
    my ( $self ) = @_;

    my @persistent = grep { 'persistent' eq $_->{ 'type' } } $self->checks;
    for my $check ( @persistent ) {
        $self->run_check( $check );
    }

    return;
}
126
# Computes how many seconds main_loop() may wait for input.
#
# Only periodic checks that are not currently running (no 'input' handle)
# are considered:
#   - if any of them has never been scheduled ('next_call' undefined),
#     returns 0,
#   - otherwise returns the time from $self->{'current_time'} to the
#     earliest 'next_call', but never less than 0.5,
#   - when there is no such check at all, returns 0.5.
sub calculate_timeout {
    my ( $self ) = @_;
    my $now = $self->{ 'current_time' };

    my @due_times;
    for my $C ( $self->checks ) {
        next if 'persistent' eq $C->{ 'type' };
        next if defined $C->{ 'input' };
        return 0 if !defined $C->{ 'next_call' };
        push @due_times, $C->{ 'next_call' };
    }

    # With nothing waiting, pretend the nearest call is "now".
    my $nearest = $now;
    if ( @due_times ) {
        ( $nearest ) = sort { $a <=> $b } @due_times;
    }

    # 0.5s is the lower bound of the wait, so that the main loop does not
    # spin too aggressively.
    my $sleep_time = $nearest - $now;
    return 0.5 if $sleep_time < 0.5;
    return $sleep_time;
}
149
# Switches all log files when the hour-based file name suffix changes.
#
# The suffix ("-YYYY-MM-DD-HH.log") is derived from $self->{'current_time'}
# and remembered in $self->{'previous-suffix'}; when it is unchanged since
# the previous call, nothing happens.  Otherwise the directory
# <logdir>/YYYY/MM/DD is created if missing and, for every check that is
# not ignored, the old log handle (if any) is closed and a new one is
# opened in append mode and stored in the check's 'fh'.
#
# Dies when a log file cannot be opened.  Returns nothing.
sub update_logger_filehandles {
    my ( $self ) = @_;

    my @now = localtime( $self->{ 'current_time' } );

    my $file_suffix = strftime( '-%Y-%m-%d-%H.log', @now );
    my $previous = $self->{ 'previous-suffix' };
    return if defined $previous && $previous eq $file_suffix;
    $self->{ 'previous-suffix' } = $file_suffix;

    my $full_directory = File::Spec->catfile( $self->{ 'logdir' }, strftime( '%Y/%m/%d', @now ) );
    if ( !-e $full_directory ) {
        mkpath( [ $full_directory ], 0, oct( "750" ) );
    }

    # Ignored checks never get a log file.
    for my $C ( grep { !$_->{ 'ignore' } } $self->checks ) {
        if ( my $old_fh = $C->{ 'fh' } ) {
            close $old_fh;
            delete $C->{ 'fh' };
        }

        my $base_name = $C->{ 'name' } . $file_suffix;
        my $full_name = File::Spec->catfile( $full_directory, $base_name );
        open my $fh, '>>', $full_name or die "Cannot write to $full_name: $OS_ERROR\n";
        $C->{ 'fh' } = $fh;
    }

    return;
}
177
# Accessor returning the list of check hashrefs kept in the arrayref under
# $self->{'checks'} (in scalar context: their count).
sub checks {
    my ( $self ) = @_;
    my $list = $self->{ 'checks' };
    return @{ $list };
}
182
# Verifies configuration loaded into the object and converts the
# $self->{'pre_checks'} hash (check name => settings) into the final
# $self->{'checks'} arrayref, ordered by check name.  Every check hash gets
# its name stored under 'name'.
#
# Dies (with message ending in newline) when:
#   - logdir or checks are missing altogether,
#   - a check has no type, or a type other than persistent/periodic,
#   - a check has no exec,
#   - a periodic check has no interval, or the interval is not a positive
#     integer.
#
# Returns nothing.
sub validate_config {
    my $self = shift;

    die "GLOBAL.logdir was not provided in config!\n" unless defined $self->{ 'logdir' };
    die "There are no checks to be run!\n"            unless defined $self->{ 'pre_checks' };

    my @checks = ();

    # Sorted, so that order of checks does not depend on hash ordering.
    for my $check ( sort keys %{ $self->{ 'pre_checks' } } ) {
        my $C = $self->{ 'pre_checks' }->{ $check };
        $C->{ 'name' } = $check;
        push @checks, $C;

        die "Undefined type for check $check!\n" unless defined $C->{ 'type' };
        die "Bad type " . $C->{ 'type' } . " in check $check!\n" unless $C->{ 'type' } =~ m{\A(?:persistent|periodic)\z};

        # Without exec there is nothing to run - fail at startup, and not
        # later, when the check is about to be started.
        die "Undefined exec for check $check!\n" unless defined $C->{ 'exec' };

        next unless $C->{ 'type' } eq 'periodic';

        die "Undefined interval for check $check!\n" unless defined $C->{ 'interval' };
        die "Bad interval (" . $C->{ 'interval' } . ") in check $check!\n" unless $C->{ 'interval' } =~ m{\A[1-9]\d*\z};
    }

    $self->{ 'checks' } = \@checks;
    delete $self->{ 'pre_checks' };

    return;
}
206
# Reads the configuration file named by the first command line argument
# (which is removed from @ARGV).
#
# Comment lines (starting with #) and empty lines are skipped.  Recognized
# settings are stored in the object:
#   GLOBAL.logdir=...                    -> $self->{'logdir'}
#   check.NAME.(type|exec|interval|ignore)=...
#                                        -> $self->{'pre_checks'}->{NAME}->{param}
#
# Setting names are matched case-insensitively; the parameter part is
# stored lowercased, so that "check.ps.Type" ends up under the same 'type'
# key as "check.ps.type".  Check name is stored exactly as written.
#
# Dies when no file name was given, when the file cannot be opened, or
# when it contains a line that cannot be recognized.  Returns nothing.
sub read_config {
    my $self = shift;

    die "You have to provide name of config file! Check: perldoc $PROGRAM_NAME\n" if 0 == scalar @ARGV;
    my $config_file_name = shift @ARGV;

    open my $fh, '<', $config_file_name or die "Cannot open config file ($config_file_name) : $OS_ERROR\n";
    while ( my $line = <$fh> ) {
        next if $line =~ m{^\s*#};     # comment
        next if $line =~ m{^\s*\z};    # empty line
        $line =~ s{\A\s*}{};           # removing leading spaces
        $line =~ s{\s*\z}{};           # removing trailing spaces
        if ( $line =~ m{ \A GLOBAL\.logdir \s* = \s* (\S.*) \z }xmsi ) {
            $self->{ 'logdir' } = $1;
            next;
        }
        elsif ( $line =~ m{ \A check\.([A-Za-z0-9_]+)\.(type|exec|interval|ignore) \s* = \s* (\S.*) \z }xmsi ) {

            # Copy captures right away, and normalize case of parameter name
            # to the lowercase keys used everywhere else.
            my ( $check_name, $param, $value ) = ( $1, lc $2, $3 );
            $self->{ 'pre_checks' }->{ $check_name }->{ $param } = $value;
            next;
        }
        die "Unknown line: [ $line ]\n";
    }
    close $fh;
    return;
}
232
233 1;
234
235 =head1 system_monitoring.pl
236
237 =head2 USAGE
238
239 system_monitoring.pl <config_file>
240
241 =head2 DESCRIPTION
242
243 system_monitoring.pl script is meant to provide a single, simple solution
244 for logging system data which changes more often than is practical for
245 systems like cacti/nagios.
246
247 It is meant to be run on some low-privilege account, and gather the data,
248 which are partitioned automatically by source, and time, and stored in
249 simple text files.
250
251 After running, system_monitoring.pl will check config, and if there are no
252 errors - will start processing checks.
253
254 All checks work in parallel, so there is no chance single check could lock
255 whole system_monitoring.pl.
256
257 =head2 Configuration file
258
259 Format of the configuration file is kept as simple as possible, to make this
260 script very portable - which in this particular case means: no external
261 (aside from core perl) dependencies.
262
263 Each line should be one of:
264
265 =over
266
267 =item * Comment (starts with #)
268
269 =item * Empty line (just white space characters)
270
271 =item * Setting
272
273 =back
274
275 Where setting line looks like:
276
277     PARAM=value
278
279 with optional leading, trailing or around "=" whitespace.
280
281 Recognized parameters are:
282
283 =over
284
285 =item * GLOBAL.logdir - full path to log directory
286
287 =item * check.XXX.type - type of check with name XXX
288
289 =item * check.XXX.exec - what should be executed to get data for check XXX
290
291 =item * check.XXX.interval - how often to run check XXX
292
293 =item * check.XXX.ignore - should output be ignored?
294
295 =back
296
297 There are only two supported types:
298
299 =over
300
301 =item * persistent - which means given program is to be run in background,
302 and whatever it will return should be logged. Such program "interval" will
303 be ignored.
304
305 =item * periodic - which means that given program is to be run periodically
306 as it will exit after returning data
307
308 =back
309
310 "exec" parameter is simply command line, to be run via shell, that will run
311 the program.
312
313 If exec parameter starts with '<' character (with optional whitespace
314 characters after), it is treated as filename to be read, and logged.
315
316 Due to the way it is internally processed - using "<" approach makes sense
317 only for periodic checks - in case of persistent checks it would simply copy
318 the file at start of system_monitoring.pl, and ignore any changes to it
319 afterwards. If you'd like to have something like 'tail -f' - use tail -f.
320
321 interval is time (in seconds) how often given program (of periodic type)
322 should be run.
323
324 ignore is optional parameter which is checked using Perl boolean logic (any
325 value other than empty string or 0 is treated as true). Since
326 system_monitoring doesn't let setting empty string as value for option -
327 it's best to not include ignore option for checks you want to log, and just
328 add '...ignore=1' for those that you want to ignore.
329
330 If ignore is set, system_monitoring will not log output from such check.
331
332 This is helpful to build-in compression of older logs, using for example:
333
334     check.cleanup.type=periodic
335     check.cleanup.interval=300
336     check.cleanup.exec=find /var/log/monitoring -type f -name '*.log' -mmin +120 -print0 | xargs -0 gzip
337     check.cleanup.ignore=1
338
339 "XXX" (name of check) can consist only of upper and lower case letters,
340 digits, and character _. That is it has to match regular expression:
341
342     /\A[A-Za-z0-9_]+\z/
343
344 Output from all programs will be logged in files named:
345
346     /logdir/YYYY/MM/DD/XXX-YYYY-MM-DD-HH.log
347
348 where YYYY, MM, DD and HH are date and time parts of current (as of logging
349 moment) time.
350
351 HH is 0 padded 24-hour style hour.
352
353 Example configuration:
354
355     # Global configuration, log directory
356     GLOBAL.logdir=/var/tmp/monitoring
357
358     # Logging iostat output in 10 second intervals
359     check.iostat.type=persistent
360     check.iostat.exec=iostat -kx 10
361
362     # Logging "ps auxwwn" every 30 seconds.
363     check.ps.type=periodic
364     check.ps.exec=ps auxwwn
365     check.ps.interval=30
366
367 =head2 INTERNALS
368
369 Program itself is very short:
370
371     my $program = Monitoring->new();
372     $program->run();
373
374 This creates $program as object of Monitoring class (defined in the same
375 file), and calls method run() on it.
376
377 =head3 METHODS
378
379 =head4 new
380
381 Just object constructor. Nothing to see there.
382
383 =head4 run
384
385 Initialization of stuff, and call to main_loop. Reads and validates config
386 (by calls to appropriate methods), initializes IO::Select object for
387 asynchronous I/O, starts persistent checks (again, using special method), and
388 enters main_loop();
389
390 =head4 main_loop
391
392 The core of the program. Infinite loop, which - upon every iteration:
393
394 =over
395
396 =item * updates logging filehandles
397
398 =item * checks if there is anything to read in input filehandles (from
399 checks)
400
401 =item * reads whatever is to be read from checks
402
403 =item * runs new periodic checks if the time has come to do it
404
405 =back
406
407 Checking for data in input filehandles is done with timeout, which is
408 calculated to finish when next check will have to be run, so the program
409 uses virtually no CPU unless there is some data to be worked on.
410
411 =head4 handle_read
412
413 Since all we get from IO::Select is filehandle to read from, this method has
414 first to find which check given filehandle belongs to.
415
416 Afterwards, it reads whatever is available in the filehandle. In case there
417 is error on the filehandle - it closes the filehandle - as it means that
418 output for given check ended.
419
420 Every line from check is prefixed with timestamp and logged to appropriate
421 logfile.
422
423 Additionally, when closing the filehandle (on error), it sets when given
424 check should be run next time.
425
426 =head4 run_check
427
428 Simple helper function which runs external program (or opens filehandle for
429 reading from file), and puts it into check data.
430
431 =head4 start_periodic_processes
432
433 Iterates over all periodic processes, checks which should be already run,
434 and runs them.
435
436 =head4 start_persistent_processes
437
438 Iterates over all persistent processes and runs them. This is done only
439 once, from run() method.
440
441 =head4 calculate_timeout
442
443 Helper function which calculates how long should main_loop() wait for data
444 from IO::Select before it has to run another round of
445 start_periodic_processes().
446
447 =head4 update_logger_filehandles
448
449 Checks if current timestamp has changed enough to require swapping files,
450 and if yes - closes old ones and opens new ones - making all necessary
451 directories to make it happen.
452
453 =head4 checks
454
455 Wrapper to be able to write:
456
457     for my $C ( $self->checks ) {
458
459 instead of:
460
461     for my $C ( @{ $self->{ 'checks'} } ) {
462
463 =head4 validate_config
464
465 Verifies that config values make sense, and reorganizes them into final data
466 structure (checks hashes in $self->{'checks'} arrayref).
467
468 =head4 read_config
469
470 Just like name suggests - reads given config to memory. Very simple parser
471 based on regular expressions.
472
473 =head2 LICENSE
474
475 Copyright (c) 2010, OmniTI, Inc.
476
477 Permission to use, copy, modify, and distribute this software and its
478 documentation for any purpose, without fee, and without a written agreement
479 is hereby granted, provided that the above copyright notice and this
480 paragraph and the following two paragraphs appear in all copies.
481
482 IN NO EVENT SHALL OmniTI, Inc. BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
483 SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
484 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
485 OmniTI, Inc. HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
486
487 OmniTI, Inc. SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
488 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
489 PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
490 AND OmniTI, Inc. HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
491 UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
492
493 =head2 COPYRIGHT
494
495 The OmniPITR project is Copyright (c) 2010 OmniTI. All rights reserved.
496
Note: See TracBrowser for help on using the browser.