root/trunk/tools/system_monitoring.pl

Revision 266, 14.5 kB (checked in by depesz, 3 years ago)

fix mentions of omnipitr

  • Property svn:executable set to *
Line 
#!/usr/bin/perl -w

# Script entry point: build the Monitoring object (defined later in this
# file) and hand control over to it. run() enters an endless loop, so the
# exit below is only reached if run() ever returns.
package main;
use strict;
use warnings;

my $program = Monitoring->new();
$program->run();

exit;
8
9 package Monitoring;
10 use strict;
11 use English qw( -no_match_vars );
12 use Time::HiRes qw( time sleep );
13 use POSIX qw( strftime );
14 use File::Spec;
15 use File::Path qw( mkpath );
16 use IO::Select;
17 use IO::Handle;
18
# Constructor. Returns an empty hash blessed into the requested class; all
# state is filled in later by the other methods.
sub new {
    my ( $class ) = @_;
    my $self = {};
    return bless $self, $class;
}
23
# Program driver: loads and validates the configuration, creates the
# IO::Select object used to wait on check output, starts the persistent
# checks and finally enters main_loop().
sub run {
    my ( $self ) = @_;

    # Configuration has to be read and verified before anything is started.
    for my $step ( qw( read_config validate_config ) ) {
        $self->$step();
    }

    my $selector = IO::Select->new();
    $self->{ 'select' } = $selector;

    $self->start_persistent_processes();
    $self->main_loop();

    return;
}
36
# Endless processing loop. Each iteration records the current time,
# refreshes the log filehandles, waits (with a timeout) for readable check
# handles, processes each readable handle and then starts any periodic
# checks that are due.
sub main_loop {
    my ( $self ) = @_;

    for ( ; ; ) {
        $self->{ 'current_time' } = time();
        $self->update_logger_filehandles();

        my $wait_for = $self->calculate_timeout();
        foreach my $readable ( $self->{ 'select' }->can_read( $wait_for ) ) {
            $self->handle_read( $readable );
        }

        $self->start_periodic_processes();
    }
}
51
# Handles one readable filehandle reported by IO::Select.
#
# Arguments:
#   $fh - the input filehandle that has data (or EOF) pending.
#
# Finds the check that owns $fh, reads one chunk of data from it, appends it
# to the check's buffer and logs all complete lines. On end of input (or a
# read error) the handle is removed from the select set and closed, any
# trailing unterminated line is logged, and for periodic checks the next run
# is rescheduled.
#
# Dies if $fh does not belong to any known check.
sub handle_read {
    my $self = shift;
    my $fh   = shift;

    # IO::Select hands back only the handle, so locate its check by comparing
    # the stringified references.
    my $C;
    for my $tmp ( $self->checks ) {
        next unless $tmp->{ 'input' };
        my $tmp_fh = $tmp->{ 'input' };
        next if "$tmp_fh" ne "$fh";
        $C = $tmp;
        last;
    }
    die "Data from unknown input?! It shouldn't *ever* happen\n" unless $C;

    # Exactly one sysread per readiness notification. Reading again after a
    # full 8192 byte chunk could block the whole monitor when no more data
    # is pending; any remaining data is picked up by the next select round.
    my $read_data  = '';
    my $read_bytes = sysread( $fh, $read_data, 8192 );

    # An interrupted call is not end of input - simply retry on next round.
    return if ( !defined $read_bytes ) && $!{ 'EINTR' };

    # undef (read error) and 0 (EOF) both mean: nothing more to read.
    $read_data = '' unless $read_bytes;

    $C->{ 'buffer' } .= $read_data unless $C->{ 'ignore' };

    if ( '' eq $read_data ) {
        $self->{ 'select' }->remove( $fh );
        close $fh;
        delete $C->{ 'input' };

        # Log whatever is left in the buffer, terminating a final partial
        # line so that it is not lost - for both check types.
        unless ( $C->{ 'ignore' } ) {
            $C->{ 'buffer' } .= "\n" if ( defined $C->{ 'buffer' } ) && ( $C->{ 'buffer' } =~ /[^\n]\z/ );
            $self->print_log( $C );
        }

        return unless 'periodic' eq $C->{ 'type' };

        # If the next run is still in the future, count the interval from
        # the moment this run finished.
        $C->{ 'next_call' } = $self->{ 'current_time' } + $C->{ 'interval' } if $self->{ 'current_time' } < $C->{ 'next_call' };
        return;
    }

    if ( $C->{ 'ignore' } ) {
        delete $C->{ 'buffer' };
        return;
    }

    $self->print_log( $C );
    return;
}
90
# Writes every complete (newline terminated) line from the check's buffer to
# the check's log filehandle, each prefixed with a timestamp built from
# $self->{'current_time'}. Logged lines are removed from the buffer; an
# unterminated tail stays there. The log handle is flushed afterwards.
#
# Arguments:
#   $C - check hashref; uses its 'buffer' and 'fh' entries.
sub print_log {
    my ( $self, $C ) = @_;

    my $stamp = strftime( '%Y-%m-%d %H:%M:%S %Z | ', localtime( $self->{ 'current_time' } ) );
    my $out   = $C->{ 'fh' };

    # Peel complete lines off the front of the buffer, one at a time.
    while ( $C->{ 'buffer' } =~ s{\A([^\n]*\n)}{} ) {
        print { $out } $stamp . $1;
    }

    $out->flush();
    return;
}
103
# Opens the input for one check and registers it for reading.
#
# Arguments:
#   $C - check hashref; its 'exec' entry is the command to run, or - when it
#        starts with '<' - the name of a file to read.
#
# The opened handle is stored in $C->{'input'} and added to the select set.
# Dies if the command or file cannot be opened.
sub run_check {
    my ( $self, $C ) = @_;

    # Work on a copy, so that stripping the '<' marker leaves $C->{'exec'}
    # untouched.
    my $command = $C->{ 'exec' };
    my $mode    = ( $command =~ s/\A\s*<\s*// ) ? '<' : '-|';

    open my $fh, $mode, $command or die "Cannot open [$command] in mode [$mode]: $OS_ERROR\n";

    $C->{ 'input' } = $fh;
    $self->{ 'select' }->add( $fh );

    return;
}
118
# Starts every periodic check that is not currently running and whose
# scheduled time has arrived (or which has never been scheduled), then
# schedules its next run one interval after the current loop time.
sub start_periodic_processes {
    my ( $self ) = @_;

    my $now = $self->{ 'current_time' };

    CHECK:
    for my $C ( $self->checks ) {
        next CHECK if 'periodic' ne $C->{ 'type' };
        next CHECK if defined $C->{ 'input' };    # still running

        my $due = $C->{ 'next_call' };
        next CHECK if ( defined $due ) && ( $due > $now );

        $self->run_check( $C );
        $C->{ 'next_call' } = $now + $C->{ 'interval' };
    }

    return;
}
130
# Starts all checks of type 'persistent', in the order returned by checks().
sub start_persistent_processes {
    my ( $self ) = @_;

    my @persistent = grep { 'persistent' eq $_->{ 'type' } } $self->checks;
    for my $C ( @persistent ) {
        $self->run_check( $C );
    }

    return;
}
139
# Computes how long main_loop() may wait for input.
#
# Only periodic checks that are not running at the moment are considered.
# Returns 0 as soon as one of them has never been scheduled; otherwise
# returns the time left until the earliest 'next_call', but never less than
# 0.5 (which is also the result when no check was considered at all).
sub calculate_timeout {
    my ( $self ) = @_;

    my $now = $self->{ 'current_time' };
    my $earliest;

    for my $C ( $self->checks ) {
        next if ( 'persistent' eq $C->{ 'type' } ) || ( defined $C->{ 'input' } );

        my $due = $C->{ 'next_call' };
        return 0 unless defined $due;

        $earliest = $due if ( !defined $earliest ) || ( $due < $earliest );
    }

    my $sleep_time = defined $earliest ? $earliest - $now : 0;

    # Never wait less than 0.5s, to avoid too aggressive looping.
    return 0.5 if $sleep_time < 0.5;
    return $sleep_time;
}
162
# Rotates the log files when the hour-based file suffix changes.
#
# Does nothing while the suffix derived from $self->{'current_time'} equals
# the one remembered from the previous call. Otherwise it makes sure the
# logdir/YYYY/MM/DD directory exists and, for every check that is not
# ignored, closes the old log handle (if any) and opens the new log file for
# appending. Dies if a log file cannot be opened.
sub update_logger_filehandles {
    my ( $self ) = @_;

    my @now         = localtime( $self->{ 'current_time' } );
    my $file_suffix = strftime( '-%Y-%m-%d-%H.log', @now );

    my $previous = $self->{ 'previous-suffix' };
    return if ( defined $previous ) && ( $previous eq $file_suffix );
    $self->{ 'previous-suffix' } = $file_suffix;

    my $full_directory = File::Spec->catfile( $self->{ 'logdir' }, strftime( '%Y/%m/%d', @now ) );
    mkpath( [ $full_directory ], 0, oct( "750" ) ) unless -e $full_directory;

    for my $C ( grep { !$_->{ 'ignore' } } $self->checks ) {

        # Drop the handle of the previous log file, if there was one.
        if ( my $old_fh = delete $C->{ 'fh' } ) {
            close $old_fh;
        }

        my $full_name = File::Spec->catfile( $full_directory, $C->{ 'name' } . $file_suffix );
        open my $fh, '>>', $full_name or die "Cannot write to $full_name: $OS_ERROR\n";
        $C->{ 'fh' } = $fh;
    }

    return;
}
190
# Returns the list of check hashrefs stored in $self->{'checks'}.
sub checks {
    my ( $self ) = @_;
    my $list = $self->{ 'checks' };
    return @{ $list };
}
195
# Verifies the configuration loaded by read_config() and converts it into
# the final structure: $self->{'checks'} becomes an arrayref of check
# hashrefs (each with its 'name' filled in, ordered by name), and
# $self->{'pre_checks'} is removed.
#
# Side effects: makes $self->{'logdir'} absolute and changes the current
# working directory to it.
#
# Dies when logdir or the checks are missing, when logdir cannot be entered,
# or when a check lacks a valid type, an exec, or (for periodic checks) a
# positive integer interval.
sub validate_config {
    my $self = shift;

    die "GLOBAL.logdir was not provided in config!\n" unless defined $self->{ 'logdir' };
    die "There are no checks to be run!\n"            unless defined $self->{ 'pre_checks' };

    # Resolve the path before chdir - a relative logdir would otherwise point
    # somewhere else once the working directory has changed.
    $self->{ 'logdir' } = File::Spec->rel2abs( $self->{ 'logdir' } );

    die "Cannot chdir to " . $self->{ 'logdir' } . ": $OS_ERROR\n" unless chdir $self->{ 'logdir' };

    my @checks = ();

    # Sorted, so that checks are always processed in the same order.
    for my $check ( sort keys %{ $self->{ 'pre_checks' } } ) {
        my $C = $self->{ 'pre_checks' }->{ $check };
        $C->{ 'name' } = $check;
        push @checks, $C;

        die "Undefined type for check $check!\n" unless defined $C->{ 'type' };
        die "Bad type " . $C->{ 'type' } . " in check $check!\n" unless $C->{ 'type' } =~ m{\A(?:persistent|periodic)\z};

        # Without exec there is nothing to run - fail here, not in run_check.
        die "Undefined exec for check $check!\n" unless defined $C->{ 'exec' };

        next unless $C->{ 'type' } eq 'periodic';

        die "Undefined interval for check $check!\n" unless defined $C->{ 'interval' };
        die "Bad interval (" . $C->{ 'interval' } . ") in check $check!\n" unless $C->{ 'interval' } =~ m{\A[1-9]\d*\z};
    }

    $self->{ 'checks' } = \@checks;
    delete $self->{ 'pre_checks' };

    return;
}
221
# Reads the configuration file named by the first command line argument
# (which is removed from @ARGV).
#
# Comment lines (starting with #) and blank lines are skipped. Recognized
# settings are stored as:
#   GLOBAL.logdir = value          -> $self->{'logdir'}
#   check.NAME.PARAM = value       -> $self->{'pre_checks'}->{NAME}->{PARAM}
# where PARAM is one of type, exec, interval, ignore. Keywords are matched
# case-insensitively, and PARAM is stored in lower case so that the rest of
# the program finds it regardless of how it was typed.
#
# Dies when no file name was given, when the file cannot be opened, or on
# any line that is not recognized.
sub read_config {
    my $self = shift;

    die "You have to provide name of config file! Check: perldoc $PROGRAM_NAME\n" if 0 == scalar @ARGV;
    my $config_file_name = shift @ARGV;

    open my $fh, '<', $config_file_name or die "Cannot open config file ($config_file_name) : $OS_ERROR\n";
    while ( my $line = <$fh> ) {
        next if $line =~ m{^\s*#};     # comment
        next if $line =~ m{^\s*\z};    # empty line
        $line =~ s{\A\s*}{};           # removing leading spaces
        $line =~ s{\s*\z}{};           # removing trailing spaces
        if ( $line =~ m{ \A GLOBAL\.logdir \s* = \s* (\S.*) \z }xmsi ) {
            $self->{ 'logdir' } = $1;
            next;
        }
        elsif ( $line =~ m{ \A check\.([A-Za-z0-9_]+)\.(type|exec|interval|ignore) \s* = \s* (\S.*) \z }xmsi ) {
            my ( $name, $param, $value ) = ( $1, lc( $2 ), $3 );

            # The match is case-insensitive, but all other code looks up
            # lower case keys - normalize, so "TYPE" is not silently lost.
            $self->{ 'pre_checks' }->{ $name }->{ $param } = $value;
            next;
        }
        die "Unknown line: [ $line ]\n";
    }
    close $fh;
    return;
}
247
248 1;
249
250 =head1 system_monitoring.pl
251
252 =head2 USAGE
253
254 system_monitoring.pl <config_file>
255
256 =head2 DESCRIPTION
257
258 system_monitoring.pl script is meant to provide a single, simple solution for
259 logging system data which change more often than it's practical for systems
260 like cacti/nagios.
261
262 It is meant to be run on some low-privilege account, and gather the data,
263 which are partitioned automatically by source, and time, and stored in
264 simple text files.
265
266 After running, system_monitoring.pl will check config, and if there are no
267 errors - will start processing checks.
268
269 All checks work in parallel, so there is no chance single check could lock
270 whole system_monitoring.pl.
271
272 =head2 Configuration file
273
274 Format of the configuration file is kept as simple as possible, to make this
275 script very portable - which in this particular case means: no external
276 (aside from core perl) dependencies.
277
278 Each line should be one of:
279
280 =over
281
282 =item * Comment (starts with #)
283
284 =item * Empty line (just white space characters)
285
286 =item * Setting
287
288 =back
289
290 Where setting line looks like:
291
292     PARAM=value
293
294 with optional leading, trailing or around "=" whitespace.
295
296 Recognized parameters are:
297
298 =over
299
300 =item * GLOBAL.logdir - full path to log directory
301
302 =item * check.XXX.type - type of check with name XXX
303
304 =item * check.XXX.exec - what should be executed to get data for check XXX
305
306 =item * check.XXX.interval - how often to run check XXX
307
308 =item * check.XXX.ignore - should output be ignored?
309
310 =back
311
312 There are only two supported types:
313
314 =over
315
316 =item * persistent - which means given program is to be run in background,
317 and whatever it will return should be logged. Such program "interval" will
318 be ignored.
319
320 =item * periodic - which means that given program is to be run periodically
321 as it will exit after returning data
322
323 =back
324
325 "exec" parameter is simply command line, to be run via shell, that will run
326 the program.
327
328 If exec parameter starts with '<' character (with optional whitespace
329 characters after), it is treated as filename to be read, and logged.
330
331 Due to the way it is internally processed - using "<" approach makes sense
332 only for periodic checks - in case of persistent checks it would simply copy
333 the file at start of system_monitoring.pl, and ignore any changes to it
334 afterwards. If you'd like to have something like 'tail -f' - use tail -f.
335
336 interval is time (in seconds) how often given program (of periodic type)
337 should be run.
338
339 ignore is optional parameter which is checked using Perl boolean logic (any
340 value other than empty string or 0 is treated as true). Since
341 system_monitoring doesn't let setting empty string as value for option -
342 it's best to not include ignore option for checks you want to log, and just
343 add '...ignore=1' for those that you want to ignore.
344
345 If ignore is set, system_monitoring will not log output from such check.
346
347 This is helpful to build-in compression of older logs, using for example:
348
349     check.cleanup.type=periodic
350     check.cleanup.interval=300
351     check.cleanup.exec=find /var/log/monitoring -type f -name '*.log' -mmin +120 -print0 | xargs -0 gzip
352     check.cleanup.ignore=1
353
354 "XXX" (name of check) can consist only of upper and lower case letters,
355 digits, and character _. That is it has to match regular expression:
356
357     /\A[A-Za-z0-9_]+\z/
358
359 Output from all programs will be logged in files named:
360
361     /logdir/YYYY/MM/DD/XXX-YYYY-MM-DD-HH.log
362
363 where YYYY, MM, DD and HH are date and time parts of current (as of logging
364 moment) time.
365
366 HH is 0 padded 24-hour style hour.
367
368 Example configuration:
369
370     # Global configuration, log directory
371     GLOBAL.logdir=/var/tmp/monitoring
372
373     # Logging iostat output in 10 second intervals
374     check.iostat.type=persistent
375     check.iostat.exec=iostat -kx 10
376
377     # Logging "ps auxwwn" every 30 seconds.
378     check.ps.type=periodic
379     check.ps.exec=ps auxwwn
380     check.ps.interval=30
381
382 =head2 INTERNALS
383
384 Program itself is very short:
385
386     my $program = Monitoring->new();
387     $program->run();
388
389 This creates $program as object of Monitoring class (defined in the same
390 file), and calls method run() on it.
391
392 =head3 METHODS
393
394 =head4 new
395
396 Just object constructor. Nothing to see there.
397
398 =head4 run
399
400 Initialization of stuff, and call to main_loop. Reads and validates config
401 (by calls to appropriate methods), initializes IO::Select object for
402 asynchronous I/O, starts persistent checks (again, using special method), and
403 enters main_loop();
404
405 =head4 main_loop
406
407 The core of the program. Infinite loop, which - upon every iteration:
408
409 =over
410
411 =item * updates logging filehandles
412
413 =item * checks if there is anything to read in input filehandles (from
414 checks)
415
416 =item * reads whatever is to be read from checks
417
418 =item * runs new periodic checks if the time has come to do it
419
420 =back
421
422 Checking for data in input filehandles is done with timeout, which is
423 calculated to finish when next check will have to be run, so the program
424 uses virtually no CPU unless there are some data to be worked on.
425
426 =head4 handle_read
427
428 Since all we get from IO::Select is filehandle to read from, this method has
429 first to find which check given filehandle belongs to.
430
431 Afterwards, it reads whatever is available in the filehandle. In case there
432 is error on the filehandle - it closes the filehandle - as it means that
433 output for given check ended.
434
435 Every line from check is prefixed with timestamp and logged to appropriate
436 logfile.
437
438 Additionally, when closing the filehandle (on error), it sets when given
439 check should be run next time.
440
441 =head4 run_check
442
443 Simple helper function which runs external program (or opens filehandle for
444 reading from file), and puts it into check data.
445
446 =head4 start_periodic_processes
447
448 Iterates over all periodic processes, checks which should be already run,
449 and runs them.
450
451 =head4 start_persistent_processes
452
453 Iterates over all persistent processes and runs them. This is done only
454 once, from run() method.
455
456 =head4 calculate_timeout
457
458 Helper function which calculates how long should main_loop() wait for data
459 from IO::Select before it has to run another round of
460 start_periodic_processes().
461
462 =head4 update_logger_filehandles
463
464 Checks if current timestamp has changed enough to require swapping files,
465 and if yes - closes old ones and opens new ones - making all necessary
466 directories to make it happen.
467
468 =head4 checks
469
470 Wrapper to be able to write:
471
472     for my $C ( $self->checks ) {
473
474 instead of:
475
476     for my $C ( @{ $self->{ 'checks'} } ) {
477
478 =head4 validate_config
479
480 Verifies that config values make sense, and reorganizes them into final data
481 structure (checks hashes in $self->{'checks'} arrayref).
482
483 =head4 read_config
484
485 Just like name suggests - reads given config to memory. Very simple parser
486 based on regular expressions.
487
488 =head2 LICENSE
489
490 Copyright (c) 2010,2011, OmniTI, Inc.
491
492 Permission to use, copy, modify, and distribute this software and its
493 documentation for any purpose, without fee, and without a written agreement
494 is hereby granted, provided that the above copyright notice and this
495 paragraph and the following two paragraphs appear in all copies.
496
497 IN NO EVENT SHALL OmniTI, Inc. BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
498 SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
499 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
500 OmniTI, Inc. HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
501
502 OmniTI, Inc. SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
503 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
504 PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
505 AND OmniTI, Inc. HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
506 UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
507
508 =head2 COPYRIGHT
509
510 The system_monitoring project is Copyright (c) 2010,2011 OmniTI. All rights reserved.
511
Note: See TracBrowser for help on using the browser.