root/trunk/tools/system_monitoring.pl

Revision 199, 10.1 kB (checked in by depesz, 3 years ago)

comparing the same variable is stupid. I should compare with previous stamp to know if I should swap files

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl -w
2
package main;

# Script entry point: build the monitor object and hand control over to it.
# Subroutines of package Monitoring are compiled before this runs, and the
# explicit exit keeps execution from falling through into the code below.
Monitoring->new()->run();

exit;
8
9 package Monitoring;
10 use strict;
11 use English qw( -no_match_vars );
12 use Time::HiRes qw( time );
13 use POSIX qw( strftime );
14 use File::Spec;
15 use File::Path qw( mkpath );
16 use Data::Dumper;
17 use IO::Select;
18 use IO::Handle;
19
# Constructor. Returns an empty hash-based object blessed into the given
# class; configuration and runtime state are added to it later by run().
sub new {
    my ( $class ) = @_;

    my $self = {};
    bless $self, $class;
    return $self;
}
24
# Top-level driver of the monitor:
#   1. reads and validates the configuration,
#   2. creates the IO::Select object used to watch outputs of checks,
#   3. starts all persistent checks,
#   4. enters main_loop().
sub run {
    my ( $self ) = @_;

    # Configuration has to be loaded and verified before anything is started.
    for my $step ( qw( read_config validate_config ) ) {
        $self->$step();
    }

    my $watched_handles = IO::Select->new();
    $self->{ 'select' } = $watched_handles;
    $self->start_persistent_processes();

    $self->main_loop();
    return;
}
37
# Endless event loop of the monitor. Every iteration:
#   - stores the current (high resolution) time in $self->{'current_time'},
#   - lets update_logger_filehandles() switch log files if needed,
#   - waits for output from running checks, for at most the time returned
#     by calculate_timeout(),
#   - passes every readable handle to handle_read(),
#   - starts periodic checks that are due.
sub main_loop {
    my ( $self ) = @_;

    for ( ;; ) {
        $self->{ 'current_time' } = time();
        $self->update_logger_filehandles();

        my $wait_for = $self->calculate_timeout();
        for my $ready_fh ( $self->{ 'select' }->can_read( $wait_for ) ) {
            $self->handle_read( $ready_fh );
        }

        $self->start_periodic_processes();
    }
}
52
# Processes output of a check whose handle was reported readable by select.
#
# Arguments:
#   $fh - input filehandle (stored as {'input'} in one of the checks)
#
# Exactly one sysread() is done per call. select() guarantees only that a
# single read will not block; asking for more could stall the whole monitor
# when the previous read returned exactly 8192 bytes and nothing else is
# waiting. If more data is available, select() reports the handle again.
#
# Text after the last newline is kept in $C->{'pending'} and prepended to the
# next chunk, so a line split across two reads is logged as one line.
#
# On end of file (or unrecoverable read error) the pending text is flushed,
# the handle is removed from the select set and closed, and - for checks
# that have both 'interval' and 'next_call' - next call is rescheduled to be
# "interval" seconds from now if the check finished before its next call time.
#
# Dies if $fh does not belong to any configured check.
sub handle_read {
    my $self = shift;
    my $fh   = shift;

    my ( $check_name, $C );
    for my $name ( keys %{ $self->{ 'checks' } } ) {
        my $candidate = $self->{ 'checks' }->{ $name };
        next unless $candidate->{ 'input' };
        my $candidate_fh = $candidate->{ 'input' };
        next if "$candidate_fh" ne "$fh";    # Stringified reference to io handle
        ( $check_name, $C ) = ( $name, $candidate );
        last;
    }
    die "Data from unknown input?! It shouldn't *ever* happen\n" unless $C;

    my $buffer     = '';
    my $read_bytes = sysread( $fh, $buffer, 8192 );
    if ( !defined $read_bytes ) {

        # Interrupted or spurious wakeup - not an error, just try again later.
        return if $!{ 'EINTR' } || $!{ 'EAGAIN' };

        # Real error - nothing more can be read, so treat it as end of file.
        warn "Error reading output of check $check_name: $OS_ERROR\n";
        $read_bytes = 0;
        $buffer     = '';
    }
    my $at_eof = 0 == $read_bytes;

    # Prepend whatever was left over (incomplete line) from previous read.
    my $data = delete $C->{ 'pending' };
    $data = '' unless defined $data;
    $data .= $buffer;

    # Limit -1 keeps trailing empty fields, so the last element is always
    # the text after the final newline ('' when data ended with newline).
    my @lines = split( /\r?\n/, $data, -1 );
    my $tail = pop @lines;
    if ( ( defined $tail ) && ( '' ne $tail ) ) {
        if ( $at_eof ) {
            push @lines, $tail;    # no more data will come, log what we have
        }
        else {
            $C->{ 'pending' } = $tail;
        }
    }
    else {

        # Blank lines at the very end of a chunk were never logged - keep it so.
        pop @lines while ( @lines ) && ( '' eq $lines[ -1 ] );
    }

    if ( @lines ) {
        my $line_prefix = strftime( '%Y-%m-%d %H:%M:%S %Z | ', localtime( $self->{ 'current_time' } ) );
        for my $line ( @lines ) {
            print { $C->{ 'fh' } } "$line_prefix$line\n";
        }
    }

    return unless $at_eof;

    $self->{ 'select' }->remove( $fh );
    close $fh;
    delete $C->{ 'input' };

    # Persistent checks have no schedule, so there is nothing to adjust for them.
    if ( ( defined $C->{ 'interval' } ) && ( defined $C->{ 'next_call' } ) && ( $self->{ 'current_time' } < $C->{ 'next_call' } ) ) {
        $C->{ 'next_call' } = $self->{ 'current_time' } + $C->{ 'interval' };
    }
    return;
}
89
# Starts every periodic check that is due.
#
# A check is started when it is not of 'persistent' type, it is not running
# now (has no 'input' handle), and its 'next_call' time is either not set
# yet, or not later than current time.
#
# Output of the started command is read through a pipe, which is added to
# the select set and stored as 'input' of the check. 'next_call' is set to
# current time plus 'interval' of the check.
#
# Dies if the command cannot be started.
sub start_periodic_processes {
    my ( $self ) = @_;
    my $now = $self->{ 'current_time' };

  CHECK:
    for my $C ( values %{ $self->{ 'checks' } } ) {
        next CHECK if 'persistent' eq $C->{ 'type' };
        next CHECK if defined $C->{ 'input' };
        if ( defined $C->{ 'next_call' } ) {
            next CHECK if $C->{ 'next_call' } > $now;
        }

        my $command = $C->{ 'exec' };
        open( my $output, '-|', $command ) or die "Cannot run [" . $command . "]: $OS_ERROR\n";

        $self->{ 'select' }->add( $output );
        $C->{ 'input' }     = $output;
        $C->{ 'next_call' } = $now + $C->{ 'interval' };
    }
    return;
}
104
# Returns number of seconds that select() is allowed to wait for data.
#
# Only checks that are not 'persistent' and are not running now (no 'input')
# are taken into account:
#   - if any of them has never been scheduled (no 'next_call') - returns 0,
#   - otherwise returns time left to the earliest 'next_call', but never
#     less than 0.5 second, to avoid too aggressive calls.
# When there are no such checks at all, 0.5 is returned.
sub calculate_timeout {
    my ( $self ) = @_;
    my $now = $self->{ 'current_time' };

    my @waiting = grep { ( 'persistent' ne $_->{ 'type' } ) && ( !defined $_->{ 'input' } ) } values %{ $self->{ 'checks' } };

    # Check that was never run has to be started immediately.
    for my $C ( @waiting ) {
        return 0 unless defined $C->{ 'next_call' };
    }

    # Earliest scheduled call is the first element after ascending sort.
    my ( $nearest ) = sort { $a <=> $b } map { $_->{ 'next_call' } } @waiting;
    $nearest = $now unless defined $nearest;

    my $sleep_time = $nearest - $now;
    return 0.5 if $sleep_time < 0.5;
    return $sleep_time;
}
124
# Starts all checks of 'persistent' type.
#
# Output of each command is read through a pipe, which is added to the
# select set and stored as 'input' of the check. Checks of other types are
# not touched.
#
# Dies if the command cannot be started.
sub start_persistent_processes {
    my ( $self ) = @_;

    my @persistent = grep { 'persistent' eq $_->{ 'type' } } values %{ $self->{ 'checks' } };
    for my $C ( @persistent ) {
        my $command = $C->{ 'exec' };
        open( my $output, '-|', $command ) or die "Cannot run [" . $command . "]: $OS_ERROR\n";

        $self->{ 'select' }->add( $output );
        $C->{ 'input' } = $output;
    }
    return;
}
136
# Makes sure that every check writes to log file matching current hour.
#
# Log file suffix (-YYYY-MM-DD-HH.log) is built from current time and
# compared with the one used previously ($self->{'previous-suffix'}). When
# it did not change, nothing is done. Otherwise, for every check, the old log
# filehandle (if any) is closed, and new file:
#     <logdir>/YYYY/MM/DD/<check><suffix>
# is opened for appending, with autoflush on, and stored as 'fh' of the check.
# The directory is created when it does not exist.
#
# Dies if log file cannot be opened.
sub update_logger_filehandles {
    my ( $self ) = @_;

    my @now = localtime( $self->{ 'current_time' } );

    my $file_suffix = strftime( '-%Y-%m-%d-%H.log', @now );
    if ( defined $self->{ 'previous-suffix' } ) {
        return if $file_suffix eq $self->{ 'previous-suffix' };
    }
    $self->{ 'previous-suffix' } = $file_suffix;

    my $full_directory = File::Spec->catfile( $self->{ 'logdir' }, strftime( '%Y/%m/%d', @now ) );
    if ( !-e $full_directory ) {
        mkpath( [ $full_directory ], 0, oct( "750" ) );
    }

    for my $check ( keys %{ $self->{ 'checks' } } ) {
        my $C = $self->{ 'checks' }->{ $check };

        # Get rid of log handle from previous hour.
        if ( my $old_fh = $C->{ 'fh' } ) {
            delete $C->{ 'fh' };
            close $old_fh;
        }

        my $full_name = File::Spec->catfile( $full_directory, $check . $file_suffix );
        open( my $log_fh, '>>', $full_name ) or die "Cannot write to $full_name: $OS_ERROR\n";
        $log_fh->autoflush( 1 );
        $C->{ 'fh' } = $log_fh;
    }

    return;
}
164
# Checks that configuration loaded by read_config() is complete and sane.
#
# Dies (with message ending with newline) when:
#   - GLOBAL.logdir was not given,
#   - there are no checks defined,
#   - a check has no type, or its type is not "persistent" or "periodic",
#   - a check has no exec (command to run),
#   - a periodic check has no interval, or its interval is not a positive
#     integer.
# Checks are verified in alphabetical order, so the first reported problem
# is deterministic.
sub validate_config {
    my $self = shift;

    die "GLOBAL.logdir was not provided in config!\n" unless defined $self->{ 'logdir' };
    die "There are no checks to be run!\n"            unless defined $self->{ 'checks' };

    for my $check ( sort keys %{ $self->{ 'checks' } } ) {
        my $C = $self->{ 'checks' }->{ $check };

        # Without this, a missing type caused "uninitialized value" warnings
        # and a confusing "Bad type  in check" message.
        die "Undefined type for check $check!\n" unless defined $C->{ 'type' };
        die "Bad type " . $C->{ 'type' } . " in check $check!\n" unless $C->{ 'type' } =~ m{\A(?:persistent|periodic)\z};

        # Every check needs a command - report it now, not when starting it.
        die "Undefined exec for check $check!\n" unless defined $C->{ 'exec' };

        next unless $C->{ 'type' } eq 'periodic';

        die "Undefined interval for check $check!\n" unless defined $C->{ 'interval' };
        die "Bad interval (" . $C->{ 'interval' } . ") in check $check!\n" unless $C->{ 'interval' } =~ m{\A[1-9]\d*\z};
    }
    return;
}
181
# Loads configuration from the file named by first command line argument.
#
# The argument is removed from @ARGV. Comments (lines starting with #) and
# empty lines are skipped, leading and trailing whitespace is removed.
# Recognized settings (names are matched case-insensitively):
#   GLOBAL.logdir = value             - stored in $self->{'logdir'}
#   check.NAME.type|exec|interval = value
#                                     - stored in $self->{'checks'}->{NAME}
# Parameter name (type/exec/interval) is stored lowercased, because the rest
# of the program looks only for lowercase keys; without it a line like
# "check.ps.TYPE=periodic" was accepted, but then silently ignored. Name of
# the check is kept exactly as written.
#
# Dies if no file name was given, the file cannot be opened, or it contains
# a line that is not recognized.
sub read_config {
    my $self = shift;

    die "You have to provide name of config file! Check: perldoc $PROGRAM_NAME\n" if 0 == scalar @ARGV;
    my $config_file_name = shift @ARGV;

    open my $fh, '<', $config_file_name or die "Cannot open config file ($config_file_name) : $OS_ERROR\n";
    while ( my $line = <$fh> ) {
        next if $line =~ m{^\s*#};     # comment
        next if $line =~ m{^\s*\z};    # empty line
        $line =~ s{\A\s*}{};           # removing leading spaces
        $line =~ s{\s*\z}{};           # removing trailing spaces
        if ( $line =~ m{ \A GLOBAL\.logdir \s* = \s* (\S.*) \z }xmsi ) {
            $self->{ 'logdir' } = $1;
            next;
        }
        elsif ( $line =~ m{ \A check\.([A-Za-z0-9_]+)\.(type|exec|interval) \s* = \s* (\S.*) \z }xmsi ) {
            my ( $check_name, $parameter, $value ) = ( $1, lc $2, $3 );
            $self->{ 'checks' }->{ $check_name }->{ $parameter } = $value;
            next;
        }
        die "Unknown line: [ $line ]\n";
    }
    close $fh;
    return;
}
207
208 1;
209
210 =head1 system_monitoring.pl
211
212 =head2 USAGE
213
214 system_monitoring.pl <config_file>
215
216 =head2 DESCRIPTION
217
218 system_monitoring.pl script is meant to provide a single solution for logging system data which changes more often than is practical for systems like cacti/nagios.
219
220 It is meant to be run on some low-privilege account, and gather the data, which are partitioned automatically by source, and time, and stored in simple text files.
221
222 After starting, system_monitoring.pl will check the config, and if there are no errors - will start processing checks.
223
224 All checks work in parallel, so there is no chance single check could lock whole system_monitoring.pl.
225
226 =head2 Configuration file
227
228 Format of the configuration file is kept as simple as possible, to make this script very portable - which in this particular case means: no external (aside from core perl) dependencies.
229
230 Each line should be one of:
231
232 =over
233
234 =item * Comment (starts with #)
235
236 =item * Empty line (just white space characters)
237
238 =item * Setting
239
240 =back
241
242 Where setting line looks like:
243
244     PARAM=value
245
246 with optional leading, trailing or around "=" whitespace.
247
248 Recognized parameters are:
249
250 =over
251
252 =item * GLOBAL.logdir - full path to log directory
253
254 =item * check.XXX.type - type of check with name XXX
255
256 =item * check.XXX.exec - what should be executed to get data for check XXX
257
258 =item * check.XXX.interval - how often to run check XXX
259
260 =back
261
262 There are only two supported types:
263
264 =over
265
266 =item * persistent - which means given program is to be run in background, and whatever it will return should be logged. Such program "interval" will be ignored.
267
268 =item * periodic - which means that given program is to be run periodically as it will exit after returning data
269
270 =back
271
272 "exec" parameter is simply command line, to be run via shell, that will run the program.
273
274 interval is time (in seconds) how often given program (of periodic type) should be run.
275
276 "XXX" (name of check) can consist only of upper and lower case letters, digits, and character _. That is it has to match regular expression:
277
278     /\A[A-Za-z0-9_]+\z/
279
280 Output from all programs will be logged in files named:
281
282     /logdir/YYYY/MM/DD/XXX-YYYY-MM-DD-HH.log
283
284 where YYYY, MM, DD and HH are date and time parts of current (as of logging moment) time.
285
286 HH is 0 padded 24-hour style hour.
287
288 Example configuration:
289
290     # Global configuration, log directory
291     GLOBAL.logdir=/var/tmp/monitoring
292
293     # Logging iostat output in 10 second intervals
294     check.iostat.type=persistent
295     check.iostat.exec=iostat -kx 10
296
297     # Logging "ps auxwwn" every 30 seconds.
298     check.ps.type=periodic
299     check.ps.exec=ps auxwwn
300     check.ps.interval=30
301
302 =head2 LICENSE
303
304 Copyright (c) 2010, OmniTI, Inc.
305
306 Permission to use, copy, modify, and distribute this software and its
307 documentation for any purpose, without fee, and without a written agreement
308 is hereby granted, provided that the above copyright notice and this
309 paragraph and the following two paragraphs appear in all copies.
310
311 IN NO EVENT SHALL OmniTI, Inc. BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
312 SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
313 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
314 OmniTI, Inc. HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
315
316 OmniTI, Inc. SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
317 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
318 PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
319 AND OmniTI, Inc. HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
320 UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
321
322 =head2 COPYRIGHT
323
324 The OmniPITR project is Copyright (c) 2010 OmniTI. All rights reserved.
325
Note: See TracBrowser for help on using the browser.