root/trunk/tools/gzip_with_file_size_limit.pl

Revision 81, 3.6 kB (checked in by depesz, 4 years ago)

Script that gzips its input into a set of files, keeping each file under a given size limit while leaving every file separately decompressible.

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Carp;
5 use English qw( -no_match_vars );
6 use Data::Dumper;
7 use Getopt::Long;
8 use IO::Handle;
9
# Unbuffered STDOUT so the "\r"-based progress line is shown immediately.
$OUTPUT_AUTOFLUSH = 1;

# File-level state shared by all subs below:
#   $CFG        - hashref of validated command line options (see get_config)
#   $read_lines - number of input lines read from STDIN so far
#   $fh_data    - hashref describing the current output part; the subs below
#                 use the keys 'fh', 'file_name', 'current' and
#                 'current_position'
my $CFG        = get_config();
my $read_lines = 0;
my $fh_data    = {};

while ( my $line = <STDIN> ) {
    $read_lines++;
    write_line_to_gzip( $line );
    print "\rLine $read_lines processed. : " . $fh_data->{ 'current_position' } . "               " if $CFG->{ 'verbose' };
}
print "\n" if $CFG->{ 'verbose' };

# Close the last gzip pipe explicitly instead of relying on implicit cleanup
# at exit: close() on a piped handle waits for gzip and reports its failure.
# With empty input no handle was ever created, hence the guard.
if ( $fh_data->{ 'fh' } ) {
    my $last_file = $fh_data->{ 'file_name' };
    close $fh_data->{ 'fh' }
        or croak( "gzip failed for $last_file : " . ( $OS_ERROR ? $OS_ERROR : 'exit status ' . ( $CHILD_ERROR >> 8 ) ) . "\n" );
}

exit;
24
# Writes one input line to the current gzip pipe and refreshes the recorded
# size of the current output file.
#
# Arguments:
#   $line - the line (including its line terminator) to compress
#
# Side effects:
#   Updates $fh_data->{ 'current_position' } with the on-disk size of the
#   current output file as reported by stat(), or 0 if stat() gives nothing.
#
# Croaks if the line cannot be written or flushed to the gzip pipe.
sub write_line_to_gzip {
    my $line = shift;

    my $fh        = get_fh();
    my $file_name = $fh_data->{ 'file_name' };

    # Unchecked writes would silently lose data if the pipe is broken.
    print { $fh } $line or croak( "Cannot write to $file_name : $OS_ERROR\n" );
    $fh->flush()        or croak( "Cannot flush data for $file_name : $OS_ERROR\n" );

    # Element 7 of stat() is the file size in bytes.
    my $size = ( stat( $file_name ) )[ 7 ];
    $fh_data->{ 'current_position' } = defined $size ? $size : 0;
    return;
}
35
# Starts a gzip process writing to the next output file and records it as the
# current output in $fh_data.
#
# The file name comes from get_output_file_name(). gzip is started without a
# shell: quotemeta() is a regex-escaping function and is not a reliable way
# to quote a file name for the shell (e.g. a name containing a newline).
#
# Returns the write end of the pipe to gzip.
# Croaks if the gzip child process cannot be forked.
sub create_new_output_handle {
    my $file_name = get_output_file_name();

    # Implicit-fork form of a piped open: returns the child's pid in the
    # parent, 0 in the child (whose STDIN is the read end of the pipe), and
    # undef if the fork failed.
    my $pid = open my $fh, '|-';
    croak( "Cannot fork gzip for $file_name : $OS_ERROR\n" ) unless defined $pid;

    if ( 0 == $pid ) {

        # Child: send gzip's standard output to the target file, then
        # replace this process with gzip itself.
        open STDOUT, '>', $file_name or do {
            print STDERR "Cannot write to $file_name : $OS_ERROR\n";
            exit 1;
        };
        exec { 'gzip' } 'gzip', '-c', '-' or do {
            print STDERR "Cannot run gzip : $OS_ERROR\n";
            exit 127;
        };
    }

    print "Created file: $file_name\n" if $CFG->{ 'verbose' };

    $fh_data->{ 'fh' }               = $fh;
    $fh_data->{ 'current_position' } = 0;
    $fh_data->{ 'file_name' }        = $file_name;

    return $fh;
}
48
# Returns the handle that the next line should be written to, switching to a
# new output file when the current one is close to the size limit.
#
# On the first call the part counter is set to 1 and the first output is
# created. On later calls the average compressed size per input line is
# estimated (treating every finished part as exactly 'limit' bytes); if the
# current file's last recorded size plus twice that average would exceed the
# limit, the current gzip pipe is closed and the next part is started.
#
# Croaks if closing the current gzip pipe fails.
sub get_fh {
    unless ( $fh_data->{ 'fh' } ) {
        $fh_data->{ 'current' } = 1;
        return create_new_output_handle();
    }

    my $total_output_size = ( $fh_data->{ 'current' } - 1 ) * $CFG->{ 'limit' } + $fh_data->{ 'current_position' };

    # Guard against division by zero should no line have been counted yet.
    my $average_compressed_record_size = $read_lines ? $total_output_size / $read_lines : 0;

    if ( $fh_data->{ 'current_position' } + 2 * $average_compressed_record_size > $CFG->{ 'limit' } ) {
        print "\n" if $CFG->{ 'verbose' };

        # close() on a piped handle waits for gzip and returns false when it
        # failed, so a broken part is reported instead of silently kept.
        my $finished_file = $fh_data->{ 'file_name' };
        close $fh_data->{ 'fh' }
            or croak( "gzip failed for $finished_file : " . ( $OS_ERROR ? $OS_ERROR : 'exit status ' . ( $CHILD_ERROR >> 8 ) ) . "\n" );

        $fh_data->{ 'current' }++;
        return create_new_output_handle();
    }

    return $fh_data->{ 'fh' };
}
67
# Builds the name of the current output part:
# <filename prefix>.<zero-padded part number>.gz
# The padding width is taken from $CFG->{ 'width' }, the part number from
# $fh_data->{ 'current' }.
sub get_output_file_name {
    my $prefix      = $CFG->{ 'filename' };
    my $part_number = $fh_data->{ 'current' };

    # The padding width is spliced into the format string itself.
    my $format = join q{}, '%s.%0', $CFG->{ 'width' }, 'u.gz';

    return sprintf $format, $prefix, $part_number;
}
71
# Parses and validates command line options.
#
# Recognized options: --limit (integer, megabytes, 5..500), --width (integer,
# 1..10), --filename (string), --verbose, --help / -?.
#
# Returns a hashref of options; 'limit' is converted from megabytes to bytes.
# On invalid or missing options, or when help is requested, calls
# show_help_and_die(), which does not return.
sub get_config {
    my %cfg = ();
    unless ( GetOptions( \%cfg, 'limit=i', 'verbose', 'help|?', 'filename=s', 'width=i' ) ) {
        show_help_and_die();
    }
    show_help_and_die() if $cfg{ 'help' };

    # Check presence explicitly: comparing undef numerically only produces
    # "uninitialized" warnings and a misleading "too small" message, and a
    # missing file name prefix would create files named like ".001.gz".
    show_help_and_die( "Filename prefix (--filename) is required.\n" )
        unless defined $cfg{ 'filename' } && length $cfg{ 'filename' };
    show_help_and_die( "Limit (--limit) is required.\n" ) unless defined $cfg{ 'limit' };
    show_help_and_die( "Width (--width) is required.\n" ) unless defined $cfg{ 'width' };

    show_help_and_die( "Limit is too small, it has to be at least 5MB.\n" ) if 5 > $cfg{ 'limit' };
    show_help_and_die( "Limit is too big, it has to be 500MB at most.\n" )  if 500 < $cfg{ 'limit' };

    show_help_and_die( "Width is too small, it has to be at least 1.\n" ) if 1 > $cfg{ 'width' };
    show_help_and_die( "Width is too big, it has to be 10 at most.\n" )   if 10 < $cfg{ 'width' };

    $cfg{ 'limit' } *= 1024 * 1024;    # Convert megabytes to bytes

    return \%cfg;
}
89
# Prints an optional error message followed by the usage text to STDERR and
# terminates the program with exit status 1.
#
# Arguments (optional):
#   printf-style list: a format string followed by its values. Nothing is
#   printed before the usage text when the list is empty.
sub show_help_and_die {
    my ( @message ) = @_;

    printf STDERR @message if @message;

    my $usage = <<_END_OF_HELP_;
Syntax:
    $PROGRAM_NAME -l 50 -f output -v -w 3

Options:
    --filename (-f)     - Prefix of filename to be saved
    --limit    (-l)     - How many megabytes is the limit for output
    --width    (-w)     - How many characters should part number be padded to
    --verbose  (-v)     - Show verbose information, including progress
    --help     (-?)     - Show this help page

Example:
    cat some_file | $PROGRAM_NAME -l 50 -f output -w 3

Will compress data from some_file, outputting gzipped content to files named:
 - output.001.gz
 - output.002.gz
 - output.003.gz
and so on, trying to keep every file below 50 megabytes.

It can fail at keeping the file under limit in some cases, but generally
even if it will not be able to keep it under 50MB, eventual overhead
should be minimal (up to 1 record (line) from source data).

_END_OF_HELP_
    print STDERR $usage;

    exit( 1 );
}
Note: See TracBrowser for help on using the browser.