#!/usr/bin/perl -w # segment v1.0: copyright (c) 2002,2003 Scalable Informatics LLC # email: landman@scalableinformatics.com # web: http://scalableinformatics.com/segment.html # # This program is dual licensed, similar in nature to the MySQL license. # License specific information will be on the website indicated above. # # Summary: # This software is licensed for GPL or Artistic Licensed # projects under the Artistic License. For all other cases # (commercial, commercial support, bundling, redistribution, # to name several) you will need to obtain a commercial license. # # Information on the GPL may be found at # http://www.gnu.org/licenses/gpl.txt # # Information on the Artistic license may be found at # http://www.perl.com/language/misc/Artistic.html # # Commercial licensing information may be obtained by contacting # Scalable Informatics LLC. # This is an auxillary program designed to be used # with the Scalable Informatics Computing Engine (SICE) platform. When # bundled with the full SICE platform, the commercial license # will be used. # All code is copyright 2003 Scalable Informatics, subject to # the aforementioned licenses. # # segment an input stream of FASTA formatted data into a sequence of # files with a consistent naming template. #V0.8 2-September-2002 basic idea down #V1.0 7-October-2002 alpha #V1.0 7-October-2002 beta #V1.0 5-May-2003 "release" bug fixes, feature upgrades, major internals rewrite: # added --size, --nowrite, --debug, buffer engine rework, # spill code, fill code, use of unbuffered I/O for # better performance on large files # basic startup, modules, etc use strict; no strict "subs"; use Getopt::Long; use IO::Dir; use IO::File; use Carp; use Data::Dumper; # constants use constant true => (1==1); use constant false => (1==0); use constant kB => 1024; use constant MB => 1024*kB; use constant GB => 1024*MB; # variables my ($input_cl,$nseq_cl,$output_cl,$path_cl,$index); my ($buffer,$count,$done,$actual,@seq_raw,$last); my ($output_file_name,$total_bytes_read,$total_bytes_written); my ($nseq_in,$nseq_out,$tmp); # option names my $verbose; # use --verbose to note about what we # are doing to the user my $input; # use --input=/path/to/input/file, will default to the # STDIN my $nseq; # use --nseq=number of sequences per output file my $output; # use --output="printf format string" # such as "name%i". Defaults to 'segment.%i.fa' my $output_buffer; my $output_number; my $number_in_output_buffer; my $debug; # use --debug to speak volumes about what we # are doing to the user my $path; # use --path=/path/where/to/place/output/files my $nowrite=0; # don't write output file, just go through all the motions... # useful for debugging. my $buffer_size;# size in MB of buffer my $size; # to be used instead of splitting by my $infile = IO::File->new; # prepare a file object for input my $outfile = IO::File->new; # prepare a file object for output # set defaults $verbose = false; $debug = false; $done = false; $path = ''; $nseq = 0; $size = 0; $input = undef; # by default, input from STDIN, check for undef # and fix up if needed $output = 'segment.%i.fa'; $buffer_size = 4; # 4MB buffer $total_bytes_read = $total_bytes_written = 0; $nseq_in = $nseq_out = 0; # parse command line options # # note: due to parser being somewhat dumb in GetOptions, # there is no way I can specify a "number" (integer OR float) # so I use a string, and numerify (new word!) it. GetOptions ( 'verbose' => \$verbose, 'debug' => \$debug, 'path=s' => \$path, 'nseq=s' => \$nseq, 'size=s' => \$size, 'output=s' => \$output, 'input=s' => \$input, 'nowrite' => \$nowrite, 'buffer=s' => \$buffer_size, ); # VERY IMPORTANT: the output files are built in memory. This will be changed # in subsequent releases, as this can and will cause swapping in large input file # cases (genomic database size). The logic to do this is not trivial, and # will be worked in in short order. A new release will be created for this # when ready. If you want to use this to split huge databases, you may wish to run # it on a large RAM machine, with a Perl compiled for 64 bit features. # if debug set, then set verbose as well... $verbose = true if ($debug); # sanity check: cannot have both nseq and size set. # size means dump output buffer after hitting a specific size in MB if (($size != 0) && ($nseq != 0)) { die "FATAL ERROR: must use only one of --nseq or --size\n"; } if ($size != 0) { $size *= MB; } # size is in units of MB if ($size > (2047*MB)) { warn "WARNING: some Perl implementations are not compiled to handle files larger than 2 GB. This may not work\n"; } printf "Using chunks of size=%s MB\n",($size/MB) if (($size != 0) && ($verbose)); # set input file source. No --input=filename means get input from # the STDIN filehandle if (defined($input)) { if ($infile->open('< '.$input)) { printf "input file = %s opened successfully\n",$input if ($verbose); } else { die "FATAL ERROR: Unable to open file = ".$input." for reading\n"; } } else { $infile=\*STDIN; # no -input=... so take input from STDIN printf "input from STDIN\n",$input if ($verbose); } # reformat the output so that it looks like %-i if (!defined($output)) { die "FATAL ERROR: must use --output=filename\n"; } if ($output =~ /(\%)(i)/) { printf "changing output from %s to ",$output if ($debug); $output =~ s/(\%)(i)/$1-$2/ ; printf " %s \n",$output if ($debug); } # if no index pattern is found, then generate one if ($output !~ /\%-i/) { $output .= '.%-i'; } # set up buffer: Note: I assume a maximum of 256 MB buffer size. There # probably should be no limit, in case you have enough ram, and # a reasonable OS. However, if you ask for 1024 MB and you only have # 256 MB ram, well, you will do a bit of swapping, and that will kill the # performance of the run. You really do not want to do this. # We should get physical and logical memory information, but each machine # and OS handle that differently, to the point that BSD::Resource module # results need to be taken with a grain of salt... if (($buffer_size < 0.1) || ($buffer_size > 256)) {$buffer_size = 4;} $buffer = " " x ($buffer_size*MB) ; # allocate buffer printf "buffer of length %i bytes allocated\n",length($buffer) if ($debug); warn "WARNING: Large buffers can cause massive swapping, which defeats the purpose of buffering in the first place...\n" if ($size > (64*MB)); # loop over input, and write to output buffer. Spill buffer to disk if full. $output_number=1; $output_buffer=""; $number_in_output_buffer=0; $last=""; $count=0; do { $tmp=eval { (substr($last,0,20)|| " "x20 )}; printf "\nLoop=%i,\n\toutput number=%i\n\tlast(20)=%s\n",$count,$output_number,$tmp if ($debug); printf "\tTotal(read)=%i\n\tTotal(write)=%i\n",$total_bytes_read,$total_bytes_written if ($verbose); printf "\tNseq(in)=%i\n\tNseq(out)=%i\n",$nseq_in,$nseq_out if ($verbose); printf "\tfirst 3 chars of buffer = |%s|\n",substr($buffer,0,3) if ($debug); $actual = $infile->sysread($buffer,($buffer_size*MB)); printf "\tactual bytes read = %i\n",$actual if ($debug); $buffer = $last .$buffer if (substr($buffer,0,3) ne ">gi"); $total_bytes_read += $actual; $done = true if ($actual < ($buffer_size*MB)); undef @seq_raw; # split fasta files by noting the ">" starts each new sequence @seq_raw=split(">",$buffer); shift(@seq_raw) if ($count == 0); # otherwise the split # causes seq_raw[0] =" blank ", which triggers another bug.... $nseq_in+=$#seq_raw; $tmp=eval { (substr($last,0,20)|| " "x20 )}; $last = pop(@seq_raw) if (!$done); $last ="" if ($done); printf "\tN(seq)[in]=%i\n",$#seq_raw+1 if ($verbose); printf "\tdone=%i\n",$done if ($debug); # fill (and spill if needed) the output buffer to drain # the input buffer &fill_output_buffer; $count++; } until ($done); # flush the last of the input out &fill_last_output_buffer; print "Done\n" if ($verbose); exit; sub fill_output_buffer { my $seq_single; print "In fill_output_buffer:\n" if ($verbose); printf "\tseq_raw=%s\n\tseq_raw[0](20)=%s\n\tlast=%s\n",$#seq_raw,substr($seq_raw[0],0,20),substr($last,0,20) if ($verbose); while (($#seq_raw >= 0) && ($seq_raw[0] ne $last)) { print "\t-- entering read loop\n" if ($debug); if ($size !=0) { print "\t-- using size mode\n" if ($debug); } else { print "\t-- using N(sequence) mode\n" if ($debug); } # for N(seq) based splitting... if ($nseq != 0) { while ( ( ($number_in_output_buffer < $nseq) && ($#seq_raw >= 0) && ($seq_single = ">".shift(@seq_raw)) && (length($seq_single) > 1) ) ) { $nseq_out++; $number_in_output_buffer++; $output_buffer.=$seq_single; # printf "\tprocessed %i sequences\n",$nseq_out if ((!($nseq_out % 1000)) && ($debug)); } } # for size based splitting ... if ($size != 0 ) { while ( (length($output_buffer) <= $size) && ($#seq_raw >= 0) && ($seq_single = ">".shift(@seq_raw)) && (length($seq_single) > 1) ) { $nseq_out++; $number_in_output_buffer++; $output_buffer.=$seq_single; } } print "\t-- entering spill section\n" if ($debug); if ( (($number_in_output_buffer == $nseq) && ($nseq != 0)) || ((length($output_buffer) >= $size) && ($size != 0)) ) { &spill_buffer_to_file; $total_bytes_written+=length($output_buffer); # reset counters $output_buffer=""; $number_in_output_buffer=0; $output_number++; } } } sub fill_last_output_buffer { print "In fill_last_output_buffer:\n" if ($verbose); my $seq_single; if ($last ne "") { $seq_single = ">".$last; $output_buffer.=$seq_single; } &spill_buffer_to_file; } sub spill_buffer_to_file { # build output file my $nwritten=0; print "In spill_buffer_to_file:\n" if ($verbose); my $output_file_name = sprintf($output,$output_number); if (!($nowrite)) { if ($outfile->open('> '.$output_file_name)) { printf "output file = %s opened successfully\n",$output_file_name if ($debug); } else { die "FATAL ERROR: Unable to open file = ". $output_file_name ." for writing\n"; } } $nwritten=$outfile->syswrite($output_buffer) if (!($nowrite)); # write only if write protect tab is off ... $nwritten=length($output_buffer) if ($nowrite); # report buffer as writing completely printf "\twrote %i byte(s)\n", $nwritten if (($nwritten>=0) && ($verbose)); printf "ERROR: Unable to write\n" if (!defined($nwritten)); if (!($nowrite)) { $outfile->close or die "FATAL ERROR: Unable to close file\n"; } }