#!/usr/bin/perl -w
#
# db_dlaf v1.0: copyright (c) 2003 Scalable Informatics LLC
# email: landman@scalableinformatics.com
# web: http://scalableinformatics.com/db_dlaf.html
#
# Distributed under the GNU Public License
# See http://www.fsf.org/licenses/gpl.html for a complete
# specification of this license.
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Joseph Landman 1-May-2003
#
# v1.1 add the --binary option
#
# TODO: Documentation
#
# Overview:
#   1. parse the command line
#   2. optionally list the files available at --url (--l / --list)
#   3. download every database named in --db into --path using curl
#   4. uncompress the downloads whose extension is a key of %zipped
#   5. run formatdb (with the --formatdb options) on each resulting file
#
# NOTE: --tmp is parsed for compatibility but is not used anywhere below.

use strict;
use warnings;
use POSIX qw(strftime);
use Getopt::Long;
use Time::HiRes qw(gettimeofday tv_interval);
use Data::Dumper;
use File::Path qw(mkpath);

use constant true  => (1==1);
use constant false => (1==0);
use constant kB    => 1024;
use constant MB    => kB * 1024;
use constant GB    => MB * 1024;

my ($db_path, @db_list,$db_l,$list,$l);
my ($wget,$curl,$url,$date,$rc,$tmp,%stats);
my ($formatdb,$help,$verbose,$gzip,$full);
my (@compressed_extensions,%normal_db);
my ($unzip,$bzip2,%zipped,$fdb_bin);

### use wget and curl
$wget='/usr/bin/wget';
$curl='/usr/bin/curl';
$gzip='/usr/bin/gzip -d ';
$unzip='/usr/bin/unzip';
$bzip2='/usr/bin/bzip2 -d ';

### formatdb binary; found through PATH unless --binary/--fdb overrides it
### (an empty default produced the unusable command line " -i file ...")
$fdb_bin='formatdb';

### default location to grab databases from
$url='ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/';

### indicate which set of extensions indicate compressed files
%zipped=(
         'Z'     => $gzip,
         'gz'    => $gzip,
         'zip'   => $unzip,
         'bz2'   => $bzip2,
         'bzip2' => $bzip2
        );

### current date
$date=strftime '%Y%m%d',localtime;

$rc = GetOptions (
                  "path=s"     => \$db_path,
                  "db=s"       => \$db_l,
                  "url=s"      => \$url,
                  "tmp=s"      => \$tmp,
                  "list"       => \$list,
                  "l"          => \$l,
                  "formatdb=s" => \$formatdb,
                  "binary=s"   => \$fdb_bin,
                  "help"       => \$help,
                  "verbose"    => \$verbose,
                  "fdb=s"      => \$fdb_bin
                 );

# GetOptions has already printed a diagnostic for the offending option
if (!$rc) { help(); exit 1; }
if ($help) { help(); exit; }

if (defined($list) || defined($l)) {
    list_remote($url);
    # a pure listing request is finished here; only carry on to the
    # download stage when databases were requested as well
    exit unless defined($db_l);
}

# return an error if no databases have been listed; this is checked before
# any directory is created so a bad invocation leaves nothing behind
if (defined($db_l)) {
    # use any of : , + to seperate the db names; drop empty names ("a::b")
    @db_list = grep { length } split(/[:,+]/,$db_l);
}
if (!@db_list) {
    die "FATAL ERROR: you must use the --db=database\n";
}
printf "Using the following database(s)\n%s\n",join(", ",@db_list) if ($verbose);

#
# if the path (where we are going to place the databases) doesnt exist,
# use a default of current directory with a date branch below
if (!defined($db_path)) {
    $db_path='./'.$date;
}
printf "destination path = %s\n",$db_path if ($verbose);
printf "url = %s\n",$url if ($verbose);

# make the directory if it does not exist (no shell involved, so paths
# containing spaces or shell metacharacters are safe)
if (!(-e $db_path)) {
    eval { mkpath($db_path); 1 }
        or die "FATAL ERROR: Unable to create directory ".$db_path.": ".$@;
}
chdir $db_path
    or die "FATAL ERROR: Unable to change to directory ".$db_path."\n";

# the urls need to be protocol://host/path and not protocol://host/path/
# (the trailing "/" is removed because it is re-added by the join below)
$url =~ s{([^/])/+\z}{$1};

# loop over db's, build full urls, and get the files
foreach my $db (@db_list) {
    $full=join("/",$url,$db);                 # build the full path
    get_file($full,$db);                      # get the file
    $normal_db{$db}=true if ($stats{$db});
}

# loop over the db's that we have properly recieved, and uncompress if needed
foreach my $db (@db_list) {
    uncompress_file($db) if ($stats{$db});
}

# exit if the formatdb string is blank
if (!defined($formatdb)) {
    die "WARNING: No --formatdb=\"formatdb_option\" specified, exiting\n";
}

# loop over the db's that we have and format them
foreach my $db (sort keys %normal_db) {
    printf "formatdb against db= %s\n",$db;
    run_formatdb($db);
}
#print Dumper(\%normal_db);
exit;

# run_quiet(@command)
#   Runs @command without a shell (list-form pipe open) and collects its
#   standard output.  Callers must pass the program plus at least one
#   argument so Perl never falls back to shell parsing.
#   Returns ($status, $output); $status is the raw wait status ($?) and is
#   0 on success, or -1 when the program could not be started.
sub run_quiet {
    my @cmd = @_;

    my $pid = open(my $pipe, '-|', @cmd);
    if (!$pid) {
        warn "unable to run ".$cmd[0].": $!\n";
        return (-1, '');
    }
    my $out = do { local $/; <$pipe> };
    $out = '' unless defined($out);
    close $pipe;                  # for a pipe, close() sets $?
    return ($?, $out);
}

# list_remote($base_url)
#   Prints the files found at an ftp:// or http(s):// location.  The global
#   $list selects the long form (name and size in MB), the global $l the
#   short form (name only); both may be active at once.
sub list_remote {
    my $base = shift;

    if ($base =~ /^ftp:/i) {
        my ($status, $out) = run_quiet($curl, '-s', '-X', 'LIST', $base);
        warn "listing of ".$base." failed\n" if ($status != 0);
        foreach my $line (split(/\n/, $out)) {
            $line =~ s/\r\z//;
            # unix style listing: perms links owner group size mon day time name
            # (limit of 9 keeps file names that contain blanks intact)
            my @x = split(/\s+/, $line, 9);
            next unless (@x == 9 && $x[4] =~ /^\d+\z/);
            printf "filename=%s,\tsize=%.3f MB\n",$x[8],$x[4]/MB if (defined($list));
            printf "%s\n",$x[8] if (defined($l));
        }
    }

    if ($base =~ /^https?:/i) {
        # multipliers that turn an index-page size such as 512, 12K, 3.4M
        # or 1.1G into MB
        my %to_mb = ( '' => 1/MB, 'K' => kB/MB, 'M' => 1, 'G' => GB/MB );
        my ($status, $out) = run_quiet($curl, '-s', '-l', $base);
        warn "listing of ".$base." failed\n" if ($status != 0);
        foreach my $line (split(/\n/, $out)) {
            # autoindex style row: <a href="name">name</a> date hh:mm size
            next unless ($line =~ /href="([^"]+)".*?\d+:\d+\s+(\d+(?:\.\d+)?)([kKmMgG]?)/);
            my ($name, $size_mb) = ($1, $2 * $to_mb{uc($3)});
            printf "filename=%s,\tsize=%.3f MB\n",$name,$size_mb if (defined($list));
            printf "%s\n",$name if (defined($l));
        }
    }
    return;
}

# uncompress_file($file)
#   If $file ends in one of the extensions in %zipped, runs the matching
#   decompressor on it and replaces $file in %normal_db by the name without
#   the extension.  A file whose decompression fails is removed from
#   %normal_db so formatdb is not run against it.
sub uncompress_file {
    my $file=shift;
    my $matched=false;

    my @fstat=stat($file);
    my $reported = ($verbose && @fstat);
    printf "file=%s size=%.2f MB, ",$file,$fstat[7]/MB if ($reported);

    foreach my $ext (sort keys %zipped) {
        # the extension must be the real suffix: an unanchored match let
        # "foo.zip" hit the "Z" entry and "nr.gz.md5" hit "gz"
        next unless ($file =~ /^(.+)\.\Q$ext\E\z/i);
        my $phile=$1;
        $matched=true;

        # uncompress file by selecting appropriate program
        my ($status) = run_quiet(split(' ',$zipped{$ext}), $file);
        delete $normal_db{$file};

        @fstat=stat($phile);
        if ($status != 0 || !@fstat) {
            print "\n" if ($reported);
            warn "uncompress of ".$file." failed, skipping it\n";
            last;
        }
        $normal_db{$phile}=true;
        printf "uncompressed file=%s, size=%.2f MB\n",$phile,$fstat[7]/MB if ($verbose);
        last;                     # one file, one decompressor
    }

    # terminate the "file=... size=..." line for plain (uncompressed) files
    print "not compressed\n" if ($reported && !$matched);
    return;
}

# run_formatdb($file)
#   Runs "$fdb_bin -i $file $formatdb".  The --formatdb value is a free-form
#   option string, so this command deliberately still goes through the
#   shell; only the file name is quoted.
sub run_formatdb {
    my $file=shift;

    (my $quoted = $file) =~ s/'/'\\''/g;
    my $run="$fdb_bin -i '$quoted' $formatdb";
    printf "preparing to run formatdb\ncommand line = %s\n",$run;
    my $out=`$run`;               # output is captured and discarded
    warn "formatdb failed for ".$file."\n" if ($? != 0);
    return;
}

# get_file($full_url, $file)
#   Downloads $full_url into $file (relative to the current directory) with
#   curl and records success or failure in $stats{$file}.
sub get_file {
    my $full_url=shift;
    my $file=shift;

    printf "starting transfer of %s\n",$full_url if ($verbose);
    my $t0 = [gettimeofday];
    # -f: treat server errors as failures rather than saving the error page
    my ($status) = run_quiet($curl, '-s', '-f', $full_url, '-o', $file);
    my $elapsed = tv_interval ( $t0, [gettimeofday]);

    # -s is false for both a missing and an empty file; the exit status
    # guards against a stale file left over from an earlier run
    my $size = -s $file;
    if ($status != 0 || !$size) {
        printf "file %s was not transfered or is zero size\n",$file;
        $stats{$file}=false;
        return;
    }

    $stats{$file}=true;
    my $rate = ($elapsed > 0) ? $size/(MB*$elapsed) : 0;
    printf "transfer for file=%s is %.1f seconds, rate=%.2f MB/s, size=%.2f MB\n",
        $file,$elapsed,$rate,$size/MB if ($verbose);
    return;
}

# help()
#   Prints the usage summary to standard output.
sub help {
    print "db_dlaf.pl: copyright 2003 Scalable Informatics LLC\n";
    print " web: http:\/\/scalableinformatics.com\n";
    print " email: landman\@scalableinformatics.com\n";
    print "\n";
    print "Usage:\n";
    print "\tdb_dlaf.pl [--l | --list] [--path=/path] [--db=db1:db2:...] \\ \n";
    print "\t\t[--url={http|ftp}://host/path] [--tmp=/path] \\ \n";
    print "\t\t[--formatdb=\"options\"] [--help]\n";
    print "\n";
    print "\t\t--l\t\tlist of files\n";
    print "\t\t--list\t\tlonger list\n";
    print "\t\t--path\t\twhere to put the database indices\n";
    print "\t\t--db\t\tlist of databases to grab, use : to seperate\n";
    print "\t\t--url\t\thttp or ftp path to databases\n";
    print "\t\t--tmp\t\ttemporary disk space\n";
    print "\t\t--formatdb\tformatdb options to use on each db\n";
    print "\t\t--binary\tformatdb binary path\n";
    print "\t\t--verbose\treport progress\n";
    print "\n\n";
    return;
}