5cd7aa7b3364cb744c638c16fd0ca95015dd6079 braney Thu Apr 25 09:57:53 2024 -0700 small change to allow script to recognize knownGeneV as a mammal. diff --git src/hg/utils/automation/doHgNearBlastp.pl src/hg/utils/automation/doHgNearBlastp.pl index c87c626..21f5f74 100755 --- src/hg/utils/automation/doHgNearBlastp.pl +++ src/hg/utils/automation/doHgNearBlastp.pl @@ -1,568 +1,568 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doHgNearBlastp.pl instead. # $Id: doHgNearBlastp.pl,v 1.11 2009/10/02 05:14:15 kent Exp $ use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; # Hardcoded params: my $tSplitCount = 1000; my $selfE = 0.01; my $selfMaxPer = 1000; my $pairwiseMaxPer = 1; # Option variable names: use vars qw/ $opt_clusterHub $opt_distrHost $opt_dbHost $opt_workhorse $opt_blastPath $opt_noSelf $opt_targetOnly $opt_queryOnly $opt_noLoad $opt_debug $opt_verbose $opt_help /; # Option defaults: # my $clusterHub = 'swarm'; my $clusterHub = 'ku'; # my $distrHost = 'swarm'; my $distrHost = 'ku'; my $dbHost = 'hgwdev'; my $workhorse = 'least loaded'; my $blastPath = '/cluster/bin/blast/x86_64/blast-2.2.20'; my $defaultVerbose = 1; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; my $base = $0; $base =~ s/^(.*\/)?//; print STDERR " usage: $base config.ra options: -clusterHub mach Parasol hub for blastp cluster runs. Default: $clusterHub -distrHost mach Host which distributes files to the cluster-attached fast storage for the blastp cluster runs. Default: $distrHost -dbHost mach Mysql server into which we load *BlastTab tables. Default: $dbHost -workhorse mach Use machine (default: $workhorse) for compute or memory-intensive steps. -blastPath dir Directory in which the latest blat has been installed (should also be compiled for the same architecture as cluster hosts). Default: $blastPath -noSelf Don't do self alignments (update pairwise only). -targetOnly Perform target vs. all queries, not vice versa. -queryOnly Perform all queries vs. target, not vice versa. -noLoad Perform alignments but don't load database tables. -debug Don't actually run commands, just display them. -verbose num Set verbose level to num. Default $defaultVerbose -help Show detailed help (config.ra variables) and exit. Automates the protein blast runs for hgNear and loads the *BlastTab tables. config.ra specifies one target db and multiple query dbs. Self-blastp for the target and pairwise blastp for the target vs. all queries, and all queries vs. the target, are performed by default but can be selectively disabled with -noSelf, -targetOnly or -queryOnly. To see detailed information about what should appear in config.ra, run \"$base -help\". "; # Detailed help (-help): print STDERR " config.ra must define the following settings: targetGenesetPrefix xxx - xxx is the prefix used to name the self *BlastTab table. targetDb xxx - xxx is the target database (the one that we just got new genes for) queryDbs xxx yyy zzz ... - space-separated list of query databases (the other hgNear dbs) xxxFa fff yyyFa ggg ... - for each db in queryDbs, define a *Fa variable with the complete path of a single fasta file containing protein sequence for genes. buildDir xxx - Working directory (usually /cluster/data/targetDb/bed/hgNearBlastp) in which scriptlets and results will be stored. scratchDir xxx - Cluster-attached fast storage (usually under /san/sanvol1/scratch/...) where fasta will be split/formatted and used during cluster blastp runs. Optional settings: recipBest xxx yyy zzz ... - space-separated list of query databases for which we are to find reciprocal-best blast hits. Even if -targetOnly or -queryOnly is specified, we will still do alignments in both directions if recipBest is specified. targetAbbrev yz - yz is a two-letter abbreviation for the target organism, e.g. mm for mouse. It is used to name the yzBlastTab tables in each queryDb. Normally the abbreviation is distilled from targetDb, but if targetDb is a temporary database with a name that bears no resemblance to the real genome database name, then this overrides. " if ($detailed); print STDERR "\n"; exit $status; } # usage # Globals: my $CONFIG; sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions("clusterHub=s", "distrHost=s", "dbHost=s", "workhorse=s", "blastPath=s", "verbose=n", "noSelf", "targetOnly", "queryOnly", "noLoad", "debug", "help"); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); $opt_verbose = $defaultVerbose if (! defined $opt_verbose); $clusterHub = $opt_clusterHub if (defined $opt_clusterHub); $distrHost = $opt_distrHost if (defined $opt_distrHost); $dbHost = $opt_dbHost if (defined $opt_dbHost); $blastPath = $opt_blastPath if (defined $opt_blastPath); if ($opt_targetOnly && $opt_queryOnly) { warn "\nBoth -targetOnly and -queryOnly were specified -- if you want " . "both target and query, then omit both of those options.\n"; usage(1); } } # checkOptions my ($buildDir, $scratchDir, $tGenesetPrefix, $tDb, @qDbs, %dbToFasta, %recipBest, $targetAbbrev); sub parseConfig { # Parse config.ra file, make sure it contains the required variables. my ($config) = @_; my $fh = &HgAutomate::mustOpen($config); my %vars; while (<$fh>) { next if (/^\s*#/ || /^\s*$/); if (/^\s*(\w+)\s*(.*)$/) { $vars{$1} = $2; $vars{$1} = " " if (! $2 && $1 eq 'queryDbs'); } else { die "Parse error line $. of $config:\n$_\n--"; } } close($fh); # Required variables. $buildDir = $vars{'buildDir'} || die "Error: $config is missing required variable buildDir.\n"; delete $vars{'buildDir'}; $scratchDir = $vars{'scratchDir'} || die "Error: $config is missing required variable scratchDir.\n"; delete $vars{'scratchDir'}; $tDb = $vars{'targetDb'} || die "Error: $config is missing required variable targetDb.\n"; delete $vars{'targetDb'}; $tGenesetPrefix = $vars{'targetGenesetPrefix'} || die "Error: $config is missing required variable targetGenesetPrefix.\n"; delete $vars{'targetGenesetPrefix'}; my $queryDbs = $vars{'queryDbs'} || die "Error: $config is missing required variable queryDbs.\n"; delete $vars{'queryDbs'}; @qDbs = split(/\s+/, $queryDbs); # Variables we expect to be there given the contents of $queryDbs: foreach my $db ($tDb, @qDbs) { my $faVar = $db . "Fa"; my $fa = $vars{$faVar} || die "Error: $config is missing required variable $faVar.\n"; $dbToFasta{$db} = $fa; delete $vars{$faVar}; } # Optional variables. my $recipBests = $vars{'recipBest'}; if ($recipBests) { foreach my $r (split(/\s+/, $recipBests)) { die "Error: $config recipBest item \"$r\" is not in queryDbs.\n" if (scalar(grep(/^$r$/, @qDbs)) == 0); $recipBest{$r} = 1; } delete $vars{'recipBest'}; } $targetAbbrev = $vars{'targetAbbrev'}; if ($targetAbbrev) { die "Error: $config targetAbbrev must consist of two lower-case letters " . "(Genus species --> gs)" if ($targetAbbrev !~ /^[a-z]{2}$/); delete $vars{'targetAbbrev'}; } # Bogus/misspelled variables. if (scalar(keys %vars) > 0) { die "Error: $config contains extra variable(s) that I don't know " . "what to do with: " . join(", ", sort keys %vars) . "\n"; } die "Sorry, this script can't rsync to $scratchDir -- please use san or " . "bluearc for scratchDir in $config.\n" if ($scratchDir =~ /^\/i?scratch\//); } # parseConfig sub splitSequence { # Split the target gene fasta file into small pieces for cluster run. my ($tDb, $tFasta) = @_; my $runDir = $buildDir; my $bossScript = "$runDir/doSplit.csh"; my $fh = &HgAutomate::mustOpen(">$bossScript"); print $fh <<_EOF_ #!/bin/csh -efx # This script was automatically generated by $0 # from $CONFIG. # It is to be executed on $distrHost. # It runs faSplit on a monolithic protein fasta file. # This script will fail if any of its commands fail. mkdir $scratchDir/$tDb.split faSplit sequence $tFasta $tSplitCount $scratchDir/$tDb.split/${tDb}_ _EOF_ ; close($fh); &HgAutomate::run("chmod a+x $bossScript"); &HgAutomate::nfsNoodge("$bossScript"); &HgAutomate::run("$HgAutomate::runSSH $distrHost nice $bossScript"); } sub formatSequence { # Run formatdb on the specified query ($scratchDir should be cluster-mounted) my ($qDb, $qFasta) = @_; my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); my $bossScript = "$runDir/doFormat_$qDb.csh"; my $fh = &HgAutomate::mustOpen(">$bossScript"); print $fh <<_EOF_ #!/bin/csh -efx # This script was automatically generated by $0 # from $CONFIG. # It is to be executed on $distrHost. # It runs formatdb to preprocess query protein fasta. # This script will fail if any of its commands fail. mkdir $scratchDir/$qDb.formatdb cd $scratchDir/$qDb.formatdb $blastPath/bin/formatdb -i $qFasta -t $qDb -n $qDb _EOF_ ; close($fh); &HgAutomate::run("chmod a+x $bossScript"); &HgAutomate::nfsNoodge("$bossScript"); &HgAutomate::run("$HgAutomate::runSSH $distrHost nice $bossScript"); } # formatSequence # Make a matrix of clade-pair -e (max E-value) threshold: my ($mammal, $fish, $fly, $worm, $yeast) = (0..5); my %dbToClade = ( hg => $mammal, mm => $mammal, rn => $mammal, tmpFoo => $mammal, danRer => $fish, dm => $fly, ce => $worm, sacCer => $yeast, knownGeneV => $mammal, knownGeneVM => $mammal, ); my @cladeEs; # Different species within clade (not self alignments -- see $selfE): $cladeEs[$mammal][$mammal] = $cladeEs[$fish][$fish] = $cladeEs[$fly][$fly] = $cladeEs[$worm][$worm] = $cladeEs[$yeast][$yeast] = 0.001; # Cross-clade: $cladeEs[$mammal][$fish] = $cladeEs[$fish][$mammal] = 0.005; $cladeEs[$mammal][$fly] = $cladeEs[$fly][$mammal] = 0.01; $cladeEs[$mammal][$worm] = $cladeEs[$worm][$mammal] = 0.01; $cladeEs[$mammal][$yeast] = $cladeEs[$yeast][$mammal] = 0.01; $cladeEs[$fish][$fly] = $cladeEs[$fly][$fish] = 0.01; $cladeEs[$fish][$worm] = $cladeEs[$worm][$fish] = 0.01; $cladeEs[$fish][$yeast] = $cladeEs[$yeast][$fish] = 0.01; $cladeEs[$fly][$worm] = $cladeEs[$worm][$fly] = 0.01; $cladeEs[$fly][$yeast] = $cladeEs[$yeast][$fly] = 0.01; $cladeEs[$worm][$yeast] = $cladeEs[$yeast][$worm] = 0.01; sub calcE { # Look up the blastp e parameter (max E-value) by clade distances my ($tDb, $qDb) = @_; - (my $t = $tDb) =~ s/\d+$//; + (my $t = $tDb) =~ s/\d+.*$//; defined ($t = $dbToClade{$t}) || die "Can't figure out clade of '$tDb'"; - (my $q = $qDb) =~ s/\d+$//; + (my $q = $qDb) =~ s/\d+.*$//; defined ($q = $dbToClade{$q}) || die "Can't figure out clade of '$qDb'"; my $e = $cladeEs[$t][$q] || die "No e for clades '$t', '$q'"; return $e; } # calcE sub runPairwiseBlastp { # Do a pairwise blastp cluster run for the given query. # $b is the blastp -b param value... note -b is at the end of $blastpParams. my ($tDb, $qDb, $e, $b) = @_; my $runDir = "$buildDir/run.$tDb.$qDb"; &HgAutomate::mustMkdir($runDir); my $fh = &HgAutomate::mustOpen(">$runDir/blastSome"); print $fh <<_EOF_ #!/bin/csh -ef setenv BLASTMAT $blastPath/data $blastPath/bin/blastall -p blastp -d $scratchDir/$qDb.formatdb/$qDb \\ -i \$1 -o \$2 -e $e -m 8 -b $b _EOF_ ; close($fh); &HgAutomate::makeGsub($runDir, "blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}"); my $bossScript = "$runDir/doBlastp.csh"; $fh = &HgAutomate::mustOpen(">$bossScript"); print $fh <<_EOF_ #!/bin/csh -efx # This script was automatically generated by $0 # from $CONFIG. # It is to be executed on $clusterHub in $runDir . # It does the blastp cluster run. # This script will fail if any of its commands fail. cd $runDir mkdir out chmod a+x blastSome ls -1S $scratchDir/$tDb.split/*.fa > split.lst gensub2 split.lst single gsub spec para make spec para time _EOF_ ; close($fh); &HgAutomate::run("chmod a+x $bossScript"); &HgAutomate::nfsNoodge("$bossScript"); &HgAutomate::run("$HgAutomate::runSSH $clusterHub $bossScript"); } # runPairwiseBlastp sub dbToPrefix { # Condense database name to a 2-character prefix for the *BlastTab table. my ($db) = @_; my $prefix; return $targetAbbrev if ($targetAbbrev && $db eq $tDb); if ($db =~ /^(\w\w)\d+$/) { $prefix = $1; } elsif ($db =~ /^(\w)\w\w(\w)\w\w\d+$/) { $prefix = $1 . lc($2); } else { # just use the whole db name $prefix = $db; } return $prefix; } sub loadPairwise { my ($tDb, $qDb, $tablePrefix, $max, $filePattern) = @_; $filePattern = 'out/*.tab' if (! defined $filePattern); my $tableName = $tablePrefix . 'BlastTab'; my $bestOnly = ($max > 1) ? '-bestOnly' : ''; my $runDir = "$buildDir/run.$tDb.$qDb"; my $bossScript = "$buildDir/run.$tDb.$qDb/loadPairwise.csh"; my $fh = HgAutomate::mustOpen(">$bossScript"); print $fh <<_EOF_ #!/bin/csh -efx # This script was automatically generated by $0 # from $CONFIG. # It is to be executed on $dbHost in $runDir . # It does the blastp cluster run. # This script will fail if any of its commands fail. cd $runDir hgLoadBlastTab $tDb $tableName -maxPer=$max $bestOnly $filePattern _EOF_ ; close($fh); &HgAutomate::run("chmod a+x $bossScript"); &HgAutomate::nfsNoodge("$bossScript"); return if ($opt_noLoad); &HgAutomate::run("$HgAutomate::runSSH $dbHost nice $bossScript"); } # loadPairwise sub recipBest { my ($tDb, $qDb) = @_; my $runDir = "$buildDir/run.$tDb.$qDb"; my $workhorse = &HgAutomate::chooseWorkhorse(); my $whatItDoes = "It runs blastRecipBest on the $tDb-$qDb and $qDb-$tDb " . "results."; my $bossScript = new HgRemoteScript("$runDir/doRecipBest.csh", $workhorse, $runDir, $whatItDoes, $CONFIG); $bossScript->add(<<_EOF_ cat $buildDir/run.$tDb.$qDb/out/*.tab \\ > $tDb.$qDb.all.tab cat $buildDir/run.$qDb.$tDb/out/*.tab \\ > $qDb.$tDb.all.tab blastRecipBest $tDb.$qDb.all.tab $qDb.$tDb.all.tab \\ $tDb.$qDb.recipBest.tab $qDb.$tDb.recipBest.tab mv $qDb.$tDb.recipBest.tab $buildDir/run.$qDb.$tDb/ _EOF_ ); $bossScript->execute(); } # recipBest sub cleanup { # Remove what we added in $scratchDir. foreach my $db (@_) { &HgAutomate::run("$HgAutomate::runSSH $distrHost rm -rf $scratchDir/$db.split"); &HgAutomate::run("$HgAutomate::runSSH $distrHost rm -rf $scratchDir/$db.formatdb"); } &HgAutomate::run("$HgAutomate::runSSH $distrHost rmdir $scratchDir"); } # cleanup sub celebrate { # Hooray, we're done. HgAutomate::verbose(1, "\n *** All done!\n"); if ($opt_noLoad) { if (! $opt_noSelf) { HgAutomate::verbose(1, " *** -noLoad was specified -- you can run this script " . "manually to load $tDb tables:\n"); HgAutomate::verbose(1, " run.$tDb.$tDb/loadPairwise.csh\n\n"); } if (! $opt_queryOnly) { HgAutomate::verbose(1, " *** -noLoad was specified -- you can run these scripts " . "manually to load $tDb tables:\n"); foreach my $qDb (@qDbs) { my $qPrefix = &dbToPrefix($qDb); HgAutomate::verbose(1, " run.$tDb.$qDb/loadPairwise.csh\n"); } HgAutomate::verbose(1, "\n"); } if (! $opt_targetOnly) { my $tPrefix = &dbToPrefix($tDb); HgAutomate::verbose(1, " *** -noLoad was specified -- you can run these scripts " . "manually to load ${tPrefix}BlastTab in query databases:\n"); foreach my $qDb (@qDbs) { my $qPrefix = &dbToPrefix($qDb); HgAutomate::verbose(1, " run.$qDb.$tDb/loadPairwise.csh\n"); } HgAutomate::verbose(1, "\n"); } } else { if (! $opt_queryOnly) { HgAutomate::verbose(1, " *** Check these tables in $tDb:\n *** "); if (! $opt_noSelf) { HgAutomate::verbose(1, $tGenesetPrefix . 'BlastTab '); } foreach my $qDb (@qDbs) { my $qPrefix = &dbToPrefix($qDb); HgAutomate::verbose(1, $qPrefix . 'BlastTab '); } HgAutomate::verbose(1, "\n"); } if (! $opt_targetOnly) { my $tPrefix = &dbToPrefix($tDb); HgAutomate::verbose(1, " *** Check $tPrefix" . "BlastTab in these databases:\n *** "); foreach my $qDb (@qDbs) { HgAutomate::verbose(1, "$qDb "); } HgAutomate::verbose(1, "\n"); } } HgAutomate::verbose(1, "\n"); } # celebrate ########################################################################### # MAIN # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); &checkOptions(); # Parse config file into target and query db specs and fasta paths. &usage(1) if (scalar(@ARGV) != 1); ($CONFIG) = @ARGV; &parseConfig($CONFIG); # Split target fasta. &HgAutomate::run("$HgAutomate::runSSH $distrHost mkdir $scratchDir"); my $tFasta = $dbToFasta{$tDb}; &splitSequence($tDb, $tFasta); # Self blastp. &formatSequence($tDb, $tFasta); if (! $opt_noSelf && ! $opt_queryOnly) { &runPairwiseBlastp($tDb, $tDb, $selfE, $selfMaxPer); &loadPairwise($tDb, $tDb, $tGenesetPrefix, $selfMaxPer); } # For each query db, pairwise blastp. my $tPrefix = &dbToPrefix($tDb); foreach my $qDb (@qDbs) { my $qFasta = $dbToFasta{$qDb}; my $qPrefix = &dbToPrefix($qDb); my $e = &calcE($tDb, $qDb); if ($recipBest{$qDb} || ! $opt_queryOnly) { # tDb vs qDb &formatSequence($qDb, $qFasta); &runPairwiseBlastp($tDb, $qDb, $e, $pairwiseMaxPer); } if (! $recipBest{$qDb} && ! $opt_queryOnly) { &loadPairwise($tDb, $qDb, $qPrefix, $pairwiseMaxPer); } if ($recipBest{$qDb} || ! $opt_targetOnly) { # qDb vs tDb &splitSequence($qDb, $qFasta); &runPairwiseBlastp($qDb, $tDb, $e, $pairwiseMaxPer); } if (! $recipBest{$qDb} && ! $opt_targetOnly) { &loadPairwise($qDb, $tDb, $tPrefix, $pairwiseMaxPer); } if ($recipBest{$qDb}) { &recipBest($tDb, $qDb); if (! $opt_queryOnly) { &loadPairwise($tDb, $qDb, $qPrefix, $pairwiseMaxPer, "$tDb.$qDb.recipBest.tab"); } if (! $opt_targetOnly) { &loadPairwise($qDb, $tDb, $tPrefix, $pairwiseMaxPer, "$qDb.$tDb.recipBest.tab"); } } } # Clean up. &cleanup($tDb, @qDbs); &celebrate();