#!/usr/bin/perl # # Module to generate phylogenetic profiles and perform statistical analysis. This module will probably not be updated again # since I'm working on more efficient implementations of the same functions. use Bio::SeqIO; use Bio::Seq; use Bio::SeqFeatureI; use Bio::AnnotationI; use Bio::Tools::Run::RemoteBlast; sub blast_here { my($in_file, $start, $end, @query_species) = @_; my %species = init_prof($in_file, $start, $end, @query_species); my $count = $start-1; my %total; for $x ($start .. $end) { print "\nProgress: $count of $end\n\n"; my $blast_report = new Bio::SearchIO ('-format' => 'blast', '-file' => $in_file.$x.".txt"); my $result = $blast_report->next_result; process_result($result, \%species, \%total, @query_species); $count++; } final_stats($count - $start + 1, \%total, @query_species); } sub blast_away { my($in_file, $in_format, $out_file, $start, $end, @query_species) = @_; my %species = init_prof($in_file, $start, $end, @query_species); # We load the sequences and create our sequence objects my $seqio = Bio::SeqIO->new( '-format' => "$in_format" , -file => "$in_file"); if ($start > 1) { for (2 .. $start) { $seq = $seqio->next_seq(); $x++; } } while (my $seq = $seqio->next_seq()) { last if $x == $end; $ref_name{$seq->display_id()} = $seq->accession_number(); # Get the reference sequence's accession number, if available push @query, $seq; $x++; } $end = ($#query+1) unless defined($end); # Creates the object for the method my $factory = Bio::Tools::Run::RemoteBlast->new( '-prog' => 'blastp', # We will BLAST proteins, so BLASTP is advizable '-data' => 'nr', _READMETHOD => "Blast", '-expect' => '1e-10' # This is the E cut-off value; if you are using evolutionary distant species, you should use a higher value ); # Multiple submission to the NCBI server my $blast_report = $factory->submit_blast(\@query); my $max_number = 100; my $trial = 0; my %total; my $count = $start-1; # Retrieve the results from the queue at NCBI while ( my @rids = $factory->each_rid ) { print STDERR "\nSorry, maximum number of retries $max_number exceeded\n" if $trial >= $max_number; last if $trial >= $max_number; $trial++; print STDERR "waiting... ".(5*$trial)." units of time\n" ; # RID = Remote Blast ID (e.g: 1017772174-16400-6638) foreach my $rid ( @rids ) { my $rc = $factory->retrieve_blast($rid); if( !ref($rc) ) { if( $rc < 0 ) { # retrieve_blast returns -1 on error $factory->remove_rid($rid); } # retrieve_blast returns 0 on 'job not finished' sleep 5*$trial; } else { #---- Blast done ---- # Save the resulting BLAST report print "\nProgress: $count of $end\n\n"; $factory->save_output(">".$out_file.($count+1).".txt") if defined($out_file); $factory->remove_rid($rid); my $result = $rc->next_result; process_result($result, \%species, \%total, @query_species); $count++; } } } final_stats($count - $start + 1, \%total, @query_species); } sub init_prof { my($in_file, $start, $end, @query_species) = @_; # In order to add another species, all you must do is add the corresponding pair as below my %species = ("man", "Homo sapiens", "mouse", "Mus musculus", "rat", "Rattus norvegicus", "fugu", "Takifugu rubripes", "worm", "Caenorhabditis elegans", "fly", "Drosophila melanogaster", "yeast", "Saccharomyces cerevisiae", "ecoli", "Escherichia coli"); # If we don't start the query at the first sequence, it's assumed there was a connection problem and so we add the results we get to the already existing file instead of creating a new file my $tmp = ">"; if ($start > 1) { $tmp = ">>"; } # The simple 1/0 phylogenetic profile will be saved as profile.txt open RESULT, "$tmp"."profile.txt"; # A more detailed comparison between proteins is saved as comparison.txt open REPORT, "$tmp"."comparison.txt"; # We create the headers in both output files unless ($start > 1) { my $xy = "Reference gene "; foreach (@query_species) { my $int = length($_); foreach (1 .. (6 - $int)) { $xy .= " "; } $xy .= $_; #foreach (1 .. (6 - $int)) { $xy .= " "; #} } print RESULT "$xy\n"; print REPORT "Multiple species comparison\n"; } return %species; } sub process_result { my $result = shift @_; my $species = shift @_; my $total = shift @_; my @query_species = @_; # We start by print the reference gene name and length my $query_name = $result->query_name(); my $query_length = $result->query_length(); if ($ref_name{$query_name} =~ /\w+/) { $query_acc = $ref_name{$query_name}; } else { $query_acc = "Not available"; } my $xy = "$query_name ($query_length aa):"; foreach (1 .. (24 - length($xy))) { $xy .= " "; } print RESULT $xy; print REPORT "\nReference gene: $query_name, $query_length aa & $query_acc\n"; undef %retrieve; # Here we use an object from the Bio::Search::Hit::GenericHit class while( $hit = $result->next_hit ) { foreach my $organism (@query_species) { if ($hit->description() =~ /$species->{$organism}/i ) { my @out = split / /, $retrieve{$organism}; my $score = $hit->raw_score(); if ( $score > $out[1] ) { undef @out; my $name = $hit->name(); my $length = $hit->length(); my $ident = $hit->frac_identical(); my $conserv = $hit->frac_conserved(); my $hitac = $hit->accession(); print "$query_name hit found in $organism\nScore: ", $score, " frac_identical: ", $ident, "\n"; # So the user knows what is going on and keeps track of the progress @out = ($name, $score, $length, $ident, $conserv, $hitac); $retrieve{$organism} = "@out"; } } } } # Now we treat the results statistically and print them into file foreach $org (@query_species) { undef @find; @find = split / /, $retrieve{$org}; my $found = 0; # This is the formula that determines whether a hit is a functionally similar protein or not. At present, the hit must have at least 60% of the length of the query sequence and have a conservation of at least 40% if ( $find[1] != 0 && ($find[2]/$query_length) > 0.6 && $find[4] > 0.4) { $found = 1; my @z = undef; @z = split / /, $total->{$org}; ++$z[3]; $total->{$org} = "@z"; } print RESULT "$found "; # The following piece of code prints a more detailed report print REPORT "$org: "; if ( $find[1] != 0) { my @z = undef; @z = split / /, $total->{$org}; ++$z[0]; # Number of hits $z[1] += $find[3]; # Similarity $z[2] += $find[4]; # Positives $total->{$org} = "@z"; print REPORT "$find[5], $find[2] aa, "; printf REPORT "Similarity = %.1f%%, Positives = %.1f%%\n", $find[3]*100, $find[4]*100; } else { print REPORT "Not found\n"; } } print RESULT "\n"; } sub final_stats { my $count = shift @_; my $total = shift @_; my @query_species = @_; # Printing the final stats print RESULT "Final results "; foreach $org (@query_species) { my @z = undef; @z = split / /, $total->{$org}; $z[3] = 0 unless defined($z[3]); my $xy = $z[3]."/".$count; foreach (1 .. (9 - length($xy))) { $xy .= " "; } print RESULT $xy; } print RESULT "\nPercentage "; print REPORT "\nFinal results:\n"; foreach $org (@query_species) { my @z = undef; @z = split / /, $total->{$org}; my $temp = ""; my $xy = ($z[3]/$count)*100; printf RESULT "%.0f%%", $xy; foreach (1 .. (8 - length(int("$xy")))) { $temp .= " "; } print RESULT $temp; # For the detailed results print REPORT "$org: "; my $xy = ($z[0]/$count)*100; if ($z[0] == 0) { $aa_sim = 0; $aa_con = 0; } else { $aa_sim = $z[1]/$z[0]*100; $aa_con = $z[2]/$z[0]*100; } printf REPORT "Homologs found: %.1f%%, Overall aa similarity = %.1f%%, Overall aa positives = %.1f%%\n", $xy, $aa_sim, $aa_con; } print RESULT "\n"; print REPORT "\n"; } # Subroutine to find 1:1 orthologs, assuming man to be the reference organism sub find_orthologs { my ($organism, @user) = @_; my %query; # hash with the reference => query accession number pairs my %names; # hash with the names of the genes we are loooking for foreach (@user) { my @x = split /\&/, $_; my $query_acc = ""; my $find_acc = ""; foreach (@x) { my @z = split /,/, $_; if ($z[0] eq "man") { # At present, we get the query accession number from the man hit because I normally keep the old accession number (often from a DNA sequence) in the EMBL sequence file. This can be changed in the future quite easily. $query_acc = $z[1]; $query_acc =~ s/ //; my @y = split /,/, $x[0]; $names{"$query_acc"} = $y[0]; } if ($z[0] eq $organism) { $find_acc = $z[1]; $find_acc =~ s/ //; } } $query{"$query_acc"} = $find_acc; # %query holds the pair of accession numbers we want to compare } foreach (keys %query) { my $query_id = $query{$_}; if ( $query_id =~ /NP(\w)+/) { $database = new Bio::DB::RefSeq; } else { $database = new Bio::DB::GenPept; } $seq = $database->get_Seq_by_acc("$query_id"); push @query_seq, $seq; } my $factory = Bio::Tools::Run::RemoteBlast->new( '-prog' => 'blastp', '-data' => 'nr', _READMETHOD => "Blast", '-expect' => '1e-20' # Since we are looking for human proteins, the score can be quite high ); my $blast_report = $factory->submit_blast(\@query_seq); my $max_number = 100; my $trial = 0; my $count = 0; # Retrieve the results from the queue at NCBI while ( my @rids = $factory->each_rid ) { print STDERR "\nSorry, maximum number of retries $max_number exceeded\n" if $trial >= $max_number; last if $trial >= $max_number; $trial++; print STDERR "waiting... ".(5*$trial)." units of time\n" ; # RID = Remote Blast ID (e.g: 1017772174-16400-6638) foreach my $rid ( @rids ) { my $rc = $factory->retrieve_blast($rid); if( !ref($rc) ) { if( $rc < 0 ) { # retrieve_blast returns -1 on error $factory->remove_rid($rid); } # retrieve_blast returns 0 on 'job not finished' sleep 5*$trial; } else { #---- Blast done ---- $factory->save_output(">orthol".($count+1).".txt"); # Save blast reports $count++; undef @hits; print "Progress: $count of $end\n"; $factory->remove_rid($rid); my $result = $rc->next_result; while( $hit = $result->next_hit ) { if ($hit->description() =~ /Homo sapiens/i ) { my $score = $hit->raw_score(); my $hitac = $hit->accession(); @hits = ($hitac, $score) unless ($hits[1] > $score); } } print "The best human homologue is: $hits[0]\n"; foreach (keys %query) { if ($_ eq $hits[0]) { print "Found match of $names{$_}\n"; push @orthologs, $_; } } } } } open OUT, ">orthologs.txt"; print OUT "List of 1:1 orthologs in $organism:\n\n"; foreach $find (keys %query) { print OUT "$names{$find}: "; my $tmp = 0; foreach (@orthologs) { $tmp = -1 unless $_ ne $find; } if ($tmp = -1) { print OUT "found 1:1 ortholog\n"; } else { print OUT "did not found 1:1 ortholog\n"; } } } 1;