Esse script carrega em um hash a tabela de codon usage (reference) de 6.222 ORF's de leveduras, depois conta a frequência de códons de um arquivo de genes (query) e compara a frequência de códons da query x reference.
Versão beta (com super auxílio do Rondon)
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;
# use diagnostics;
die "\nUSAGE::: perl $0 [query.fasta] [reference.fasta(optional)]\n\n" if !$ARGV[0];
#gera tabela de codon usage para a query
my $query_file = $ARGV[0];
my %aux_query1= %{usage ($query_file)};
open(FILE, 'table_yeast_usage_parsed.txt') or die;
my %table_usage;
foreach my $line (<FILE>){
chomp $line;
if ( $line =~ /^Codon/ ) { next }
elsif ( $line !~ /\w/ ) { next }
my ($codon,$aa,$contagem,$fraction) = split ('\t', $line);
# print "$fraction'\n";
$table_usage{$codon}{aa}=$aa;
$table_usage{$codon}{contagem}=$contagem;
$table_usage{$codon}{fraction}=$fraction;
}
# my $contagem_total = %{%usage_reference};
my $ref = contagem_aa_totais( \%aux_query1 );
my %hash = %{$ref};
my $ref2 = gera_valor_normalizado_reference(\%hash);
my %fraction_query = %{$ref2};
foreach my $key(sort {$fraction_query{$a}{aa} cmp $fraction_query{$b}{aa}} keys %fraction_query){ #acessar os valores dentro do hash
print "$key\t$fraction_query{$key}{aa}\t$fraction_query{$key}{normalizado}\t$table_usage{$key}{fraction}\n";
}
close FILE;
exit;
sub contagem_aa_totais {
#Faz a contagem total das aparicoes dos aa pra normalizar os valores. Pega o hash de dados da Referencia e insere os resultados na mesma tabela.
my $ref = shift;
my %usage_reference = %{$ref};
#Gera array com os aa usados
my @aa;
while (my ($codon, $ref) = each %usage_reference) {
my %valores = %{$ref};
foreach my $key (keys %valores){
if ($key eq "aa") { push @aa, $valores{$key} }
}
}
#soma o total de cada aa
foreach my $a (@aa) {
my $sum = 0;
#soma cada aa
foreach my $codon (keys %usage_reference){
if ( $a eq $usage_reference{$codon}{aa} ) { $sum += $usage_reference{$codon}{contagem} }
}
#insere o valor no hash
foreach my $codon (keys %usage_reference){
if ( $a eq $usage_reference{$codon}{aa} ) { $usage_reference{$codon}{aa_totais} = $sum }
}
}
return \%usage_reference;
}
sub gera_valor_normalizado_reference {
my $ref = shift;
my %usage_reference = %{$ref};
foreach my $codon (keys %usage_reference) {
$usage_reference{$codon}{normalizado} = $usage_reference{$codon}{contagem} / $usage_reference{$codon}{aa_totais};
}
return \%usage_reference;
}
sub conta_codons {
# conta os codons de um arquivo fasta
my $file_name = shift;
my $file_obj = Bio::SeqIO -> new ( -file => $file_name,
-format => 'fasta' );
#gera hash com a quantidade de vezes que cada codon aparece.
my %codons;
while (my $inseq = $file_obj->next_seq) {
# grava cada seq na variavel $sequence
my $sequence = $inseq->seq();
#anda de 3 em 3 (tem q ver se da certo)
for(my $i=0; $i < (length($sequence) - 2) ; $i += 3){
my $codon = substr($sequence,$i,3);
#se o codon for novo na tabela, ele recebe o valor 1
if (!defined $codons{$codon}) { $codons{$codon} = 1 }
# se ele for recorrente, apenas soma 1.
else { $codons{$codon}++; }
}
}
return \%codons;
}
sub gera_valor_normalizado_query {
#insere na tabela da query, qual e a freq normalizada (em %) de cada codon
# essa subrotina recebe dois valores: has_query e hash_referencia
my $ref_query = shift;
my %usage_query = %{$ref_query};
my $ref_reference = shift;
my %usage_reference = %{$ref_reference};
foreach my $codon (keys %usage_query) {
$usage_query{$codon}{freq_codon_normalizada} = $usage_reference{$codon}{normalizado};
}
return \%usage_query;
}
sub table_aa {
#Verifica e salva quais sao os aa dos codons e suas freq
my $ref = shift;
my %codons = %{$ref};
my %usage;
foreach my $codon (keys %codons) {
if ( $codon =~ /GC./i) { $usage{$codon}{aa}= "ala"; $usage{$codon}{contagem}= $codons{$codon}; } # Alanine
elsif ( $codon =~ /TG[TC]/i) { $usage{$codon}{aa}= "cys"; $usage{$codon}{contagem}= $codons{$codon}; } # Cysteine
elsif ( $codon =~ /GA[TC]/i) { $usage{$codon}{aa}= "aca"; $usage{$codon}{contagem}= $codons{$codon}; } # Aspartic Acid
elsif ( $codon =~ /GA[AG]/i) { $usage{$codon}{aa}= "glu"; $usage{$codon}{contagem}= $codons{$codon}; } # Glutamic Acid
elsif ( $codon =~ /TT[TC]/i) { $usage{$codon}{aa}= "phe"; $usage{$codon}{contagem}= $codons{$codon}; } # Phenylalanine
elsif ( $codon =~ /GG./i) { $usage{$codon}{aa}= "gly"; $usage{$codon}{contagem}= $codons{$codon}; } # Glycine
elsif ( $codon =~ /CA[TC]/i) { $usage{$codon}{aa}= "his"; $usage{$codon}{contagem}= $codons{$codon}; } # Histidine
elsif ( $codon =~ /AT[TCA]/i) { $usage{$codon}{aa}= "iso"; $usage{$codon}{contagem}= $codons{$codon}; } # Isoleucine
elsif ( $codon =~ /AA[AG]/i) { $usage{$codon}{aa}= "lys"; $usage{$codon}{contagem}= $codons{$codon}; } # Lysine
elsif ( $codon =~ /TT[AG]|CT./i) { $usage{$codon}{aa}= "leu"; $usage{$codon}{contagem}= $codons{$codon}; } # Leucine
elsif ( $codon =~ /ATG/i) { $usage{$codon}{aa}= "met"; $usage{$codon}{contagem}= $codons{$codon}; } # Methionine
elsif ( $codon =~ /AA[TC]/i) { $usage{$codon}{aa}= "asp"; $usage{$codon}{contagem}= $codons{$codon}; } # Asparagine
elsif ( $codon =~ /CC./i) { $usage{$codon}{aa}= "pro"; $usage{$codon}{contagem}= $codons{$codon}; } # Proline
elsif ( $codon =~ /CA[AG]/i) { $usage{$codon}{aa}= "glu"; $usage{$codon}{contagem}= $codons{$codon}; } # Glutamine
elsif ( $codon =~ /CG.|AG[AG]/i) { $usage{$codon}{aa}= "arg"; $usage{$codon}{contagem}= $codons{$codon}; } # Arginine
elsif ( $codon =~ /TC.|AG[TC]/i) { $usage{$codon}{aa}= "ser"; $usage{$codon}{contagem}= $codons{$codon}; } # Serine
elsif ( $codon =~ /AC./i) { $usage{$codon}{aa}= "thr"; $usage{$codon}{contagem}= $codons{$codon}; } # Threonine
elsif ( $codon =~ /GT./i) { $usage{$codon}{aa}= "val"; $usage{$codon}{contagem}= $codons{$codon}; } # Valine
elsif ( $codon =~ /TGG/i) { $usage{$codon}{aa}= "try"; $usage{$codon}{contagem}= $codons{$codon}; } # Tryptophan
elsif ( $codon =~ /TA[TC]/i) { $usage{$codon}{aa}= "tyr"; $usage{$codon}{contagem}= $codons{$codon}; } # Tyrosine
elsif ( $codon =~ /TA[AG]|TGA/i) { $usage{$codon}{aa}= "stp"; $usage{$codon}{contagem}= $codons{$codon}; } # Stop
else { $usage{$codon}{aa}= "err"; $usage{$codon}{contagem}= $codons{$codon}; } # base diferente
}
return \%usage;
}
sub usage {
#conta codons num arquivo fasta ou multifasta e gera tabela com codon, aa e freq e outros dados
my $query_file = shift;
#fre de codons
my %codons = %{conta_codons($query_file)};
#associa a contagem dos codons com os aminoacidos
my %usage = %{table_aa(\%codons)};
return \%usage;
}