[ensembl-dev] retrieve conservation score of all human exons

Julien Roux julien.roux at unil.ch
Thu Jan 16 08:03:00 GMT 2014


Dear list,
I have been trying to extract the mean GERP conservation score for all 
human exons using the Ensembl API.
My script is working properly, however it is quite slow (only ~1000 
exons per hour) and I am wondering if there would be a way to speed it up.
I have tried to set "display_size" to 1, but t doesn't seem to give 
yield the exact mean score (because it is an average of precalculated 
averages I guess).
I am putting the script below.
Thanks for your help
Julien

#!/usr/bin/env perl
use strict;
use warnings;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Utils::Exception qw(throw);

# This script is inspired by the example script at 
http://cvs.sanger.ac.uk/cgi-bin/viewvc.cgi/ensembl-compara/scripts/examples/dna_getConservationScores.pl?root=ensembl&view=markup

my $reg = "Bio::EnsEMBL::Registry";
my $species = "Homo sapiens";
$reg->load_registry_from_db(
       -host => "ensembldb.ensembl.org",
       -user => "anonymous",
);

#get slice adaptor for $species
my $slice_adaptor = $reg->get_adaptor($species, 'core', 'Slice');

#get exon adaptor for $species
my $exon_adaptor = $reg->get_adaptor($species, 'Core', 'Exon');

#get method_link_species_set adaptor
my $mlss_adaptor = $reg->get_adaptor("Multi", "compara", 
"MethodLinkSpeciesSet");

#get method_link_species_set object for gerp conservation scores for mammals
my $mlss = 
$mlss_adaptor->fetch_by_method_link_type_species_set_name("GERP_CONSERVATION_SCORE", 
"mammals");

#get conservation score adaptor
my $cs_adaptor = $reg->get_adaptor("Multi", 'compara', 'ConservationScore');

# fetch all exon IDs
my @exon_ids = @{$exon_adaptor->list_stable_ids()};

foreach my $exon (@exon_ids){
   print $exon, "\n";
   my $slice = $slice_adaptor->fetch_by_exon_stable_id($exon);

   #get one score per base in the slice
   my $display_size = $slice->end - $slice->start + 1;
   my $scores = 
$cs_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($mlss, $slice, 
$display_size);

   # average the scores in the slice
   my $sum = 0;
   my $count = 0;
   my $diff_score;
   foreach my $score (@$scores) {
     if (defined $score->diff_score) {
       $sum += $score->diff_score;
       $count++;
     }
   }
   if ($count > 0){
     $diff_score = $sum / $count;
   }
   if (defined $diff_score){
     print "$exon\t$diff_score\n";
   }
   else {
     print "$exon\tNA\n";
   }
}
close OUT;
exit;

-- 
Julien Roux, PhD
Gilad lab, Department of Human Genetics, University of Chicago
http://giladlab.uchicago.edu/
920 East 58th Street, CLSC 317, Chicago, IL 60637, USA
tel: +1-773-834-1984   fax: +1-773-834-8470


-- 
Julien Roux, PhD
Gilad lab, Department of Human Genetics, University of Chicago
http://giladlab.uchicago.edu/
920 East 58th Street, CLSC 317, Chicago, IL 60637, USA
tel: +1-773-834-1984   fax: +1-773-834-8470




More information about the Dev mailing list