[ensembl-dev] different figures between ensembl and biomart

Andrea Edwards edwardsa at cs.man.ac.uk
Thu Nov 18 17:24:04 GMT 2010


Hi

I have some code (see below) to get all of the exons in ensembl for cow 
database release 58. I got 225838 using this method. However a colleague 
of mine accessed all of the cow  exons using biomart and obtained 257029.

At present i don't have access to the code that accessed biomart but

1) is it possible to ask someone at ensembl for a definitive count of 
the number of exons in cow 58
2) can anyone see anything obvious in my code below why i might be 
missing any exons. I simply get all genes and all their transripts and 
all their exons. This includes all gene biotypes including protein 
coding and rna genes. My count without including RNA genes was 220 000 
ish. I can't imagine where an extra 25000 exons can come from unless 
(and I know this is speculation) the biomart script has duplicates for 
an exon when it appears in multiple transcripts whereas i only get 
unique exons.

thanks in advance


#!/usr/local/bin/perl

use lib '/home/andrea_bio/myperl/libs/ensembl-58/modules';
use lib '/home/andrea_bio/myperl/libs/ensembl-variation-58/modules';
use warnings;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor;
use DBI;
use DBD::mysql;


require "utilities.pl";

open INSERT_EXON_LOG, ">>insert_exon_log.txt" or die "could not open 
insert exon log: $!";
print INSERT_EXON_LOG "Start: ", getTime() , "\n";

my $registry = 'Bio::EnsEMBL::Registry';
$registry_file= "/home/andrea_bio/myperl/dev/snp/config.txt";
$registry->load_all($registry_file);



my $gene_adaptor = $registry->get_adaptor( 'bos_taurus', 'Core', 'Gene' );
my $genes = $gene_adaptor->fetch_all();
my $transcript_adaptor = $registry->get_adaptor( 'bos_taurus', 'Core', 
'Transcript' );
my $exon_adaptor = $registry->get_adaptor( 'bos_taurus', 'Core', 'Exon' );
my $dbh = DBI->connect('dbi:mysql:annotationDB','root','and1jon1', { 
'RaiseError' =>0});


my $sth = $dbh->prepare("insert into Bta40_Ens58_all  (EnsemblExonID, 
EnsemblGeneID, EnsemblTranscriptID, ChromosomeName, GeneStart, GeneEnd, 
TranscriptStart, TranscriptEnd, Strand, AssociatedGeneName, 5UTRStart, 
5UTREnd, 3UTRStart, 3UTREnd, CDSStart, CDSEnd, CDSLength,Description, 
GeneBiotype, ExonChrStart, ExonChrEnd, 
ConstitutiveExon,ExonRankinTranscript, Phase) Values( 
?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");


foreach $gene(@{$genes}) {

     #if ($gene->biotype eq 'protein_coding' ) {

         my $transcripts = $transcript_adaptor->fetch_all_by_Gene($gene);

         foreach $transcript(@{$transcripts}) {

             $exons = $exon_adaptor->fetch_all_by_Transcript($transcript);

             foreach $exon(@{$exons}) {


                 $select_query = "Select count(*) from Bta40_Ens58_all 
where EnsemblExonID = ?";
                 $rs = $dbh->prepare($select_query);
                 $rs->execute($exon->display_id);
                 $count = $rs->fetch()->[0];
                 print $exon->display_id, "\n";

                 unless ($count)  {

                     print "exon not found\n";





                     if ($sth->execute($exon->display_id , 
$gene->display_id,  $transcript->display_id , $gene->chr_name,  
$gene->start ,$gene->end ,$transcript->start ,$transcript->end , 
$gene->strand ,$gene->external_name , 0, 0, 0, 0, 
$transcript->cdna_coding_start , $transcript->cdna_coding_end ,0, 
$gene->description,  $gene->biotype,$exon->start , $exon->end 
,$exon->is_constitutive ,1,$exon->phase ))         {
                         $inserts++;
                                                 print "$inserts\n";

                     } else {
                         $insert_errors++;
                         my @inputs = ($exon->display_id , 
$gene->display_id,  $transcript->display_id , $gene->chr_name,  
$gene->start ,$gene->end ,$transcript->start ,$transcript->end , 
$gene->strand ,$gene->external_name , 0, 0, 0, 0, 
$transcript->cdna_coding_start , $transcript->cdna_coding_end ,0, 
$gene->description,  $gene->biotype,$exon->start , $exon->end 
,$exon->is_constitutive ,1,$exon->phase);
                         $" = ', ';
                         print INSERT_EXON_LOG "failed to insert known 
exon: @inputs\nError: $DBI::errstr\n";

                     }#end if insert ok


                 } #end exon not found so insert it

             }#end for each exon
         }#end for each transcript
     #}#end if gene is protein coding

}#end for each gene


print INSERT_EXON_LOG "inserts: $inserts\n";
print INSERT_EXON_LOG "insert errors: $insert_errors\n";
print INSERT_EXON_LOG "End: ", getTime() , "\n";
close INSERT_EXON_LOG;
print "done";






More information about the Dev mailing list