[ensembl-dev] Memory leaks (cyclic references)

Kim Brugger kim.brugger at easih.ac.uk
Fri Sep 17 12:08:15 BST 2010


  Hi

I have some data sets with ~40,000 SNPs that I am analysing with 
a modified version of the snp_effect_predictor.pl script. Oddly enough 
the scripts never manage to finish due to excessive memory usage (+12GB 
ram). After cutting down the script to its minimum it is now clear that 
the ensembl code suffers from memory leaks and I suspect this is due to 
cyclic references.

My primary way of doing this has been by using Devel::Leak. This tells 
you the number of objects in the scalar table. In a perfect world the 
number of entries in this table should be constant as the variables are 
being garbage collected. This is not the case with the objects 
originating from the ensembl code. Looking at some of these objects with 
Devel::Cycle shows that the slice objects contain ~40 cyclic 
references, and TranscriptVariation objects up to 6000+ cyclic references.

The number of scalars at the end of runs with varying numbers of 
loop iterations:

Normal:
10xloop : 141207 scalars
20xloop:  147866 scalars
100xloop: 195713 scalars

No caching:
10xloop : 140819 scalars
20xloop:  147608 scalars

Weakening Objects:
10xloop :  134188 scalars
20xloop:   133303 scalars

Undef Objects:
10xloop :  134188 scalars
20xloop:   133303 scalars
30xloop:   133537 scalars

I have not verified that the data from the weakened run matches that from 
the normal run, as I have removed all prints and storage of data.

Looking through the core ensembl code I can only see weaken called once. 
As the objects contain a large number of cyclic references, this is 
probably something that should be considered, as people analysing large 
datasets will run into this problem.

I have concatenated the script that I have used for my tests at the end 
of this email.

Cheers,

Kim

-- 
==========================================================
Kim Brugger
EASIH, University of Cambridge
www.easih.ac.uk
==========================================================


#!/usr/bin/perl
#
#
#
#
# Kim Brugger (17 Sep 2010), contact: kim.brugger at easih.ac.uk

use Devel::Leak;
use Devel::Cycle;
use Devel::Peek;
use Scalar::Util qw/weaken/;

use strict;
use warnings;
use Data::Dumper;

use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor;
use Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;

# Run configuration: species alias and connection details handed to the
# registry below.
my $species     = "human";
my $host        = 'ensembldb.ensembl.org';
my $user        = 'anonymous';
# Number of passes of the measurement loop: first command-line argument,
# falling back to 10 when it is absent or false (so "0" also gives 10).
my $loops       = shift || 10;

# Experiment switches, read inside variation_effects(): when true, the
# objects fetched there are weaken()ed / set to undef after use.
my $weaken = 0;
my $undef  = 0;

# get registry
# The registry is used through its class name; every call below is a
# class-method call on Bio::EnsEMBL::Registry.
my $reg = 'Bio::EnsEMBL::Registry';
# NOTE(review): -NO_CACHE is passed as 0 here; presumably the "No caching"
# figures were produced by changing this value - confirm.
$reg->load_registry_from_db(-host => $host,-user => $user, -NO_CACHE => 0);
my $vfa = $reg->get_adaptor($species, 'variation', 'variationfeature');
my $tva = $reg->get_adaptor($species, 'variation', 'transcriptvariation');
my $sa  = $reg->get_adaptor($species, 'core', 'slice');
my $ga  = $reg->get_adaptor($species, 'core', 'gene');
# NOTE(review): $afa is not referenced anywhere else in the code shown
# here; kept because fetching it may still affect what gets loaded.
my $afa = $reg->get_adaptor($species, 'funcgen', 'AnnotatedFeature');

# Always look at the same set of "random" data
# (fixed seed, so the rand() calls below yield the same positions on
# every run and runs with different settings stay comparable).
srand( 2345678);

# Devel::Leak bookkeeping: the value returned by NoteSV() is what gets
# printed as the scalar count. The first call's return value is
# discarded; the second call supplies the "Start" figure.
# NOTE(review): Devel::Leak documents CheckSV() as the end-of-run
# counterpart of NoteSV(); here NoteSV() is used for every count -
# confirm this is intended (it avoids CheckSV's per-SV dump).
my $handle;
Devel::Leak::NoteSV($handle);
print STDERR "Start:".(Devel::Leak::NoteSV($handle))."\n";
# Measurement loop: each pass builds two synthetic variation features
# (one on chromosome 1, one on chromosome 2) and pushes them through
# variation_effects().
for( my $i = 0; $i< $loops; $i++ ) {

   my @vcfs;
   foreach my $chr (1, 2) {

     my $slice;
     $slice = $sa->fetch_by_region('chromosome', $chr);

     # Pseudo-random integer in 0..199999, used as the feature start.
     my $pos =    int(rand(200000));

     # Hand-built feature spanning $pos..$pos+1 on the forward strand;
     # it receives the slice and the variation-feature adaptor.
     my $new_vf = Bio::EnsEMBL::Variation::VariationFeature->new(
       -start          => $pos,
       -end            => $pos + 1,
       -slice          => $slice,
       -allele_string  => "AA",
       -strand         => 1,
       -map_weight     => 1,
       -adaptor        => $vfa,
       -variation_name => '--',
      );
     push @vcfs, $new_vf;
     # Clear this scope's own handle on the slice; the feature above was
     # given the slice through -slice.
     $slice = undef;
   }
   # NOTE(review): $effects is never read; variation_effects() has no
   # explicit return value.
   my $effects = variation_effects(\@vcfs);
   # Empty the list so this scope no longer holds the features.
   @vcfs = ();

# Debugging aids, left disabled. $slice is declared inside the inner
# foreach above, so it is out of scope here; under "use strict" these
# lines would need to move into that loop before being re-enabled.
#    find_cycle($slice);
#    find_weakened_cycle($slice);
#    exit;
#    print Dumper( $slice );
}
# Final scalar count; compare with the "Start" figure to see growth.
print STDERR "Finished :".(Devel::Leak::NoteSV($handle))."\n";


#
# variation_effects($var_features)
#
# Walks every transcript-level consequence of the given variation
# features and touches the related objects (existing co-located
# variations, gene, xrefs, translation, protein features). All lookups
# are performed for their side effects on memory only: nothing is
# printed, stored or explicitly returned.
#
#   $var_features - array reference of variation feature objects.
#
# Reads the file-level lexicals $tva, $ga, $weaken and $undef.
# There is no return statement, so callers should not rely on the value.
#
# Kim Brugger (17 Sep 2010), contact: kim.brugger at easih.ac.uk
sub variation_effects {
   my ($var_features) = @_;

   # Return value is discarded; per the comment further down, this call
   # attaches the transcript variations to the features.
   $tva->fetch_all_by_VariationFeatures( $var_features );
   foreach my $vf (@$var_features) {

     # NOTE(review): $name is never read in this sub.
     my $name = $vf->variation_name();

     # Name of a known variation at exactly the same coordinates, if any.
     # NOTE(review): assigned below but never read afterwards.
     my $existing_vf = "";

     if(defined($vf->adaptor->db)) {
       my $fs = $vf->feature_Slice;

       # Normalise the slice so that start <= end, by swapping the hash
       # entries directly rather than going through accessors.
       if($fs->start > $fs->end) {
     ($fs->{'start'}, $fs->{'end'}) = ($fs->{'end'}, $fs->{'start'});
       }
       # Scan the features on that slice; one whose seq_region start and
       # end both equal those of $vf supplies the name (last match wins).
       foreach my $existing_vf_obj 
(@{$vf->adaptor->fetch_all_by_Slice($fs)}) {
     $existing_vf = $existing_vf_obj->variation_name
         if ($existing_vf_obj->seq_region_start == $vf->seq_region_start &&
         $existing_vf_obj->seq_region_end   == $vf->seq_region_end );
       }
     }

     # the get_all_TranscriptVariations here now just retrieves the
     # objects that were attached above - it doesn't go off and do
     # the calculation again
     foreach my $con (@{$vf->get_all_TranscriptVariations}) {

#      find_cycle( $con);

       # NOTE(review): $string is never used; the body simply runs once
       # per consequence type, so the lookups below are repeated for
       # each of them.
       foreach my $string (@{$con->consequence_type}) {

     # Put the cDNA coordinates in ascending order when both are set
     # (true) and start > end.
     ($con->{'cdna_start'}, $con->{'cdna_end'}) = ($con->{'cdna_end'}, 
$con->{'cdna_start'})
         if ($con->cdna_start && $con->cdna_end && $con->cdna_start > 
$con->cdna_end);


     # Same normalisation for the translation (peptide) coordinates.
     ($con->{'translation_start'}, $con->{'translation_end'}) = 
($con->{'translation_end'}, $con->{'translation_start'})
         if($con->translation_start &&  $con->translation_end && 
$con->translation_start > $con->translation_end);

     if ( $con->transcript ) {

       # NOTE(review): $gene and $xref are fetched but never read.
       my $gene = 
$ga->fetch_by_transcript_stable_id($con->transcript->stable_id);
       my $xref = $con->transcript->get_all_DBEntries('RefSeq_dna' );

       if ( $con->translation_start) {
         # NOTE(review): $old and $new are never read.
         my ( $old, $new ) = split("\/", $con->pep_allele_string);

         my $protein = $con->transcript->translation();

         my $prot_feats = $protein->get_all_ProteinFeatures();

         # shift removes each feature from the referenced array, so the
         # array is emptied as it is walked.
         while (my $prot_feat = shift @{ $prot_feats }) {
           my $logic_name = $prot_feat->analysis()->logic_name();

           # Only Pfam features are of interest; for any other feature
           # the undef/weaken lines below are skipped as well.
           next if ( $logic_name ne 'Pfam');

           # Containment test of the affected residues within the
           # feature; the body is empty, so the result is unused.
           if ($con->translation_start >= $prot_feat->start() and
           $con->translation_end <= $prot_feat->end() ) {
           }
           $prot_feat = undef if ( $undef );
           weaken($prot_feat) if ( $weaken );
         }
         # Experiment: weaken and/or drop the local handles, depending on
         # the $weaken / $undef switches.
         weaken($protein) if ( $weaken );
         $protein = undef if ( $undef );
         weaken($prot_feats) if ( $weaken );
         $prot_feats = undef if ( $undef );
       }
       weaken($gene) if ( $weaken );
       $gene = undef if ( $undef );
     }

       }
       # $con is the foreach alias, so these act on the element of the
       # array returned by get_all_TranscriptVariations itself.
       weaken($con) if ( $weaken );
       $con = undef if ( $undef );
     }
   }
}





More information about the Dev mailing list