[ensembl-dev] Memory footprint of script

Tue Nov 1 12:58:44 GMT 2011

Thank you Andy

I will try your workaround...

can you clarify if I should use both approaches together (the -no_cache
flag and the iteration) or I can I select only one of the methods?

Best regards

Duarte

On 1 November 2011 11:39, Andy Yates <ayates at ebi.ac.uk> wrote:

> Hi Duarte,
>
> This is a known issue in the Ensembl API where some adaptors did not
> create the correct caching data structure & held onto memory for the
> runtime of the program & has been fixed in the HEAD checkout. Until e65 is
> released I would suggest using the following flag in your call to the
> Registry:
>
> $reg->load_registry_from_db(
>        -host       => $host,
>        -user       => $user,
>        -pass       => $password,
>        -port       => $port,
>        -species    => 'Homo sapiens',
>        -verbose    => 1,
>         -no_cache   => 1,
> );
>
> It is also possible to use the Ensembl Iterators which chunk a region
> query into smaller chunks therefore reducing the overall memory consumption
> but at the cost of some runtime e.g.
>
> #get slice for chr
> my $slice;
> eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
> # if failed, try to get any seq region
> if(!defined($slice)) {
>  $slice = $sa->fetch_by_region(undef, $chr);
> }
>
> # get all vars
> my $iter = $vfa->fetch_Iterator_by_Slice($slice, undef, 200_000);
> while(my $var = $iter->next()) {
>  #Do things with your variation
> }
>
> The 3rd parameter in the method call is the size of chunking slice you
> require. The sizing of this will depend on how much memory you want to
> consume against the total runtime of the program.
>
> The other thing to be aware of is that under Linux systems (and a lot of
> *nix systems) once a process has consumed a block of memory it will never
> release that memory back to the system but Perl will still reuse the memory
> so long as the data is not held onto by a strong reference.
>
> Hope this helps,
>
> Andy
>
> On 1 Nov 2011, at 11:05, Duarte Molha wrote:
>
> > Dear Ensembl Devs
> >
> > I am struggling with an issue of the memory footprint of a script.
> > I have installed a local version of Ensembl V64 and have created a
> script that basically retrieves all known variation on any given chromosome
> and outputs to files some relevant information regarding allele frequencies
> and any annotations connected with each variation.
> >
> > Because I am looping though all variants and printing the data gathered
> to files I did not expect the memory footprint of the script to be as large
> as it is and it seems that for big chromosomes the script keeps increasing
> the memory allocated to him until it will eventually crash.
> >
> > Can any of you find where I have gone wrong on my code?
> > I appreciate any insights into what I might be doing wrong.
> >
> > Best regards
> >
> >       Duarte Molha
> >
> > Script:
> >
> > #!/usr/bin/perl -w
> >
> >
> > use lib
> '/array/dept/compbio/ensembl/current_api/ensembl-variation/modules';
>    # points to latest ensemblt core API
> >
> > use Bio::EnsEMBL::Registry;
> > use Statistics::Descriptive;
> > use Term::ProgressBar;
> > use IO::File;
> > use strict;
> >
> > my $host     = 'myHost';
> > my $user     =  'user';
> > my $password = 'pass';
> > my $port     = 3306;
> >
> > # get registry
> > my $reg = 'Bio::EnsEMBL::Registry';
> >
> >
> > $reg->load_registry_from_db(
> >       -host       => $host,
> >       -user       => $user,
> >       -pass       => $password,
> >       -port       => $port,
> >       -species    => 'Homo sapiens',
> >       -verbose    => 1,
> > );
> >
> > my $vfa   = $reg->get_adaptor('Homo sapiens', 'variation',
> 'variationfeature');
> > my $va    = $reg->get_adaptor('Homo sapiens', 'variation', 'variation');
> > my $sa    = $reg->get_adaptor('Homo sapiens', 'core', 'slice');
> > my $vaa   = $reg->get_adaptor('Homo sapiens', 'variation',
> 'variationannotation');
> >
> > my $freqs_file;
> > my $annot_file;
> >
> > my $chr = shift;
> >
> >
> > my $core_mca = $reg->get_adaptor('Homo sapiens', 'core',
> 'metacontainer');
> > unless ($core_mca){
> >       print STDERR "Failed to connect to local database\n";
> >       die;
> > }
> >
> >
> > $freqs_file = new
> IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_freqs.txt");
> > $annot_file = new
> IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_annots.txt");
> >
> > run_data($chr);
> >
> > $freqs_file->close();
> > $annot_file->close();
> >
> > #main sub to run all the vars found on each inputr chr
> > sub run_data{
> >       my $chr = shift;
> >
> >       #get slice for chr
> >       my $slice = "";
> >       eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
> >       # if failed, try to get any seq region
> >       if(!defined($slice)) {
> >               $slice = $sa->fetch_by_region(undef, $chr);
> >       }
> >
> >       # get all vars
> >       my @vfas = @{$vfa->fetch_all_by_Slice($slice)};
> >
> >       # progress bar initialization
> >       my $number_of_entries = scalar (@vfas);
> >       my $progress = Term::ProgressBar->new ({count =>
> $number_of_entries ,
> >                                               name => "Gathering Data
> for chr $chr :",
> >                                               ETA => 'linear',
> >                                       });
> >       $progress->max_update_rate(1);
> >       my $next_update = 0;
> >       my $i = 0;
> >
> >       # loop though all vars found
> >       foreach my $vf (@vfas) {
> >               #get freq data for var
> >               my @data1 = get_freqs_for_existing($vf);
> >               #print to file
> >               if ($data1[0]){
> >                       print $freqs_file @data1;
> >               }
> >
> >               #get annotation data for far
> >               my @data2 = get_phenotype_info($vf);
> >               #print to file
> >               if ($data2[0]){
> >                       print $annot_file @data2;
> >               }
> >
> >               #progress bar iteration
> >               $i++;
> >               $next_update = $progress->update() if ($i > $next_update);
> >       }
> >
> >       #update progress bar
> >       $progress->update($number_of_entries) if $number_of_entries >=
> $next_update;
> > }
> >
> >
> > # sub to retrieve allele frequency info for input variation
> > sub get_freqs_for_existing{
> >       my $vf = shift;
> >
> >       my %data;
> >       my @return_data;
> >       foreach my $allele (@{$vf->get_all_Alleles()}) {
> >
> >               next unless defined $allele->{population} || defined
> $allele->{'_population_id'};
> >               my $pop_name = $allele->population->name;
> >
> >               #populate hash if allele frequency info is available
> >               if ($allele->allele && $allele->frequency){
> >                       #merge hapmap data and 1kg frequeny data
> >                       if ($pop_name =~ /^(CSHL-HAPMAP)|(1000)/i){
> >                               if ($pop_name =~ /^CSHL-HAPMAP/i){
> >
> push(@{$data{hap}{$allele->allele}}, $allele->frequency);
> >                               }
> >                               if($pop_name =~ /^1000/i){
> >
> push(@{$data{"1kg"}{$allele->allele}}, $allele->frequency);
> >                               }
> >                               push(@{$data{any}{$allele->allele}},
> $allele->frequency);
> >                       }
> >                       #gather frequency data for any population
> >                       push(@{$data{$pop_name}{$allele->allele}},
> $allele->frequency);
> >               }
> >       }
> >
> >       # get statistics
> >       foreach my $pop (keys %data){
> >               foreach my $allele (keys %{$data{$pop}}){
> >                       my $stat = Statistics::Descriptive::Sparse->new();
> >                       $stat->add_data(@{$data{$pop}{$allele}});
> >                       my $mean =  sprintf "%.3f", $stat->mean();
> >                       my $std  =  sprintf "%.3f",
> $stat->standard_deviation();
> >                       my $min  =  sprintf "%.3f", $stat->min();
> >                       push (@return_data,
> $vf->variation_name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\t".$pop."\t".$allele."\t".$mean."\t".$min."\t".$std."\n");
> >               }
> >       }
> >
> >       return ((@return_data) ? @return_data : undef);
> >
> > }
> >
> > # sub to get all available annotation for input variation
> > sub get_phenotype_info{
> >       my $vf = shift;
> >       my @alleles = split /\//, $vf->allele_string;
> >       my $var = $vf->variation();
> >       my %vas;
> >       my @return_data;
> >
> >       #get all variation annotations available
> >       for my $va (@{ $var->get_all_VariationAnnotations() }) {
> >               my $risk_allele = "-";
> >               foreach my $allele (@alleles){
> >                       if ($va->associated_variant_risk_allele){
> >                               my $risk_allele_annot = (split /\-/,
> $va->associated_variant_risk_allele)[1];
> >                               if ($risk_allele_annot &&
> $risk_allele_annot eq $allele){
> >                                       $risk_allele = $risk_allele_annot;
> >                                       last;
> >                               }
> >                       }
> >               }
> >               my $associated_genes = $va->associated_gene || "-";
> >               my $pheno_desc       = $va->phenotype_description || "-";
> >               my $pheno_source     = $va->source_name || "-";
> >               my $pheno_study      = $va->study_description || "-";
> >               my $pheno_study_reference = $va->external_reference || "-";
> >               my $string = join "\t", $associated_genes, $pheno_desc,
> $pheno_source, $pheno_study, $pheno_study_reference;
> >               push (@{$vas{$risk_allele}}, $string);
> >       }
> >
> >       #retrieve all annotations info gathered
> >       if (%vas){
> >               foreach my $allele (keys %vas){
> >                       foreach my $annotation (@{$vas{$allele}}){
> >                               push (@return_data,
> $var->name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\tannot\t".$allele."\t".$annotation."\n");
> >                       }
> >               }
> >       }
> >
> >       return ((@return_data) ? @return_data : undef);
> > }
> > _______________________________________________
> > Dev mailing list    Dev at ensembl.org
> > List admin (including subscribe/unsubscribe):
> http://lists.ensembl.org/mailman/listinfo/dev
> > Ensembl Blog: http://www.ensembl.info/
>
> ---
> Andrew Yates                   Ensembl Core Software Project Leader
> EMBL-EBI                       Tel: +44-(0)1223-492538
> Wellcome Trust Genome Campus   Fax: +44-(0)1223-494468
> Cambridge CB10 1SD, UK         http://www.ensembl.org/
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.ensembl.org/pipermail/dev_ensembl.org/attachments/20111101/10aec082/attachment.html>