[ensembl-dev] Memory footprint of script
Duarte Molha
duartemolha at gmail.com
Tue Nov 1 12:58:44 GMT 2011
Thank you Andy
I will try your workaround...
can you clarify if I should use both approaches together (the -no_cache
flag and the iteration) or I can I select only one of the methods?
Best regards
Duarte
On 1 November 2011 11:39, Andy Yates <ayates at ebi.ac.uk> wrote:
> Hi Duarte,
>
> This is a known issue in the Ensembl API where some adaptors did not
> create the correct caching data structure & held onto memory for the
> runtime of the program & has been fixed in the HEAD checkout. Until e65 is
> released I would suggest using the following flag in your call to the
> Registry:
>
> $reg->load_registry_from_db(
> -host => $host,
> -user => $user,
> -pass => $password,
> -port => $port,
> -species => 'Homo sapiens',
> -verbose => 1,
> -no_cache => 1,
> );
>
> It is also possible to use the Ensembl Iterators which chunk a region
> query into smaller chunks therefore reducing the overall memory consumption
> but at the cost of some runtime e.g.
>
> #get slice for chr
> my $slice;
> eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
> # if failed, try to get any seq region
> if(!defined($slice)) {
> $slice = $sa->fetch_by_region(undef, $chr);
> }
>
> # get all vars
> my $iter = $vfa->fetch_Iterator_by_Slice($slice, undef, 200_000);
> while(my $var = $iter->next()) {
> #Do things with your variation
> }
>
> The 3rd parameter in the method call is the size of chunking slice you
> require. The sizing of this will depend on how much memory you want to
> consume against the total runtime of the program.
>
> The other thing to be aware of is that under Linux systems (and a lot of
> *nix systems) once a process has consumed a block of memory it will never
> release that memory back to the system but Perl will still reuse the memory
> so long as the data is not held onto by a strong reference.
>
> Hope this helps,
>
> Andy
>
> On 1 Nov 2011, at 11:05, Duarte Molha wrote:
>
> > Dear Ensembl Devs
> >
> > I am struggling with an issue of the memory footprint of a script.
> > I have installed a local version of Ensembl V64 and have created a
> script that basically retrieves all known variation on any given chromosome
> and outputs to files some relevant information regarding allele frequencies
> and any annotations connected with each variation.
> >
> > Because I am looping though all variants and printing the data gathered
> to files I did not expect the memory footprint of the script to be as large
> as it is and it seems that for big chromosomes the script keeps increasing
> the memory allocated to him until it will eventually crash.
> >
> > Can any of you find where I have gone wrong on my code?
> > I appreciate any insights into what I might be doing wrong.
> >
> > Best regards
> >
> > Duarte Molha
> >
> > Script:
> >
> > #!/usr/bin/perl -w
> >
> >
> > use lib
> '/array/dept/compbio/ensembl/current_api/ensembl-variation/modules';
> # points to latest ensemblt core API
> >
> > use Bio::EnsEMBL::Registry;
> > use Statistics::Descriptive;
> > use Term::ProgressBar;
> > use IO::File;
> > use strict;
> >
> > my $host = 'myHost';
> > my $user = 'user';
> > my $password = 'pass';
> > my $port = 3306;
> >
> > # get registry
> > my $reg = 'Bio::EnsEMBL::Registry';
> >
> >
> > $reg->load_registry_from_db(
> > -host => $host,
> > -user => $user,
> > -pass => $password,
> > -port => $port,
> > -species => 'Homo sapiens',
> > -verbose => 1,
> > );
> >
> > my $vfa = $reg->get_adaptor('Homo sapiens', 'variation',
> 'variationfeature');
> > my $va = $reg->get_adaptor('Homo sapiens', 'variation', 'variation');
> > my $sa = $reg->get_adaptor('Homo sapiens', 'core', 'slice');
> > my $vaa = $reg->get_adaptor('Homo sapiens', 'variation',
> 'variationannotation');
> >
> > my $freqs_file;
> > my $annot_file;
> >
> > my $chr = shift;
> >
> >
> > my $core_mca = $reg->get_adaptor('Homo sapiens', 'core',
> 'metacontainer');
> > unless ($core_mca){
> > print STDERR "Failed to connect to local database\n";
> > die;
> > }
> >
> >
> > $freqs_file = new
> IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_freqs.txt");
> > $annot_file = new
> IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_annots.txt");
> >
> > run_data($chr);
> >
> > $freqs_file->close();
> > $annot_file->close();
> >
> > #main sub to run all the vars found on each inputr chr
> > sub run_data{
> > my $chr = shift;
> >
> > #get slice for chr
> > my $slice = "";
> > eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
> > # if failed, try to get any seq region
> > if(!defined($slice)) {
> > $slice = $sa->fetch_by_region(undef, $chr);
> > }
> >
> > # get all vars
> > my @vfas = @{$vfa->fetch_all_by_Slice($slice)};
> >
> > # progress bar initialization
> > my $number_of_entries = scalar (@vfas);
> > my $progress = Term::ProgressBar->new ({count =>
> $number_of_entries ,
> > name => "Gathering Data
> for chr $chr :",
> > ETA => 'linear',
> > });
> > $progress->max_update_rate(1);
> > my $next_update = 0;
> > my $i = 0;
> >
> > # loop though all vars found
> > foreach my $vf (@vfas) {
> > #get freq data for var
> > my @data1 = get_freqs_for_existing($vf);
> > #print to file
> > if ($data1[0]){
> > print $freqs_file @data1;
> > }
> >
> > #get annotation data for far
> > my @data2 = get_phenotype_info($vf);
> > #print to file
> > if ($data2[0]){
> > print $annot_file @data2;
> > }
> >
> > #progress bar iteration
> > $i++;
> > $next_update = $progress->update() if ($i > $next_update);
> > }
> >
> > #update progress bar
> > $progress->update($number_of_entries) if $number_of_entries >=
> $next_update;
> > }
> >
> >
> > # sub to retrieve allele frequency info for input variation
> > sub get_freqs_for_existing{
> > my $vf = shift;
> >
> > my %data;
> > my @return_data;
> > foreach my $allele (@{$vf->get_all_Alleles()}) {
> >
> > next unless defined $allele->{population} || defined
> $allele->{'_population_id'};
> > my $pop_name = $allele->population->name;
> >
> > #populate hash if allele frequency info is available
> > if ($allele->allele && $allele->frequency){
> > #merge hapmap data and 1kg frequeny data
> > if ($pop_name =~ /^(CSHL-HAPMAP)|(1000)/i){
> > if ($pop_name =~ /^CSHL-HAPMAP/i){
> >
> push(@{$data{hap}{$allele->allele}}, $allele->frequency);
> > }
> > if($pop_name =~ /^1000/i){
> >
> push(@{$data{"1kg"}{$allele->allele}}, $allele->frequency);
> > }
> > push(@{$data{any}{$allele->allele}},
> $allele->frequency);
> > }
> > #gather frequency data for any population
> > push(@{$data{$pop_name}{$allele->allele}},
> $allele->frequency);
> > }
> > }
> >
> > # get statistics
> > foreach my $pop (keys %data){
> > foreach my $allele (keys %{$data{$pop}}){
> > my $stat = Statistics::Descriptive::Sparse->new();
> > $stat->add_data(@{$data{$pop}{$allele}});
> > my $mean = sprintf "%.3f", $stat->mean();
> > my $std = sprintf "%.3f",
> $stat->standard_deviation();
> > my $min = sprintf "%.3f", $stat->min();
> > push (@return_data,
> $vf->variation_name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\t".$pop."\t".$allele."\t".$mean."\t".$min."\t".$std."\n");
> > }
> > }
> >
> > return ((@return_data) ? @return_data : undef);
> >
> > }
> >
> > # sub to get all available annotation for input variation
> > sub get_phenotype_info{
> > my $vf = shift;
> > my @alleles = split /\//, $vf->allele_string;
> > my $var = $vf->variation();
> > my %vas;
> > my @return_data;
> >
> > #get all variation annotations available
> > for my $va (@{ $var->get_all_VariationAnnotations() }) {
> > my $risk_allele = "-";
> > foreach my $allele (@alleles){
> > if ($va->associated_variant_risk_allele){
> > my $risk_allele_annot = (split /\-/,
> $va->associated_variant_risk_allele)[1];
> > if ($risk_allele_annot &&
> $risk_allele_annot eq $allele){
> > $risk_allele = $risk_allele_annot;
> > last;
> > }
> > }
> > }
> > my $associated_genes = $va->associated_gene || "-";
> > my $pheno_desc = $va->phenotype_description || "-";
> > my $pheno_source = $va->source_name || "-";
> > my $pheno_study = $va->study_description || "-";
> > my $pheno_study_reference = $va->external_reference || "-";
> > my $string = join "\t", $associated_genes, $pheno_desc,
> $pheno_source, $pheno_study, $pheno_study_reference;
> > push (@{$vas{$risk_allele}}, $string);
> > }
> >
> > #retrieve all annotations info gathered
> > if (%vas){
> > foreach my $allele (keys %vas){
> > foreach my $annotation (@{$vas{$allele}}){
> > push (@return_data,
> $var->name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\tannot\t".$allele."\t".$annotation."\n");
> > }
> > }
> > }
> >
> > return ((@return_data) ? @return_data : undef);
> > }
> > _______________________________________________
> > Dev mailing list Dev at ensembl.org
> > List admin (including subscribe/unsubscribe):
> http://lists.ensembl.org/mailman/listinfo/dev
> > Ensembl Blog: http://www.ensembl.info/
>
> ---
> Andrew Yates Ensembl Core Software Project Leader
> EMBL-EBI Tel: +44-(0)1223-492538
> Wellcome Trust Genome Campus Fax: +44-(0)1223-494468
> Cambridge CB10 1SD, UK http://www.ensembl.org/
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.ensembl.org/pipermail/dev_ensembl.org/attachments/20111101/10aec082/attachment.html>
More information about the Dev
mailing list