[ensembl-dev] Memory footprint of script

Tue Nov 1 11:39:06 GMT 2011

Hi Duarte,

This is a known issue in the Ensembl API where some adaptors did not create the correct caching data structure & held onto memory for the runtime of the program & has been fixed in the HEAD checkout. Until e65 is released I would suggest using the following flag in your call to the Registry:

$reg->load_registry_from_db(
	-host       => $host,
	-user       => $user,
	-pass       => $password,
	-port       => $port,
	-species    => 'Homo sapiens',
	-verbose    => 1,
        -no_cache   => 1,
);

It is also possible to use the Ensembl Iterators which chunk a region query into smaller chunks therefore reducing the overall memory consumption but at the cost of some runtime e.g.

#get slice for chr
my $slice;
eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
# if failed, try to get any seq region
if(!defined($slice)) {
  $slice = $sa->fetch_by_region(undef, $chr);
}

# get all vars
my $iter = $vfa->fetch_Iterator_by_Slice($slice, undef, 200_000);
while(my $var = $iter->next()) {
  #Do things with your variation
}

The 3rd parameter in the method call is the size of chunking slice you require. The sizing of this will depend on how much memory you want to consume against the total runtime of the program.

The other thing to be aware of is that under Linux systems (and a lot of *nix systems) once a process has consumed a block of memory it will never release that memory back to the system but Perl will still reuse the memory so long as the data is not held onto by a strong reference. 

Hope this helps,

Andy

On 1 Nov 2011, at 11:05, Duarte Molha wrote:

> Dear Ensembl Devs
> 
> I am struggling with an issue of the memory footprint of a script.
> I have installed a local version of Ensembl V64 and have created a script that basically retrieves all known variation on any given chromosome and outputs to files some relevant information regarding allele frequencies and any annotations connected with each variation.
> 
> Because I am looping though all variants and printing the data gathered to files I did not expect the memory footprint of the script to be as large as it is and it seems that for big chromosomes the script keeps increasing the memory allocated to him until it will eventually crash.
> 
> Can any of you find where I have gone wrong on my code?
> I appreciate any insights into what I might be doing wrong.
> 
> Best regards
> 
>       Duarte Molha
> 
> Script:
> 
> #!/usr/bin/perl -w
> 
> 
> use lib '/array/dept/compbio/ensembl/current_api/ensembl-variation/modules';		# points to latest ensemblt core API
> 
> use Bio::EnsEMBL::Registry;
> use Statistics::Descriptive;
> use Term::ProgressBar;
> use IO::File;
> use strict;
> 
> my $host     = 'myHost';
> my $user     =  'user';
> my $password = 'pass';
> my $port     = 3306;
> 
> # get registry
> my $reg = 'Bio::EnsEMBL::Registry';
>     
> 
> $reg->load_registry_from_db(
> 	-host       => $host,
> 	-user       => $user,
> 	-pass       => $password,
> 	-port       => $port,
> 	-species    => 'Homo sapiens',
> 	-verbose    => 1,
> );
> 
> my $vfa   = $reg->get_adaptor('Homo sapiens', 'variation', 'variationfeature');
> my $va    = $reg->get_adaptor('Homo sapiens', 'variation', 'variation');
> my $sa    = $reg->get_adaptor('Homo sapiens', 'core', 'slice');
> my $vaa   = $reg->get_adaptor('Homo sapiens', 'variation', 'variationannotation');
> 
> my $freqs_file;
> my $annot_file;
> 
> my $chr = shift;
> 
> 
> my $core_mca = $reg->get_adaptor('Homo sapiens', 'core', 'metacontainer');
> unless ($core_mca){
> 	print STDERR "Failed to connect to local database\n";
> 	die;
> }
> 
> 
> $freqs_file = new IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_freqs.txt");
> $annot_file = new IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_annots.txt");
> 
> run_data($chr);
> 
> $freqs_file->close();
> $annot_file->close();
> 
> #main sub to run all the vars found on each inputr chr
> sub run_data{
> 	my $chr = shift;
> 	
> 	#get slice for chr
> 	my $slice = "";
> 	eval { $slice = $sa->fetch_by_region('chromosome', $chr); };
> 	# if failed, try to get any seq region
> 	if(!defined($slice)) {
> 		$slice = $sa->fetch_by_region(undef, $chr);
> 	}
> 	
> 	# get all vars
> 	my @vfas = @{$vfa->fetch_all_by_Slice($slice)};
> 	
> 	# progress bar initialization
> 	my $number_of_entries = scalar (@vfas);
> 	my $progress = Term::ProgressBar->new ({count => $number_of_entries ,
> 						name => "Gathering Data for chr $chr :", 
> 						ETA => 'linear', 
> 					});
> 	$progress->max_update_rate(1);
> 	my $next_update = 0;
> 	my $i = 0;
> 	
> 	# loop though all vars found
> 	foreach my $vf (@vfas) {
> 		#get freq data for var
> 		my @data1 = get_freqs_for_existing($vf);
> 		#print to file
> 		if ($data1[0]){
> 			print $freqs_file @data1;
> 		}
> 		
> 		#get annotation data for far
> 		my @data2 = get_phenotype_info($vf);
> 		#print to file
> 		if ($data2[0]){
> 			print $annot_file @data2;
> 		}
> 		
> 		#progress bar iteration
> 		$i++;
> 		$next_update = $progress->update() if ($i > $next_update);
> 	}
> 	
> 	#update progress bar
> 	$progress->update($number_of_entries) if $number_of_entries >= $next_update;
> }
> 
> 
> # sub to retrieve allele frequency info for input variation
> sub get_freqs_for_existing{
> 	my $vf = shift;
> 
> 	my %data;
> 	my @return_data;
> 	foreach my $allele (@{$vf->get_all_Alleles()}) {
> 		
> 		next unless defined $allele->{population} || defined $allele->{'_population_id'};
> 		my $pop_name = $allele->population->name;
> 		
> 		#populate hash if allele frequency info is available
> 		if ($allele->allele && $allele->frequency){
> 			#merge hapmap data and 1kg frequeny data
> 			if ($pop_name =~ /^(CSHL-HAPMAP)|(1000)/i){
> 				if ($pop_name =~ /^CSHL-HAPMAP/i){
> 					push(@{$data{hap}{$allele->allele}}, $allele->frequency);
> 				}
> 				if($pop_name =~ /^1000/i){
> 					push(@{$data{"1kg"}{$allele->allele}}, $allele->frequency);
> 				}
> 				push(@{$data{any}{$allele->allele}}, $allele->frequency);
> 			}
> 			#gather frequency data for any population
> 			push(@{$data{$pop_name}{$allele->allele}}, $allele->frequency);
> 		}
> 	}
> 	
> 	# get statistics
> 	foreach my $pop (keys %data){
> 		foreach my $allele (keys %{$data{$pop}}){
> 			my $stat = Statistics::Descriptive::Sparse->new();
> 			$stat->add_data(@{$data{$pop}{$allele}}); 
> 			my $mean =  sprintf "%.3f", $stat->mean();
> 			my $std  =  sprintf "%.3f", $stat->standard_deviation();
> 			my $min  =  sprintf "%.3f", $stat->min();
> 			push (@return_data, $vf->variation_name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\t".$pop."\t".$allele."\t".$mean."\t".$min."\t".$std."\n");
> 		}
> 	}
> 	
> 	return ((@return_data) ? @return_data : undef);
> 
> }
> 
> # sub to get all available annotation for input variation
> sub get_phenotype_info{
> 	my $vf = shift;
> 	my @alleles = split /\//, $vf->allele_string;
> 	my $var = $vf->variation();
> 	my %vas;
> 	my @return_data;
> 
> 	#get all variation annotations available
> 	for my $va (@{ $var->get_all_VariationAnnotations() }) {
> 		my $risk_allele = "-";
> 		foreach my $allele (@alleles){
> 			if ($va->associated_variant_risk_allele){
> 				my $risk_allele_annot = (split /\-/, $va->associated_variant_risk_allele)[1];
> 				if ($risk_allele_annot && $risk_allele_annot eq $allele){
> 					$risk_allele = $risk_allele_annot;
> 					last;
> 				}
> 			}
> 		}
> 		my $associated_genes = $va->associated_gene || "-";
> 		my $pheno_desc       = $va->phenotype_description || "-";
> 		my $pheno_source     = $va->source_name || "-";
> 		my $pheno_study      = $va->study_description || "-";
> 		my $pheno_study_reference = $va->external_reference || "-";
> 		my $string = join "\t", $associated_genes, $pheno_desc, $pheno_source, $pheno_study, $pheno_study_reference;
> 		push (@{$vas{$risk_allele}}, $string);
> 	}
> 
> 	#retrieve all annotations info gathered 
> 	if (%vas){
> 		foreach my $allele (keys %vas){
> 			foreach my $annotation (@{$vas{$allele}}){
> 				push (@return_data, $var->name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\tannot\t".$allele."\t".$annotation."\n");
> 			}
> 		}
> 	}
> 	
> 	return ((@return_data) ? @return_data : undef);
> }
> _______________________________________________
> Dev mailing list    Dev at ensembl.org
> List admin (including subscribe/unsubscribe): http://lists.ensembl.org/mailman/listinfo/dev
> Ensembl Blog: http://www.ensembl.info/

---
Andrew Yates                   Ensembl Core Software Project Leader
EMBL-EBI                       Tel: +44-(0)1223-492538
Wellcome Trust Genome Campus   Fax: +44-(0)1223-494468
Cambridge CB10 1SD, UK         http://www.ensembl.org/