[ensembl-dev] Difficulty finding genomic slices using Ensembl Perl API.

Andy Yates ayates at ebi.ac.uk
Mon Mar 14 10:35:34 GMT 2011


Hi Allan,

I think the problem here is to do with your input rather than the code. By the looks of things you are attempting to find the first supercontig of Aedes by the INSDC accession & version CH477186.1 which none of the sequence regions in Aedes database are named after. Ensembl Genomes does attempt to name the contig level sequence regions after its INSDC accession but currently we have no mechanism for automatically performing this mapping (ENA is working on a solution for us so this will not be the case for long). It is a convention you cannot rely on.

For the moment you should query the database using the names we currently have assigned. I've written up an example script which is hosted as a Gist @ https://gist.github.com/868984 which displays the available toplevel sequence regions &  shows you how to extract the information you require from the region in question.

Best regards,

Andy

On 14 Mar 2011, at 07:38, Allan Kamau wrote:

> Dear All,
> 
> I have been trying with little success to obtain Slices based on the
> following information.
> 1)species_name
> 2)genome_assembly_name
> 3)chromosome_name
> 4)start_pos
> 5)end_pos
> 
> This are the actual variables and values I have.
> 
> $species_name2='Aedes aegypti';
> $genome_assembly_name="AaegL1";
> $chromosome_name="CH477186.1"; #the value here can be that of a
> contig, chromosome or supercontig/scaffold or other sequence
> identifier.
> $start_pos=5248294;
> $end_pos=5248380;
> 
> 
> After trying to obtain slices for several such cases for many days I
> would like to ask for help with a few lines of working code that would
> obtain these slices sustainably. I can manage to get the SliceAdaptor
> but a call on fetch_by_region() yields undef. Please help.
> 
> 
> Below is my current code.
> 
> $registry->load_registry_from_db(
> 	-host=>'mysql.ebi.ac.uk',
> 	-user=>'anonymous',
> 	-port=>4157
> );
> 
> my $species_name2='Aedes aegypti';
> my $chromosome_name="CH477190.1";
> my $start_pos=2769446;
> my $end_pos=2769635;
> my $genome_assembly_name="AaegL1";
> 
> $species_name2='Anopheles gambiae';
> $genome_assembly_name="AGAMP3";
> $chromosome_name="3L";
> #34112621;34112703;34110621;34114703
> $start_pos=34112621;
> $end_pos=34112703;
> 
> $species_name2='Aedes aegypti';
> $genome_assembly_name="AaegL1";
> #$chromosome_name="CH477186.1";
> $chromosome_name="477186";
> $start_pos=5248294;
> $end_pos=5248380;
> 
> my $filename="test.slice.txt";
> 
> my $slice_found=undef;
> if(1==1)
> {
> 	if($registry)
> 	{
> 		$species_name2=~s/\s/_/gi;
> 		$species_name2=lc($species_name2);
> 		my $species_name_available=$registry->get_alias($species_name2);
> 		
> 		
> 		print ("species_name2 is:".$species_name2.", species_name_available
> is:$species_name_available\n");
> 		#my $db_adaptor2=$registry->get_DBAdaptor(-species=>'human');
> 		#print Dumper $db_adaptor2;
> 		my @db_adaptors=@{$registry->get_all_DBAdaptors(-species=>$species_name_available)};
> 		#my @db_adaptors=@{$registry->get_all_DBAdaptors(-species=>'human')};
> 			
> 		foreach my $db_adaptor(@db_adaptors)
> 		{
> 			my $db_connection=$db_adaptor->dbc();
> 			
> 			printf(
> 				"species/group\t%s/%s\ndatabase\t%s\nhost:port\t%s:%s\n\n",
> 				$db_adaptor->species(),$db_adaptor->group(),
> 				$db_connection->dbname(),$db_connection->host(),
> 				$db_connection->port()
> 			);
> 		}
> 		if(@db_adaptors)
> 		{
> 		}
> 		else
> 		{
> 			my $db_adaptor2=$registry->get_DBAdaptor($species_name_available,'core');
> 			if($db_adaptor2)
> 			{
> 				@db_adaptors=($db_adaptor2);
> 			}
> 			else
> 			{
> 				$db_adaptor2=$registry->get_DBAdaptor($species_name_available);
> 				if($db_adaptor2)
> 				{
> 					@db_adaptors=($db_adaptor2);
> 				}
> 			}
> 		}
> 		my $found_db_adaptor="false";
> 		if(1==1 and @db_adaptors)
> 		{
> 			foreach my $db_adaptor (@db_adaptors)
> 			{
> 				$found_db_adaptor="true";
> 				my $db_connection=$db_adaptor->dbc();
> 	
> 	
> 				my $coordinate_system_name='chromosome';
> 				if($chromosome_name=~m/.*?scaffold.*/i)
> 				{
> 					$coordinate_system_name='scaffold';
> 				}
> 				if($chromosome_name=~m/.*?contig.*/i)
> 				{
> 					$coordinate_system_name='contig';
> 				}
> 				my ($highest_cs)=@{$db_adaptor->get_CoordSystemAdaptor->fetch_all_by_name($coordinate_system_name)};
> 				#my ($highest_cs)=@{$db_adaptor->get_CoordSystemAdaptor->fetch_all()};
> 				my $csa=$db_adaptor->get_CoordSystemAdaptor();
> 				my %versions;
> 				foreach my $cs (@{$csa->fetch_all()})
> 				{
> 					$versions{$cs->version()}=1;
> 					my $str=join':',$cs->name(),$cs->version(),$cs->dbID();
> 					
> 					print("\$str is:$str\n");
> 					
> 					#my $assembly_label=$highest_cs->version();
> 					my $assembly_label=$cs->version();
> 					my $highest_available_coordinate_system_name=$cs->name();
> 					my $genome_assembly_name_available=$assembly_label;
> 					print ">>>species_name_available is:'$species_name_available',
> \$chromosome_name is:'$chromosome_name', \$coordinate_system_name
> is:'$coordinate_system_name',
> \$highest_available_coordinate_system_name
> is:'$highest_available_coordinate_system_name', assembly_label
> is:'$assembly_label', genome_assembly_name_available
> is:'$genome_assembly_name_available'"."\n";
> 					
> 					$coordinate_system_name=$cs->name();
> 	
> 					#print "coordinate_system_name is:$coordinate_system_name,
> highest_available_coordinate_system_name
> is:$highest_available_coordinate_system_name"."\n";
> 					#my $slice_adaptor=$db_adaptor->get_adaptor('Slice');
> 					my $slice_adaptor=$db_adaptor->get_SliceAdaptor();
> 					#print Dumper(\\$slice_adaptor);  # note the \ backslash;
> Dumper() takes references as arguments
> 	
> 					my $slice=$slice_adaptor->fetch_by_region($coordinate_system_name,$chromosome_name,$start_pos,$end_pos,1,$genome_assembly_name_available);
> 					#my $slice=$slice_adaptor->fetch_by_region(undef,$chromosome_name,$start_pos,$end_pos);
> 					if($slice)
> 					{
> 						$slice_found=$slice->seq();
> 						print("\$slice is:$slice");
> 						my $sequence=$slice->seq();
> 						print("\$sequence is:$sequence");
> 					}
> 					my $slices=$slice_adaptor->fetch_all('toplevel',$coordinate_system_name);
> 					if($slices)
> 					{
> 						# store a list of slice names in a file
> 						open(FILE,">$filename") or die("Could not open file $filename");
> 						foreach my $slice(@$slices)
> 						{
> 							print FILE $slice->name(),"\n";
> 						}
> 						close FILE;
> 					}
> 				}
> 			}
> 		}
> 	}
> }
> 
> 
> And the output.
> 
> species_name2 is:aedes_aegypti, species_name_available is:aedes_aegypti
> species/group	aedes_aegypti/core
> database	aedes_aegypti_core_8_61_1e
> host:port	mysql.ebi.ac.uk:4157
> 
> species/group	aedes_aegypti/otherfeatures
> database	aedes_aegypti_otherfeatures_8_61_1e
> host:port	mysql.ebi.ac.uk:4157
> 
> species/group	aedes_aegypti/funcgen
> database	aedes_aegypti_funcgen_8_61_1e
> host:port	mysql.ebi.ac.uk:4157
> 
> $str is:supercontig:AaegL1:1
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> $str is:contig:AaegL1:2
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> $str is:supercontig:AaegL1:1
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> $str is:contig:AaegL1:2
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> $str is:supercontig:AaegL1:1
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> $str is:contig:AaegL1:2
>>>> species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
> 
> 
> Allan.
> 
> _______________________________________________
> Dev mailing list
> Dev at ensembl.org
> http://lists.ensembl.org/mailman/listinfo/dev

-- 
Andrew Yates                   Ensembl Genomes Engineer
EMBL-EBI                       Tel: +44-(0)1223-492538
Wellcome Trust Genome Campus   Fax: +44-(0)1223-494468
Cambridge CB10 1SD, UK         http://www.ensemblgenomes.org/








More information about the Dev mailing list