[ensembl-dev] Difficulty finding genomic slices using Ensembl Perl API.

Allan Kamau kamauallan at gmail.com
Mon Mar 14 07:38:16 GMT 2011


Dear All,

I have been trying with little success to obtain Slices based on the
following information.
1)species_name
2)genome_assembly_name
3)chromosome_name
4)start_pos
5)end_pos

This are the actual variables and values I have.

$species_name2='Aedes aegypti';
$genome_assembly_name="AaegL1";
$chromosome_name="CH477186.1"; #the value here can be that of a
contig, chromosome or supercontig/scaffold or other sequence
identifier.
$start_pos=5248294;
$end_pos=5248380;


After trying to obtain slices for several such cases for many days I
would like to ask for help with a few lines of working code that would
obtain these slices sustainably. I can manage to get the SliceAdaptor
but a call on fetch_by_region() yields undef. Please help.


Below is my current code.

$registry->load_registry_from_db(
	-host=>'mysql.ebi.ac.uk',
	-user=>'anonymous',
	-port=>4157
);

my $species_name2='Aedes aegypti';
my $chromosome_name="CH477190.1";
my $start_pos=2769446;
my $end_pos=2769635;
my $genome_assembly_name="AaegL1";

$species_name2='Anopheles gambiae';
$genome_assembly_name="AGAMP3";
$chromosome_name="3L";
#34112621;34112703;34110621;34114703
$start_pos=34112621;
$end_pos=34112703;

$species_name2='Aedes aegypti';
$genome_assembly_name="AaegL1";
#$chromosome_name="CH477186.1";
$chromosome_name="477186";
$start_pos=5248294;
$end_pos=5248380;

my $filename="test.slice.txt";

my $slice_found=undef;
if(1==1)
{
	if($registry)
	{
		$species_name2=~s/\s/_/gi;
		$species_name2=lc($species_name2);
		my $species_name_available=$registry->get_alias($species_name2);
		
		
		print ("species_name2 is:".$species_name2.", species_name_available
is:$species_name_available\n");
		#my $db_adaptor2=$registry->get_DBAdaptor(-species=>'human');
		#print Dumper $db_adaptor2;
		my @db_adaptors=@{$registry->get_all_DBAdaptors(-species=>$species_name_available)};
		#my @db_adaptors=@{$registry->get_all_DBAdaptors(-species=>'human')};
			
		foreach my $db_adaptor(@db_adaptors)
		{
			my $db_connection=$db_adaptor->dbc();
			
			printf(
				"species/group\t%s/%s\ndatabase\t%s\nhost:port\t%s:%s\n\n",
				$db_adaptor->species(),$db_adaptor->group(),
				$db_connection->dbname(),$db_connection->host(),
				$db_connection->port()
			);
		}
		if(@db_adaptors)
		{
		}
		else
		{
			my $db_adaptor2=$registry->get_DBAdaptor($species_name_available,'core');
			if($db_adaptor2)
			{
				@db_adaptors=($db_adaptor2);
			}
			else
			{
				$db_adaptor2=$registry->get_DBAdaptor($species_name_available);
				if($db_adaptor2)
				{
					@db_adaptors=($db_adaptor2);
				}
			}
		}
		my $found_db_adaptor="false";
		if(1==1 and @db_adaptors)
		{
			foreach my $db_adaptor (@db_adaptors)
			{
				$found_db_adaptor="true";
				my $db_connection=$db_adaptor->dbc();
	
	
				my $coordinate_system_name='chromosome';
				if($chromosome_name=~m/.*?scaffold.*/i)
				{
					$coordinate_system_name='scaffold';
				}
				if($chromosome_name=~m/.*?contig.*/i)
				{
					$coordinate_system_name='contig';
				}
				my ($highest_cs)=@{$db_adaptor->get_CoordSystemAdaptor->fetch_all_by_name($coordinate_system_name)};
				#my ($highest_cs)=@{$db_adaptor->get_CoordSystemAdaptor->fetch_all()};
				my $csa=$db_adaptor->get_CoordSystemAdaptor();
				my %versions;
				foreach my $cs (@{$csa->fetch_all()})
				{
					$versions{$cs->version()}=1;
					my $str=join':',$cs->name(),$cs->version(),$cs->dbID();
					
					print("\$str is:$str\n");
					
					#my $assembly_label=$highest_cs->version();
					my $assembly_label=$cs->version();
					my $highest_available_coordinate_system_name=$cs->name();
					my $genome_assembly_name_available=$assembly_label;
					print ">>>species_name_available is:'$species_name_available',
\$chromosome_name is:'$chromosome_name', \$coordinate_system_name
is:'$coordinate_system_name',
\$highest_available_coordinate_system_name
is:'$highest_available_coordinate_system_name', assembly_label
is:'$assembly_label', genome_assembly_name_available
is:'$genome_assembly_name_available'"."\n";
					
					$coordinate_system_name=$cs->name();
	
					#print "coordinate_system_name is:$coordinate_system_name,
highest_available_coordinate_system_name
is:$highest_available_coordinate_system_name"."\n";
					#my $slice_adaptor=$db_adaptor->get_adaptor('Slice');
					my $slice_adaptor=$db_adaptor->get_SliceAdaptor();
					#print Dumper(\\$slice_adaptor);  # note the \ backslash;
Dumper() takes references as arguments
	
					my $slice=$slice_adaptor->fetch_by_region($coordinate_system_name,$chromosome_name,$start_pos,$end_pos,1,$genome_assembly_name_available);
					#my $slice=$slice_adaptor->fetch_by_region(undef,$chromosome_name,$start_pos,$end_pos);
					if($slice)
					{
						$slice_found=$slice->seq();
						print("\$slice is:$slice");
						my $sequence=$slice->seq();
						print("\$sequence is:$sequence");
					}
					my $slices=$slice_adaptor->fetch_all('toplevel',$coordinate_system_name);
					if($slices)
					{
						# store a list of slice names in a file
						open(FILE,">$filename") or die("Could not open file $filename");
						foreach my $slice(@$slices)
						{
							print FILE $slice->name(),"\n";
						}
						close FILE;
					}
				}
			}
		}
	}
}


And the output.

species_name2 is:aedes_aegypti, species_name_available is:aedes_aegypti
species/group	aedes_aegypti/core
database	aedes_aegypti_core_8_61_1e
host:port	mysql.ebi.ac.uk:4157

species/group	aedes_aegypti/otherfeatures
database	aedes_aegypti_otherfeatures_8_61_1e
host:port	mysql.ebi.ac.uk:4157

species/group	aedes_aegypti/funcgen
database	aedes_aegypti_funcgen_8_61_1e
host:port	mysql.ebi.ac.uk:4157

$str is:supercontig:AaegL1:1
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
$str is:contig:AaegL1:2
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
$str is:supercontig:AaegL1:1
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
$str is:contig:AaegL1:2
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
$str is:supercontig:AaegL1:1
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'chromosome', $highest_available_coordinate_system_name is:'supercontig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'
$str is:contig:AaegL1:2
>>>species_name_available is:'aedes_aegypti', $chromosome_name is:'477186', $coordinate_system_name is:'supercontig', $highest_available_coordinate_system_name is:'contig', assembly_label is:'AaegL1', genome_assembly_name_available is:'AaegL1'


Allan.




More information about the Dev mailing list