Skip to content
53 changes: 19 additions & 34 deletions phylogenomics/PlantCompUtils.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package PlantCompUtils;
require Exporter;

# Copyright [2019-2023] EMBL-European Bioinformatics Institute
# Copyright [2019-2025] EMBL-European Bioinformatics Institute

@ISA = qw(Exporter);
@EXPORT_OK = qw(
Expand All @@ -23,8 +23,7 @@ use Time::HiRes;
use HTTP::Tiny;
use DBI;

# Fungi Protists Metazoa have collections and one all-vs-all TSV file
# This code won't work there
# Only tested with Plants; Fungi, Protists and Metazoa use collections, so the code will need tweaking for them
our @DIVISIONS = qw( Plants );
our $FTPURL = 'ftp.ensemblgenomes.org';
our $COMPARADIR = '/pub/xxx/current/tsv/ensembl-compara/homologies';
Expand Down Expand Up @@ -180,11 +179,11 @@ sub get_gene_coords_GTF_file {
|| die "# ERROR(get_gene_coords_GTF_file): cannot open $GTF_filename\n";
while ( my $line = <GTF> ) {

#1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";...
#1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";...
#C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";...
if ( $line =~
m/^([^#])\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/
)
{
m/^([^#]+)\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/) {

( $chr, $start, $end, $strand, $geneid ) = ( $1, $2, $3, $4, $5 );
push( @chr_sorted_gene_ids,
[ $geneid, $chr, $start, $end, $strand ] );
Expand Down Expand Up @@ -258,7 +257,7 @@ sub download_GTF_file {

# download compressed TSV file from FTP site, renames it
# and saves it in $targetdir; uses FTP globals defined above
# NOTE: if species file is not found it tries the bulky all-vs-all file
# NOTE: only looks for the bulky all-vs-all file (several GB in size)
sub download_compara_TSV_file {

my ( $dir, $ref_genome, $targetdir ) = @_;
Expand All @@ -274,32 +273,18 @@ sub download_compara_TSV_file {
|| die "# ERROR(download_compara_TSV_file): cannot change working directory to $dir "
. $ftp->message();

# find out which file is to be downloaded
if ( $ftp->cwd($ref_genome) ) {
foreach my $file ( $ftp->ls() ) {
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
$compara_file = $file;
$stored_compara_file = "$targetdir/$compara_file";
$stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/;
last;
}
}
}
else { # try all-vs-all file instead (Fungi, Protists, Metazoa)

print "# WARNING(download_compara_TSV_file): cannot find ".
"$ref_genome in $dir, try all-vs-all\n";

foreach my $file ( $ftp->ls() ) {
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
$compara_file = $file;
$stored_compara_file = "$targetdir/$compara_file";
foreach my $div (@DIVISIONS) {
if ( $dir =~ m/($div)/i ) {
$div = $1;
$stored_compara_file =~ s/tsv.gz/$div.tsv.gz/;
last;
}
# find file to be downloaded
print "# WARNING(download_compara_TSV_file): try all-vs-all\n";

foreach my $file ( $ftp->ls() ) {
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
$compara_file = $file;
$stored_compara_file = "$targetdir/$compara_file";
foreach my $div (@DIVISIONS) {
if ( $dir =~ m/($div)/i ) {
$div = $1;
$stored_compara_file =~ s/tsv.gz/$div.tsv.gz/;
last;
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion phylogenomics/ens_sequences.pl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Uses canonical transcripts, as used in the gene tree analysis,
# which are usually the longest translations with no stop codons
#
# Copyright [2019-2021] EMBL-European Bioinformatics Institute
# Copyright [2019-2025] EMBL-European Bioinformatics Institute

# Ensembl Genomes
my $RESTURL = 'http://rest.ensembl.org';
Expand Down
19 changes: 13 additions & 6 deletions phylogenomics/ens_single-copy_core_genes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
);

# Retrieves single-copy orthologous genes/proteins shared by (plant) species in clade
# by querying pre-computed Compara data from Ensembl Genomes with a reference genome.
# by querying pre-computed Compara data from Ensembl with a reference genome.
# Multiple copies are optionally allowed for selected or all species.
#
# Copyright [2019-2023] EMBL-European Bioinformatics Institute
# Copyright [2019-2025] EMBL-European Bioinformatics Institute

# Ensembl Genomes
my $RESTURL = 'http://rest.ensembl.org';
Expand Down Expand Up @@ -273,15 +273,23 @@ sub help_message {
$wga_coverage, $high_confidence
) = split(/\t/);

if ( $species ne $ref_genome ) {
next if( !$supported{$species} || !$supported{$hom_species} );

# if the reference genome is in the hom_species column, swap the paired
# columns so that it is always handled as species
if( $hom_species eq $ref_genome ) {
($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id);
($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id);
($species, $hom_species) = ($hom_species, $species);
($identity, $hom_identity) = ($hom_identity, $identity);
}

if ( $species ne $ref_genome ) {
if ( keys(%present) == $n_of_species ) {
last;
} # in case all-vs-all file is used
else { next }
}

next if ( !$supported{$hom_species} || $hom_species eq $ref_genome );

if ( defined($high_confidence) ) {
next
if ( $LOWCONF == 0
Expand All @@ -298,7 +306,6 @@ sub help_message {
&& $homology_type eq 'ortholog_one2many' )
)
{

# add $ref_genome protein
if ( !$core{$gene_stable_id} ) {

Expand Down
19 changes: 13 additions & 6 deletions phylogenomics/ens_syntelogs.pl
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
);

# Retrieves orthologous, syntenic genes (syntelogs) shared by (plant) species in clade
# by querying pre-computed Compara data from Ensembl Genomes with a reference genome.
# by querying pre-computed Compara data from Ensembl with a reference genome.
#
# Copyright [2019-2023] EMBL-European Bioinformatics Institute
# Copyright [2019-2025] EMBL-European Bioinformatics Institute

# Ensembl Genomes
my $RESTURL = 'http://rest.ensembl.org';
Expand Down Expand Up @@ -255,15 +255,23 @@ sub help_message {
$wga_coverage, $high_confidence
) = split(/\t/);

next if( !$supported{$species} || !$supported{$hom_species} );

# if the reference genome is in the hom_species column, swap the paired
# columns so that it is always handled as species
if( $hom_species eq $ref_genome ) {
($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id);
($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id);
($species, $hom_species) = ($hom_species, $species);
($identity, $hom_identity) = ($hom_identity, $identity);
}

if ( $species ne $ref_genome ) {
if ( keys(%present) == $n_of_species ) {
last;
} # in case all-vs-all file is used
else { next }
}

next if ( !$supported{$hom_species} || $hom_species eq $ref_genome );

if ( defined($high_confidence) ) {
next
if ( $LOWCONF == 0
Expand All @@ -273,8 +281,7 @@ sub help_message {
next if ( $goc_ssynt eq 'NULL' || $goc_ssynt < $GOC );

if ( $homology_type eq 'ortholog_one2one'
|| $homology_type eq 'ortholog_one2many' )
{
|| $homology_type eq 'ortholog_one2many' ) {

# add $ref_genome protein
if ( !$synt{$gene_stable_id} ) {
Expand Down