Skip to content

Commit 59b392e

Browse files
2 parents 7feb0f2 + d8b2e48 commit 59b392e

File tree

4 files changed

+46
-47
lines changed

4 files changed

+46
-47
lines changed

phylogenomics/PlantCompUtils.pm

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package PlantCompUtils;
22
require Exporter;
33

4-
# Copyright [2019-2023] EMBL-European Bioinformatics Institute
4+
# Copyright [2019-2025] EMBL-European Bioinformatics Institute
55

66
@ISA = qw(Exporter);
77
@EXPORT_OK = qw(
@@ -23,8 +23,7 @@ use Time::HiRes;
2323
use HTTP::Tiny;
2424
use DBI;
2525

26-
# Fungi Protists Metazoa have collections and one all-vs-all TSV file
27-
# This code won't work there
26+
# Only tested in Plants; Fungi, Protists and Metazoa have collections, so the code will need tweaking there
2827
our @DIVISIONS = qw( Plants );
2928
our $FTPURL = 'ftp.ensemblgenomes.org';
3029
our $COMPARADIR = '/pub/xxx/current/tsv/ensembl-compara/homologies';
@@ -180,11 +179,11 @@ sub get_gene_coords_GTF_file {
180179
|| die "# ERROR(get_gene_coords_GTF_file): cannot open $GTF_filename\n";
181180
while ( my $line = <GTF> ) {
182181

183-
#1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";...
182+
#1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";...
183+
#C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";...
184184
if ( $line =~
185-
m/^([^#])\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/
186-
)
187-
{
185+
m/^([^#]+)\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/) {
186+
188187
( $chr, $start, $end, $strand, $geneid ) = ( $1, $2, $3, $4, $5 );
189188
push( @chr_sorted_gene_ids,
190189
[ $geneid, $chr, $start, $end, $strand ] );
@@ -258,7 +257,7 @@ sub download_GTF_file {
258257

259258
# downloads compressed TSV file from FTP site, renames it
260259
# and saves it in $targetdir; uses FTP globals defined above
261-
# NOTE: if species file is not found it tries the bulky all-vs-all file
260+
# NOTE: only tries the bulky all-vs-all file (several GB in size)
262261
sub download_compara_TSV_file {
263262

264263
my ( $dir, $ref_genome, $targetdir ) = @_;
@@ -274,32 +273,18 @@ sub download_compara_TSV_file {
274273
|| die "# ERROR(download_compara_TSV_file): cannot change working directory to $dir "
275274
. $ftp->message();
276275

277-
# find out which file is to be downloaded
278-
if ( $ftp->cwd($ref_genome) ) {
279-
foreach my $file ( $ftp->ls() ) {
280-
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
281-
$compara_file = $file;
282-
$stored_compara_file = "$targetdir/$compara_file";
283-
$stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/;
284-
last;
285-
}
286-
}
287-
}
288-
else { # try all-vs-all file instead (Fungi, Protists, Metazoa)
289-
290-
print "# WARNING(download_compara_TSV_file): cannot find ".
291-
"$ref_genome in $dir, try all-vs-all\n";
292-
293-
foreach my $file ( $ftp->ls() ) {
294-
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
295-
$compara_file = $file;
296-
$stored_compara_file = "$targetdir/$compara_file";
297-
foreach my $div (@DIVISIONS) {
298-
if ( $dir =~ m/($div)/i ) {
299-
$div = $1;
300-
$stored_compara_file =~ s/tsv.gz/$div.tsv.gz/;
301-
last;
302-
}
276+
# find file to be downloaded
277+
print "# WARNING(download_compara_TSV_file): try all-vs-all\n";
278+
279+
foreach my $file ( $ftp->ls() ) {
280+
if ( $file =~ m/protein_default.homologies.tsv.gz/ ) {
281+
$compara_file = $file;
282+
$stored_compara_file = "$targetdir/$compara_file";
283+
foreach my $div (@DIVISIONS) {
284+
if ( $dir =~ m/($div)/i ) {
285+
$div = $1;
286+
$stored_compara_file =~ s/tsv.gz/$div.tsv.gz/;
287+
last;
303288
}
304289
}
305290
}

phylogenomics/ens_sequences.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# Uses canonical transcripts, used in the gene tree analysis,
1919
# which usually are the longest translation with no stop codons
2020
#
21-
# Copyright [2019-2021] EMBL-European Bioinformatics Institute
21+
# Copyright [2019-2025] EMBL-European Bioinformatics Institute
2222

2323
# Ensembl Genomes
2424
my $RESTURL = 'http://rest.ensembl.org';

phylogenomics/ens_single-copy_core_genes.pl

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
);
1616

1717
# Retrieves single-copy orthologous genes/proteins shared by (plant) species in clade
18-
# by querying pre-computed Compara data from Ensembl Genomes with a reference genome.
18+
# by querying pre-computed Compara data from Ensembl with a reference genome.
1919
# Multiple copies are optionally allowed for selected or all species.
2020
#
21-
# Copyright [2019-2023] EMBL-European Bioinformatics Institute
21+
# Copyright [2019-2025] EMBL-European Bioinformatics Institute
2222

2323
# Ensembl Genomes
2424
my $RESTURL = 'http://rest.ensembl.org';
@@ -273,15 +273,23 @@ sub help_message {
273273
$wga_coverage, $high_confidence
274274
) = split(/\t/);
275275

276-
if ( $species ne $ref_genome ) {
276+
next if( !$supported{$species} || !$supported{$hom_species} );
277+
278+
# ref genome forced to be species as opposed to hom_species
279+
if( $hom_species eq $ref_genome ) {
280+
($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id);
281+
($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id);
282+
($species, $hom_species) = ($hom_species, $species);
283+
($identity, $hom_identity) = ($hom_identity, $identity);
284+
}
285+
286+
if ( $species ne $ref_genome ) {
277287
if ( keys(%present) == $n_of_species ) {
278288
last;
279289
} # in case all-vs-all file is used
280290
else { next }
281291
}
282292

283-
next if ( !$supported{$hom_species} || $hom_species eq $ref_genome );
284-
285293
if ( defined($high_confidence) ) {
286294
next
287295
if ( $LOWCONF == 0
@@ -298,7 +306,6 @@ sub help_message {
298306
&& $homology_type eq 'ortholog_one2many' )
299307
)
300308
{
301-
302309
# add $ref_genome protein
303310
if ( !$core{$gene_stable_id} ) {
304311

phylogenomics/ens_syntelogs.pl

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
);
1717

1818
# Retrieves orthologous, syntenic genes (syntelogs) shared by (plant) species in clade
19-
# by querying pre-computed Compara data from Ensembl Genomes with a reference genome.
19+
# by querying pre-computed Compara data from Ensembl with a reference genome.
2020
#
21-
# Copyright [2019-2023] EMBL-European Bioinformatics Institute
21+
# Copyright [2019-2025] EMBL-European Bioinformatics Institute
2222

2323
# Ensembl Genomes
2424
my $RESTURL = 'http://rest.ensembl.org';
@@ -255,15 +255,23 @@ sub help_message {
255255
$wga_coverage, $high_confidence
256256
) = split(/\t/);
257257

258+
next if( !$supported{$species} || !$supported{$hom_species} );
259+
260+
# ref genome forced to be species as opposed to hom_species
261+
if( $hom_species eq $ref_genome ) {
262+
($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id);
263+
($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id);
264+
($species, $hom_species) = ($hom_species, $species);
265+
($identity, $hom_identity) = ($hom_identity, $identity);
266+
}
267+
258268
if ( $species ne $ref_genome ) {
259269
if ( keys(%present) == $n_of_species ) {
260270
last;
261271
} # in case all-vs-all file is used
262272
else { next }
263273
}
264274

265-
next if ( !$supported{$hom_species} || $hom_species eq $ref_genome );
266-
267275
if ( defined($high_confidence) ) {
268276
next
269277
if ( $LOWCONF == 0
@@ -273,8 +281,7 @@ sub help_message {
273281
next if ( $goc_ssynt eq 'NULL' || $goc_ssynt < $GOC );
274282

275283
if ( $homology_type eq 'ortholog_one2one'
276-
|| $homology_type eq 'ortholog_one2many' )
277-
{
284+
|| $homology_type eq 'ortholog_one2many' ) {
278285

279286
# add $ref_genome protein
280287
if ( !$synt{$gene_stable_id} ) {

0 commit comments

Comments
 (0)