diff --git a/phylogenomics/PlantCompUtils.pm b/phylogenomics/PlantCompUtils.pm index dc2362f3b..330491e36 100644 --- a/phylogenomics/PlantCompUtils.pm +++ b/phylogenomics/PlantCompUtils.pm @@ -1,7 +1,7 @@ package PlantCompUtils; require Exporter; -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute @ISA = qw(Exporter); @EXPORT_OK = qw( @@ -23,8 +23,7 @@ use Time::HiRes; use HTTP::Tiny; use DBI; -# Fungi Protists Metazoa have collections and one all-vs-all TSV file -# This code won't work there +# Only tested in Plants; Fungi Protists Metazoa have collections, code will need tweaking our @DIVISIONS = qw( Plants ); our $FTPURL = 'ftp.ensemblgenomes.org'; our $COMPARADIR = '/pub/xxx/current/tsv/ensembl-compara/homologies'; @@ -180,11 +179,11 @@ sub get_gene_coords_GTF_file { || die "# ERROR(get_gene_coords_GTF_file): cannot open $GTF_filename\n"; while ( my $line = ) { - #1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";... + #1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";... + #C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";... if ( $line =~ - m/^([^#])\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/ - ) - { + m/^([^#]+)\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/) { + ( $chr, $start, $end, $strand, $geneid ) = ( $1, $2, $3, $4, $5 ); push( @chr_sorted_gene_ids, [ $geneid, $chr, $start, $end, $strand ] ); @@ -258,7 +257,7 @@ sub download_GTF_file { # download compressed TSV file from FTP site, renames it # and saves it in $targetdir; uses FTP globals defined above -# NOTE: if species file is not found it tries the bulky all-vs-all file +# NOTE: tries only the bulky all-vs-all file (GBs) sub download_compara_TSV_file { my ( $dir, $ref_genome, $targetdir ) = @_; @@ -274,32 +273,18 @@ sub download_compara_TSV_file { || die "# ERROR(download_compara_TSV_file): cannot change working directory to $dir " . $ftp->message(); - # find out which file is to be downloaded - if ( $ftp->cwd($ref_genome) ) { - foreach my $file ( $ftp->ls() ) { - if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { - $compara_file = $file; - $stored_compara_file = "$targetdir/$compara_file"; - $stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/; - last; - } - } - } - else { # try all-vs-all file instead (Fungi, Protists, Metazoa) - - print "# WARNING(download_compara_TSV_file): cannot find ". - "$ref_genome in $dir, try all-vs-all\n"; - - foreach my $file ( $ftp->ls() ) { - if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { - $compara_file = $file; - $stored_compara_file = "$targetdir/$compara_file"; - foreach my $div (@DIVISIONS) { - if ( $dir =~ m/($div)/i ) { - $div = $1; - $stored_compara_file =~ s/tsv.gz/$div.tsv.gz/; - last; - } + # find file to be downloaded + print "# WARNING(download_compara_TSV_file): try all-vs-all\n"; + + foreach my $file ( $ftp->ls() ) { + if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { + $compara_file = $file; + $stored_compara_file = "$targetdir/$compara_file"; + foreach my $div (@DIVISIONS) { + if ( $dir =~ m/($div)/i ) { + $div = $1; + $stored_compara_file =~ s/tsv.gz/$div.tsv.gz/; + last; } } } diff --git a/phylogenomics/ens_sequences.pl b/phylogenomics/ens_sequences.pl index c0b3945d2..60f5e5caa 100755 --- a/phylogenomics/ens_sequences.pl +++ b/phylogenomics/ens_sequences.pl @@ -18,7 +18,7 @@ # Uses canonical transcripts, used in the gene tree analysis, # which usually are the longest translation with no stop codons # -# Copyright [2019-2021] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; diff --git a/phylogenomics/ens_single-copy_core_genes.pl b/phylogenomics/ens_single-copy_core_genes.pl index 0893baec7..606adb538 100755 --- a/phylogenomics/ens_single-copy_core_genes.pl +++ b/phylogenomics/ens_single-copy_core_genes.pl @@ -15,10 +15,10 @@ ); # Retrieves single-copy orthologous genes/proteins shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl Genomes with a reference genome. +# by querying pre-computed Compara data from Ensembl with a reference genome. # Multiple copies are optionally allowed for selected or all species. # -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; @@ -273,15 +273,23 @@ sub help_message { $wga_coverage, $high_confidence ) = split(/\t/); - if ( $species ne $ref_genome ) { + next if( !$supported{$species} || !$supported{$hom_species} ); + + # ref genome forced to be species as opposed to hom_species + if( $hom_species eq $ref_genome ) { + ($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id); + ($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id); + ($species, $hom_species) = ($hom_species, $species); + ($identity, $hom_identity) = ($hom_identity, $identity); + } + + if ( $species ne $ref_genome ) { if ( keys(%present) == $n_of_species ) { last; } # in case all-vs-all file is used else { next } } - next if ( !$supported{$hom_species} || $hom_species eq $ref_genome ); - if ( defined($high_confidence) ) { next if ( $LOWCONF == 0 @@ -298,7 +306,6 @@ sub help_message { && $homology_type eq 'ortholog_one2many' ) ) { - # add $ref_genome protein if ( !$core{$gene_stable_id} ) { diff --git a/phylogenomics/ens_syntelogs.pl b/phylogenomics/ens_syntelogs.pl index 3457ce495..550f98c59 100755 --- a/phylogenomics/ens_syntelogs.pl +++ b/phylogenomics/ens_syntelogs.pl @@ -16,9 +16,9 @@ ); # Retrieves orthologous, syntenic genes (syntelogs) shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl Genomes with a reference genome. +# by querying pre-computed Compara data from Ensembl with a reference genome. # -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; @@ -255,6 +255,16 @@ sub help_message { $wga_coverage, $high_confidence ) = split(/\t/); + next if( !$supported{$species} || !$supported{$hom_species} ); + + # ref genome forced to be species as opposed to hom_species + if( $hom_species eq $ref_genome ) { + ($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id); + ($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id); + ($species, $hom_species) = ($hom_species, $species); + ($identity, $hom_identity) = ($hom_identity, $identity); + } + if ( $species ne $ref_genome ) { if ( keys(%present) == $n_of_species ) { last; @@ -262,8 +272,6 @@ sub help_message { else { next } } - next if ( !$supported{$hom_species} || $hom_species eq $ref_genome ); - if ( defined($high_confidence) ) { next if ( $LOWCONF == 0 @@ -273,8 +281,7 @@ sub help_message { next if ( $goc_ssynt eq 'NULL' || $goc_ssynt < $GOC ); if ( $homology_type eq 'ortholog_one2one' - || $homology_type eq 'ortholog_one2many' ) - { + || $homology_type eq 'ortholog_one2many' ) { # add $ref_genome protein if ( !$synt{$gene_stable_id} ) {