Commit c1307d1

Merge branch 'rel-1.5.0' into release

2 parents 9cf5b1d + caeabe6

65 files changed, +11836 −1672 lines


.github/workflows/build-containers.yml

Lines changed: 3 additions & 4 deletions

```diff
@@ -82,10 +82,9 @@ jobs:
         uses: actions/checkout@v4

       - name: Install Nextflow
-        run: |
-          curl -s https://get.nextflow.io | bash
-          chmod +x nextflow
-          mv nextflow /usr/local/bin/
+        uses: nf-core/setup-nextflow@v2
+        with:
+          version: '24.10.5'

       - name: Run Nextflow pipeline
         run: nextflow run main.nf -profile docker,test
```

README.md

Lines changed: 73 additions & 18 deletions

```diff
@@ -88,6 +88,9 @@ Convert OMA run into OMA Browser release
 | Parameter | Description | Type | Default | Required |
 |-----------|-----------|-----------|-----------|-----------|
 | `oma_source` | Selection of OMA data source. Can be either 'FastOMA' or 'Production'. The selection requires setting either the parameters for FastOMA or Production. | `string` | FastOMA | |
+| `oma_version` | Version of the OMA Browser instance. It defaults to 'All.<Mon><YEAR>' | `string` | | | |
+| `oma_release_char` | Release specific character (used in HOG ids) <details><summary>Help</summary><small>A single capital letter [A-Z] which makes the HOG-IDs unique across different releases.</small></details> | `string` | | | |

 ### FastOMA Input data

@@ -109,43 +112,95 @@ Input files generated from an OMA Production run
 | `matrix_file` | OMA Groups file | `string` | | |
 | `hog_orthoxml` | Hierarchical orthologous groups (HOGs) in OrthoXML format | `string` | | True |
 | `genomes_dir` | Folder containing genomes | `string` | | True |
+| `homoeologs_folder` | Folder containing the homoeologs files | `string` | | | |

 ### Domain data

 File paths for domain annotations

 | Parameter | Description | Type | Default | Required |
 |-----------|-----------|-----------|-----------|-----------|
-| `cath_names_path` | File containing CATH domain descriptions | `string` | http://download.cathdb.info/cath/releases/latest-release/cath-classification-data/cath-names.txt | |
-| `known_domains` | Folder containing known domain assignments files | `string` | | |
-| `pfam_names_path` | File containing Pfam descriptions | `string` | https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz | |
+| `infer_domains` | Flag indicating whether domains are inferred using the CATH/Gene3D pipeline. <details><summary>Help</summary><small>If set to true, the pipeline will run the CATH/Gene3D pipeline to infer domain assignments. This will require a substantial amount of compute time. The set of already known domains (see parameter 'known_domains') will be used to skip the inference of domains that are already known. If set to false, the pipeline will use the known domain assignments provided in the 'known_domains' parameter.</small></details> | `boolean` | | | |
+| `known_domains` | Folder containing known domain assignment files. <details><summary>Help</summary><small>The folder must contain csv/tsv files that contain three columns (md5 hash of sequence, CATH-domain-id, region on sequence). The output of a previous run of this pipeline can thus be used as input.</small></details> | `string` | | | |
+| `cath_names_path` | File containing CATH domain descriptions | `string` | http://download.cathdb.info/cath/releases/latest-release/cath-classification-data/cath-names.txt | | |
+| `hmm_db` | Path where the domain HMMs for the CATH/Gene3D pipeline are located. | `string` | ftp://orengoftp.biochem.ucl.ac.uk/gene3d/v21.0.0/gene3d_hmmsearch/hmms.tar.gz | | |
+| `cath_domain_list` | File with mapping from HMM id to CATH domain id. | `string` | http://download.cathdb.info/cath/releases/latest-release/cath-classification-data/cath-domain-list.txt | | |
+| `discontinuous_regs` | File provided by Gene3D to handle discontinuous regions | `string` | http://download.cathdb.info/gene3d/v21.0.0/gene3d_hmmsearch/discontinuous/discontinuous_regs.pkl | | |
+| `pfam_names_path` | File containing Pfam descriptions | `string` | https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz | | |

 ### Crossreferences

 Integrate crossreferences

-| Parameter | Description | Type | Default | Required |
-|-----------|-----------|-----------|-----------|-----------|
-| `xref_uniprot_swissprot` | UniProtKB/SwissProt annotation in text format | `string` | https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz | |
-| `xref_uniprot_trembl` | UniProtKB/TrEMBL annotations in text format | `string` | /dev/null | |
-| `taxonomy_sqlite_path` | | `string` | | |
-| `xref_refseq` | Folder containing RefSeq gbff files. | `string` | | |
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `xref_uniprot_swissprot` | UniProtKB/SwissProt annotation in text format | `string` | https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz | | |
+| `xref_uniprot_trembl` | UniProtKB/TrEMBL annotations in text format. <details><summary>Help</summary><small>If not provided, no TrEMBL cross-references will be included. The generic ftp url for TrEMBL is https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz</small></details> | `string` | | | |
+| `taxonomy_sqlite_path` | Path to a sqlite database containing the combined NCBI/GTDB taxonomy data. <details><summary>Help</summary><small>If not provided, it will be generated automatically and cached.</small></details> | `string` | | | |
+| `xref_refseq` | 'download' or folder containing RefSeq gbff files. <details><summary>Help</summary><small>If not specified, no RefSeq crossreferences will be downloaded (default). If set to 'download', the latest RefSeq gbff files will be downloaded from the NCBI FTP server. Alternatively, a folder containing local *.gbff.gz files can be provided.</small></details> | `string` | | | |

 ### Gene Ontology

 Gene Ontology files to integrate

-| Parameter | Description | Type | Default | Required |
-|-----------|-----------|-----------|-----------|-----------|
-| `go_obo` | Gene Ontology OBO file | `string` | http://purl.obolibrary.org/obo/go/go-basic.obo | |
-| `go_gaf` | Gene Ontology annotations (GAF format). This can be the GOA database or a glob pattern with local files in GAF format. | `string` | https://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz | |
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `go_obo` | Gene Ontology OBO file | `string` | http://purl.obolibrary.org/obo/go/go-basic.obo | | |
+| `go_gaf` | Gene Ontology annotations (GAF format). This can be the GOA database or a glob pattern with local files in GAF format. | `string` | https://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz | | |
+
+### OMAmer
+
+Parameters regarding building OMAmer databases based on the generated OMA instance
+
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `omamer_levels` | Comma-separated list of taxonomic levels for which OMAmer databases should be built. <details><summary>Help</summary><small>The input string is parsed as a comma-separated list, e.g. given 'Mammalia,Primates' as parameter value would build two OMAmer databases, one for Mammalia and one for Primates. Note that the taxonomic levels must exist in the input species tree.</small></details> | `string` | | | |
+
+### Exporting as RDF
+
+Parameters regarding the export as RDF triples
+
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `rdf_export` | Flag to activate export as RDF triples <details><summary>Help</summary><small>Activating rdf_export will enable the dump of RDF ttl files which can be imported into a SPARQL endpoint.</small></details> | `boolean` | | | |
+| `rdf_orthOntology` | User-provided orthOntology file. If not provided, the default ontology will be used | `string` | | | |
+| `rdf_prefixes` | User-provided RDF prefix mapping. If not provided, default prefixes will be used. | `string` | | | |
+
+### Production OMA output settings
+
+Parameters concerning additional output files usually needed for the production OMA Browser instance
+
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `oma_dumps` | Flag to activate dumping various files for the download section <details><summary>Help</summary><small>Activating oma_dumps will enable species, sequences, and GO annotation files as text files for the download section.</small></details> | `boolean` | | | |

 ### Generic options

 Less common options for the pipeline, typically set in a config file.

-| Parameter | Description | Type | Default | Required |
-|-----------|-----------|-----------|-----------|-----------|
-| `custom_config_version` | version of configuration base to include (nf-core configs) | `string` | master | |
-| `custom_config_base` | location where to look for nf-core/configs | `string` | https://raw.githubusercontent.com/nf-core/configs/master | |
-
+| Parameter | Description | Type | Default | Required | Hidden |
+|-----------|-----------|-----------|-----------|-----------|-----------|
+| `help` | Display help text. | `boolean` | | | True |
+| `custom_config_version` | version of configuration base to include (nf-core configs) | `string` | master | | True |
+| `custom_config_base` | location where to look for nf-core/configs | `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True |
```
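As the `omamer_levels` help text above describes, the parameter value is parsed as a comma-separated list of taxonomic levels, one OMAmer database per level. A minimal Python sketch of that parsing; the function name `parse_omamer_levels` is hypothetical and not part of the pipeline:

```python
def parse_omamer_levels(value: str) -> list[str]:
    """Split a comma-separated levels string into clean level names.

    Hypothetical helper illustrating how a value such as
    'Mammalia,Primates' yields one OMAmer database per level.
    """
    return [level.strip() for level in value.split(",") if level.strip()]


levels = parse_omamer_levels("Mammalia, Primates")
print(levels)  # ['Mammalia', 'Primates']
```

Each returned level name would then still need to exist in the input species tree, as the parameter help notes.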

config/base.config

Lines changed: 100 additions & 8 deletions

```diff
@@ -69,14 +69,14 @@ process {
     }

     withName: '.*:BUILD_HOG_H5' {
-        cpus = { 1 }
-        memory = { 500.MB + 500.B * orthoxml.size() * (2*task.attempt-1) }
-        time = { 1.min * (Math.ceil( meta.nr_of_genomes / 3 ) + 10) * task.attempt }
+        cpus = { meta.nr_of_taxa > 250 ? 3 : 1 }
+        memory = { 1.GB + 20.KB * Math.ceil(orthoxml.size() / 1024) * (2*task.attempt-1) }
+        time = { 1.min * (Math.ceil( meta.nr_of_genomes / 3 ) + Math.ceil( meta.nr_of_taxa / (meta.nr_of_taxa > 250 ? 9 : 3)) + 10) * task.attempt }
     }

     withName: ".*:ADD_PAIRWISE_ORTHOLOGS" {
         cpus = { meta.nr_of_genomes < 10 ? 2 : (meta.nr_of_genomes < 30 ? 4 : (meta.nr_of_genomes < 300 ? 6 : 12)) }
-        memory = { 3.GB + 10.B * (meta.max_nr_seqs_in_genome * meta.nr_of_genomes * 10 as Long) * task.attempt }
+        memory = { 3.GB + 100.KB * (Math.ceil(meta.max_nr_seqs_in_genome/1024) * meta.nr_of_genomes) * task.attempt }
         time = { 1.min * (Math.ceil( meta.nr_of_genomes / 2 ) + 10) * task.attempt }
     }

@@ -88,20 +88,112 @@ process {

     withName: "INFER_FINGERPRINTS" {
         cpus = { 1 }
-        memory = { (800.MB + meta.nr_of_amino_acids * 2.B) * task.attempt }
-        time = { (10.min + Math.ceil(meta.nr_of_sequences / 100) * 1.sec) * (2 * task.attempt - 1) }
+        memory = { (800.MB + meta.nr_of_amino_acids * 24.B) * task.attempt }
+        time = { 1.min * (0.5 * meta.nr_of_amino_acids / Math.pow(2,20) * Math.log(meta.nr_of_amino_acids / Math.pow(2, 20)) + 10) * task.attempt }
     }

     withName: "INFER_KEYWORDS" {
         cpus = { 1 }
-        memory = { (800.MB + meta.nr_of_sequences * 500.B) * task.attempt }
+        memory = { (800.MB + meta.nr_of_sequences * 300.B) * task.attempt }
         time = { (10.min + Math.ceil(meta.nr_of_sequences / 500) * 1.sec) * (2 * task.attempt - 1) }
     }

+    withName: "IDENTIFY_PROTEINS_WITHOUT_DOMAIN_ANNOTATION" {
+        cpus = { 1 }
+        memory = {
+            def tot_size = domain_files.collect { it.size() }.sum()
+            // scale memory with input size, but at least 6GB
+            def gb = Math.max(6, 4 * Math.ceil(tot_size / Math.pow(2,30)) + 3)
+            return gb * 1.GB * task.attempt
+        }
+        time = { 4.h * task.attempt }
+    }
+
+    withName: "INFER_HOG_PROFILES" {
+        cpus = { (meta.nr_of_sequences < 1000000 ? 2 : (meta.nr_of_sequences < 10000000 ? 4 : 6)) * task.attempt }
+        time = { (meta.nr_of_sequences > 10000000 ? 24.h : 8.h) * task.attempt }
+        // memory should be 6GB per cpu.
+        memory = { 6.GB * (meta.nr_of_sequences < 1000000 ? 2 : (meta.nr_of_sequences < 10000000 ? 4 : 6)) * task.attempt }
+    }
+
     withName: "HMMER_HMMSEARCH" {
         cpus = { 4 }
         memory = { 1.GB * (2*task.attempt-1) }
         time = { 2.h * (2*task.attempt-1) }
     }

-}
+    withName: ".*:BUILD_VPTAB_DATABASE" {
+        cpus = { 8 }
+        memory = { Math.max(Math.ceil(db.size()/Math.pow(2, 30)), 20) * 1.GB * task.attempt * task.cpus }
+        time = { 12.h * task.attempt }
+    }
+
+    withName: ".*:COMPUTE_CACHE" {
+        cpus = { 1 }
+        memory = { 6.GB * task.attempt }
+        time = { 24.h * task.attempt }
+    }
+
+    withName: ".*:COMBINE_H5_FILES" {
+        cpus = { 1 }
+        memory = { 3.GB + meta.nr_of_sequences * 2.KB * task.attempt }
+        time = { (2.h + meta.nr_of_genomes * 10.sec) * (2 * task.attempt - 1) }
+    }
+
+    withName: ".*:FILTER_AND_SPLIT" {
+        cpus = {
+            def nr_files = xref instanceof List ? xref.size() : 1
+            def base_nr = nr_files < 4 ? 1 : (nr_files < 12 ? 2 : 6)
+            return base_nr * task.attempt
+        }
+        memory = { 3.GB + task.cpus * 400.MB * task.attempt }
+        time = { 8.h * task.attempt }
+    }
+
+    withName: ".*:MAP_XREFS" {
+        cpus = { 6 }
+        memory = { 2.GB + task.cpus * Math.ceil(meta.nr_of_sequences) * 1.KB * (2 * task.attempt - 1) }
+        time = { 20.h * (2 * task.attempt - 1) }
+    }
+
+    withName: ".*:COLLECT_XREFS" {
+        cpus = { 1 }
+        memory = {
+            def nr_files = map_results instanceof List ? map_results.size() : 1
+            return 6.GB + (nr_files * 128.MB) * task.attempt
+        }
+        time = { 12.h * task.attempt }
+    }
+
+    withName: ".*:COMBINE_ALL_XREFS" {
+        cpus = { 1 }
+        memory = {
+            def total_size = 0
+            if (xref_dbs instanceof List) {
+                total_size = 2 * xref_dbs.collect { it.size() }.sum() / Math.pow(2,20)
+            } else {
+                total_size = 2 * xref_dbs.size() / Math.pow(2,20)
+            }
+            def log2_scale = Math.log(Math.max(total_size, 2)) / Math.log(2)
+            return (6.GB + 1.MB * total_size * log2_scale) * task.attempt
+        }
+        time = { 1.min * Math.ceil(meta.nr_of_sequences / 12000) * (2 * task.attempt - 1) }
+    }
+
+    withName: "OMAMER_BUILD" {
+        cpus = { 1 }
+        memory = {
+            def basemem = 36.GB
+            def multiplier = meta.id == "LUCA" ? 3 : (meta.id == "Metazoa" ? 2.5 : (meta.id == "Viridiplantae" ? 1 : 0.5))
+            return basemem * multiplier * (2*task.attempt-1)
+        }
+        time = { 8.h * task.attempt }
+    }
+
+    withName: "HOGPROP" {
+        // 6.GB -> 24.GB -> 54.GB; some big hogs might need a lot of memory
+        memory = { 6.GB * task.attempt * task.attempt }
+        maxRetries = 2
+    }
+}
```
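These resource closures scale memory with input size and escalate on retry via `task.attempt`; the recurring `(2*task.attempt-1)` factor yields 1x, 3x, 5x on successive attempts. A rough Python model of the new `BUILD_HOG_H5` memory rule, useful for sanity-checking the numbers (the unit constants and function name are illustrative, not part of the config):

```python
import math

KB = 1024
MB = 1024 ** 2
GB = 1024 ** 3


def build_hog_h5_memory_bytes(orthoxml_size: int, attempt: int) -> int:
    """Model of: 1.GB + 20.KB * ceil(orthoxml.size() / 1024) * (2*attempt - 1).

    I.e. a 1 GiB base plus 20 KiB per KiB of orthoxml input,
    growing 1x, 3x, 5x across retry attempts.
    """
    return GB + 20 * KB * math.ceil(orthoxml_size / 1024) * (2 * attempt - 1)


# A 100 MiB orthoxml on the first attempt:
mem = build_hog_h5_memory_bytes(100 * MB, attempt=1)
print(mem / GB)  # 2.953125 GiB: 1 GiB base + ~1.95 GiB proportional part
```

On the second attempt the proportional part triples, so the same input would request roughly 6.9 GiB.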

config/euler_hpc.config

Lines changed: 5 additions & 2 deletions

```diff
@@ -1,19 +1,22 @@
 // Process scope
 process {
     // Node options
-    resourceLimits = [ cpus: 48, memory: 350.GB, time: 72.h ]
+    resourceLimits = [ cpus: 48, memory: 450.GB, time: 72.h ]
     scratch = true
     containerOptions = "--bind /scratch:/scratch"
     beforeScript = 'module load eth_proxy'

     withLabel: HIGH_IO_ACCESS {
-        stageInMode = "copy"
+        //stageInMode = "copy"
         scratch = true
     }
+
 }

 executor {
     name = "slurm"
     perCpuMemAllocation = true
     queueSize = 500
 }
+
+
```
config/test.config

Lines changed: 1 addition & 1 deletion

```diff
@@ -31,6 +31,6 @@ params {
     genomes_dir = "${projectDir}/testdata/fastoma/proteome"
     taxonomy_sqlite_path = "${projectDir}/testdata/taxonomy.sqlite"
     pfam_names_path = "${projectDir}/testdata/Pfam-A.clans.stub.tsv.gz"
-    xref_refseq = "${projectDir}/assets/NO_FILE"
+    cath_names_path = "${projectDir}/testdata/cath-names.txt"
     go_gaf = "${projectDir}/testdata/fastoma/*.goa"
 }
```
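The `go_gaf` test value is a glob pattern rather than a single file, matching the README's note that GO annotations may be given as "a glob pattern with local files in gaf format". A small Python illustration of how such a pattern expands (the file names are made up for the demo):

```python
import glob
import os
import tempfile

# Build a throwaway directory with a couple of .goa files to expand against.
with tempfile.TemporaryDirectory() as tmp:
    for name in ("speciesA.goa", "speciesB.goa", "notes.txt"):
        open(os.path.join(tmp, name), "w").close()

    # Equivalent of expanding "${projectDir}/testdata/fastoma/*.goa".
    matches = sorted(glob.glob(os.path.join(tmp, "*.goa")))
    names = [os.path.basename(m) for m in matches]

print(names)  # ['speciesA.goa', 'speciesB.goa']
```

Only the `.goa` files match; unrelated files in the same folder are ignored.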

containers/oma/Dockerfile

Lines changed: 1 addition & 1 deletion

```diff
@@ -7,7 +7,7 @@ FROM basis AS builder
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
         build-essential \
-        libhdf5-103 \
+        libhdf5-310 \
         libhdf5-dev \
         git-core \
         pkg-config \
```

0 commit comments