Merge branch 'remote-test-files' into dev

alpae · alpae · commit 293476b5645d · 2025-12-08T11:24:19.000+01:00
diff --git a/FastOMA.nf b/FastOMA.nf
diff --git a/FastOMA/fastoma_notebook_stat.ipynb b/FastOMA/fastoma_notebook_stat.ipynb
@@ -89,8 +89,8 @@
    },
    "source": [
     "output_folder = \"Output\"\n",
-    "input_folder = \"testdata/in_folder\"\n",
-    "proteome_folder = input_folder + \"/proteome\"\n",
+    "input_path = \"testdata/in_folder\"\n",
+    "proteome_folder = input_path + \"/proteome\"\n",
     "min_sequence_length = 50"
    ],
    "outputs": [],
diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ any installation steps given the system supports running either docker container
 installed.
 
 ```bash
-nextflow run dessimozlab/FastOMA -profile docker  --input_folder /path/to/in_folder --output_folder /path/to/out_folder 
+nextflow run dessimozlab/FastOMA -profile docker  --input /path/to/in_folder --output_folder /path/to/out_folder 
 ```
 You can also pin a specific version by adding `-r v0.4.0` to the command line. Without an `-r` argument, 
 the latest available release is always used. With `-r dev`, the latest development version is used.
@@ -85,7 +85,7 @@ git checkout version, you can specify this in the following way:
 ```bash
 nextflow run FastOMA.nf -profile docker \
     --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \
-    --input_folder testdata/in_folder \
+    --input testdata/in_folder \
     --output_folder myresult/
 ```
 
@@ -126,7 +126,7 @@ nextflow run FastOMA.nf -profile docker --container_version "sha-$(git rev-list
 
 - run the pipeline on some test data (for more details, see the section [How to run FastOMA on the test data](https://github.com/DessimozLab/fastoma?tab=readme-ov-file#how-to-run-fastoma-on-the-test-data))
   ```bash
-  nextflow run FastOMA.nf -profile standard --input_folder testdata/in_folder --output_folder output -with-report
+  nextflow run FastOMA.nf -profile standard --input testdata/in_folder --output_folder output -with-report
   ```
 
 
@@ -172,7 +172,7 @@ mamba activate FastOMA
 Afterwards, you can run the workflow using nextflow (which is installed as part of the conda environment)
 
 ```
-nextflow run FastOMA.nf -profile standard|slurm --input_folder /path/to/input_folder --output_folder /path/to/output
+nextflow run FastOMA.nf -profile standard|slurm --input /path/to/input --output_folder /path/to/output
 ```
 Note that you should use either the `standard` or the `slurm` profile so that the Nextflow executor uses the activated environment.
 
@@ -191,7 +191,7 @@ One can select the desired container via the `--container_version` argument
 ```
 nextflow run FastOMA.nf -profile docker \
     --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \
-    --input_folder testdata/in_folder \
+    --input testdata/in_folder \
     --output_folder myresult/
 ```
 This will use the container that is tagged with the current commit id. Similarly, one could also use 
@@ -251,7 +251,7 @@ Finally, run the package using nextflow as below:
 ```
 # cd FastOMA/testdata
 nextflow run ../FastOMA.nf  \
-         --input_folder in_folder  \
+         --input in_folder  \
          --omamer_db in_folder/omamerdb.h5 \
          --output_folder out_folder \
          --report \
@@ -421,7 +421,7 @@ For running on a SLURM cluster, you can add the slurm profile argument:  `-profi
 # ls ../FastOMA.nf 
 
 nextflow ../FastOMA.nf -profile slurm \
-   --input_folder in_folder \
+   --input in_folder \
    --output_folder out_folder
 ```
 
@@ -468,6 +468,7 @@ Citation:  Majidian, Sina, Yannis Nevers, Ali Yazdizadeh Kharrazi, Alex Warwick
 
 ## Change log
 - Update  v0.5dev (not released yet):
+  - renamed the `input_folder` parameter to `input`; `input` now also accepts (remote) archive files such as tarballs.
   - better configuration setup (close to nf-core)
   - improved resource allocation for nextflow
   - improved handling of alternative splicing variants in reporting
diff --git a/conf/test-fungi.config b/conf/test-fungi.config
@@ -0,0 +1,7 @@
+// Test profile: fungi dataset (fungi-30.tgz archive hosted on Zenodo)
+
+params {
+    test_data_url         = "https://zenodo.org/records/17434495/files/fungi-30.tgz?download=1"
+    report                = true
+    omamer_db             = "${projectDir}/testdata/test.h5"
+}
diff --git a/conf/test-mammalia.config b/conf/test-mammalia.config
@@ -0,0 +1,6 @@
+// Test profile: mammalia dataset (mammalia-22.tgz archive hosted on Zenodo)
+
+params {
+    test_data_url         = "https://zenodo.org/records/17434495/files/mammalia-22.tgz?download=1"
+    report                = true
+}
diff --git a/conf/test.config b/conf/test.config
@@ -9,12 +9,12 @@ process {
 
 params {
     omamer_db             = "${projectDir}/testdata/test.h5"
-    input_folder          = "${projectDir}/testdata/in_folder"
+    input                 = "${projectDir}/testdata/in_folder"
     report                = true
 
     // derived parameters
-    proteome_folder       = "${params.input_folder}/proteome"
-    hogmap_in             = "${params.input_folder}/hogmap_in"
-    splice_folder         = "${params.input_folder}/splice"
-    species_tree          = "${params.input_folder}/species_tree.nwk"
+    proteome_folder       = "${params.input}/proteome"
+    hogmap_in             = "${params.input}/hogmap_in"
+    splice_folder         = "${params.input}/splice"
+    species_tree          = "${params.input}/species_tree.nwk"
 }
diff --git a/nextflow.config b/nextflow.config
@@ -30,12 +30,16 @@ manifest {
 params {
   // default parameters for test run
   // these can be overridden by the user on the command line
-  input_folder = null
+  input           = null
   // input sub-folders, can also be somewhere else
-  proteome_folder = "${params.input_folder}/proteome"
-  hogmap_in       = "${params.input_folder}/hogmap_in"
-  splice_folder   = "${params.input_folder}/splice"
-  species_tree    = "${params.input_folder}/species_tree.nwk"
+  proteome_folder = "${params.input}/proteome"
+  hogmap_in       = "${params.input}/hogmap_in"
+  splice_folder   = "${params.input}/splice"
+  species_tree    = "${params.input}/species_tree.nwk"
+  // cache path for (remote) archive input files
+  test_data_cache = null
+  // Keep deprecated parameter for backward compatibility
+  input_folder = null
 
   // main output folder
   output_folder   = "Output"
@@ -144,8 +148,10 @@ profiles {
     slurm {
         includeConfig 'conf/slurm_basic.config'
     }
-    test      { includeConfig 'conf/test.config'             }
-    large     { includeConfig 'conf/base_large.config'       }
+    test      { includeConfig 'conf/test.config'          }
+    large     { includeConfig 'conf/base_large.config'    }
+    mammalia  { includeConfig 'conf/test-mammalia.config' }
+    fungi     { includeConfig 'conf/test-fungi.config'    }
 }
 
 // Capture exit codes from upstream processes when piping
@@ -183,4 +189,4 @@ dag {
     enabled = !params.help && params.report
     file    = "${params.statsdir}/pipeline_dag_${params.trace_report_suffix}.html"
     overwrite = true
-}
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -9,19 +9,24 @@
       "title": "Input options",
       "type": "object",
       "description": "Define where the pipeline should find input data ",
-      "required": ["input_folder", "species_tree"],
+      "required": ["input", "species_tree"],
       "properties": {
-        "input_folder": {
+        "input": {
           "type": "string",
-          "format": "directory-path",
-          "description": "Path to input directory containing proteomes and species tree",
-          "fa_icon": "fas fa-folder-open"
+          "description": "Input data source: local directory, archive file, or remote URL",
+          "help": "Can be: (1) Path to a local directory containing proteome/ subfolder and species_tree.nwk file, (2) Path to a local archive file (.tar.gz, .tgz, .zip), or (3) HTTP/HTTPS URL to download an archive. Archives will be automatically extracted and cached.",
+          "fa_icon": "fas fa-folder-open",
+          "examples": [
+            "/path/to/dataset/",
+            "/path/to/dataset.tar.gz",
+            "https://zenodo.org/records/12345/files/dataset.tar.gz"
+          ]
         },
         "proteome_folder": {
           "type": "string",
           "format": "directory-path",
           "description": "Path to input directory containing the proteome files in fasta format",
-          "help": "If not provided, the proteomes are asumed to be in the input_folder/proteomes directory.",
+          "help": "Override the default proteome folder location. Only used when input is a local directory. If not specified, defaults to input/proteome/.",
           "fa_icon": "fas fa-folder-open"
         },
         "hogmap_in": {
@@ -34,17 +39,17 @@
         "species_tree": {
           "type": "string",
           "format": "file-path",
-          "description": "Path to species tree file",
+          "description": "Path to species tree file in Newick format",
           "fa_icon": "fas fa-tree",
           "pattern": "^\\S+\\.(nhx|nh|nwk)",
-          "help": "The species tree should be in Newick or NHX format. By default, the pipeline looks for a file named species_tree.nwk in the input_folder."
+          "help": "Override the default species tree location. Only used when input is a local directory. If not specified, defaults to input/species_tree.nwk."
         },
         "splice_folder": {
           "type": "string",
           "format": "directory-path",
           "description": "Path to input directory containing the splice files",
           "fa_icon": "fas fa-folder-open",
-          "help": "If provided, FastOMA will use the splice files to identify and handle alternative splicing isoforms in the proteomes and select the best representative isoform for each gene."
+          "help": "If provided, FastOMA will use splice variant information to select representative isoforms for each gene. Only used when input is a local directory."
         },
         "omamer_db": {
           "type": "string",
@@ -53,6 +58,19 @@
           "fa_icon": "fas fa-database",
           "help": "If not provided, the default OMAmer database (LUCA) will be used.",
           "default": "https://omabrowser.org/All/LUCA.h5"
+        },
+        "test_data_cache": {
+          "type": "string",
+          "format": "directory-path",
+          "description": "Path where (remote) input archives will be stored and permanently cached",
+          "fa_icon": "fas fa-folder-open"
+        },
+        "input_folder": {
+          "type": "string",
+          "hidden": true,
+          "format": "directory-path",
+          "description": "DEPRECATED: Use --input instead",
+          "help_text": "This parameter has been renamed to --input. Please update your command line or configuration files."
         }
       }
     },