Merge pull request #74 from FloWuenne/add_benchmark_reports

FloWuenne · web-flow · commit fcf5126c00e5 · 2025-02-25T12:01:51.000-05:00
Added process to generate benchmarking reports.
diff --git a/README.md b/README.md
@@ -56,6 +56,26 @@ nextflow run seqeralabs/nf-aggregate \
 
 If you are using a Seqera Platform Enterprise instance that is secured with a private CA SSL certificate not recognized by default Java certificate authorities, you can specify a custom `cacerts` store path through the `--java_truststore_path` parameter and optionally, a password with the `--java_truststore_password`. This certificate will be used to achieve connectivity with your Seqera Platform instance through API and CLI.
 
+### Benchmark reports
+
+If you want to generate a benchmark report comparing multiple runs, you can include a `group` column in your `run_ids.csv` file. This allows you to organize and analyze runs based on custom groupings in the final report.
+
+```
+id,workspace,group
+3VcLMAI8wyy0Ld,community/showcase,group1
+4VLRs7nuqbAhDy,community/showcase,group2
+```
+
+To incorporate AWS cost data into the benchmark report, use the `--benchmark_aws_cur_report` parameter. It should point to a valid AWS Cost and Usage Report (CUR) file in Parquet format; both the CUR 1.0 and CUR 2.0 schemas are supported. The file can be stored locally or in a cloud bucket. To run nf-aggregate and generate benchmark reports, you can use the following command:
+
+```
+nextflow run seqeralabs/nf-aggregate \
+    --input run_ids.csv \
+    --outdir ./results \
+    --generate_benchmark_report \
+    --benchmark_aws_cur_report ./aws_cost_report.parquet
+```
+
 ## Output
 
 The results from the pipeline will be published in the path specified by the `--outdir` and will consist of the following contents:
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
     "$id": "https://raw.githubusercontent.com/seqeralabs/nf-aggregate/main/assets/schema_input.json",
     "title": "nf-aggregate pipeline - params.input schema",
     "description": "Schema for the file provided with params.input",
@@ -10,12 +10,20 @@
             "id": {
                 "type": "string",
                 "pattern": "^[A-Za-z0-9]{9,14}$",
-                "errorMessage": "Please provide a valid Seqera Platform run identifier"
+                "errorMessage": "Please provide a valid Seqera Platform run identifier",
+                "meta": ["id"]
             },
             "workspace": {
                 "type": "string",
                 "pattern": "^[a-zA-Z0-9](?:[a-zA-Z0-9]|[-_](?=[a-zA-Z0-9])){1,38}/[a-zA-Z0-9](?:[a-zA-Z0-9]|[-_](?=[a-zA-Z0-9])){1,38}$",
-                "errorMessage": "Please provide a valid Seqera Platform Workspace name"
+                "errorMessage": "Please provide a valid Seqera Platform Workspace name",
+                "meta": ["workspace"]
+            },
+            "group": {
+                "type": "string",
+                "pattern": "^[a-zA-Z0-9](?:[a-zA-Z0-9]|[-_](?=[a-zA-Z0-9])){1,38}$",
+                "errorMessage": "Please provide a valid group name",
+                "meta": ["group"]
             }
         },
         "required": ["id", "workspace"]
diff --git a/modules/local/benchmark_report/main.nf b/modules/local/benchmark_report/main.nf
@@ -0,0 +1,55 @@
+// Render the Quarto benchmark report from run dumps, optionally adding AWS cost data.
+process BENCHMARK_REPORT {
+
+    container 'cr.seqera.io/scidev/benchmark-reports:sha-b370978'
+
+    input:
+    path run_dumps                // run dump(s) to compare; paired by position with groups
+    val  groups                   // group label for each run dump, in the same order
+    path benchmark_aws_cur_report // AWS CUR file; a falsy value (e.g. []) omits the cost options
+
+    output:
+    path "benchmark_report.html" , emit: benchmark_html
+    path "versions.yml"          , emit: versions
+
+    script:
+    // A path input holding a single file is bound as one path object rather than a list,
+    // so wrap it to keep the index-based pairing with groups valid for a single run.
+    def run_dump_list = run_dumps instanceof java.nio.file.Path ? [run_dumps] : run_dumps
+    def aws_cost_param = benchmark_aws_cur_report ? "--profile cost -P aws_cost:\$TASK_DIR/${benchmark_aws_cur_report}" : ""
+    def benchmark_samplesheet = "benchmark_samplesheet.csv"
+    // One shell echo per run, appending a "group,file_path" row to the samplesheet.
+    def samplesheet_rows = groups.withIndex().collect { group, idx ->
+        "echo \"${group},\$TASK_DIR/${run_dump_list[idx]}\" >> ${benchmark_samplesheet}"
+    }.join('\n')
+    """
+    # Set up R environment from renv
+    export R_LIBS_USER=/project/renv/library/linux-ubuntu-noble/R-4.4/x86_64-pc-linux-gnu
+    TASK_DIR="\$PWD"
+
+    # Setup cache directories
+    export QUARTO_CACHE=/tmp/quarto/cache
+    export XDG_CACHE_HOME=/tmp/quarto
+
+    # Create the benchmark samplesheet csv
+    echo "group,file_path" > ${benchmark_samplesheet}
+    ${samplesheet_rows}
+
+    # Render from /project, then copy the report back into the task directory
+    cd /project
+    quarto render main_benchmark_report.qmd \\
+        -P log_csv:"\$TASK_DIR/"${benchmark_samplesheet} \\
+        $aws_cost_param \\
+        --output-dir . \\
+        --output benchmark_report.html
+
+    cp /project/benchmark_report.html "\$TASK_DIR/"
+    cd "\$TASK_DIR/"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r: \$(R --version | head -1 | sed 's/R version \\([0-9.]*\\).*/\\1/')
+        quarto-cli: \$(quarto --version | head -1 | sed 's/quarto //g')
+END_VERSIONS
+    """
+}
diff --git a/modules/local/benchmark_report/nextflow.config b/modules/local/benchmark_report/nextflow.config
@@ -0,0 +1,10 @@
+process {
+    withName: 'BENCHMARK_REPORT' { // settings applied only to the BENCHMARK_REPORT process
+        publishDir = [
+            path: { "${params.outdir}/${metaOut?.projectName?.replace("/", "_") ?: ""}/benchmark_report" }, // NOTE(review): metaOut is not declared by BENCHMARK_REPORT in this change - confirm it resolves here
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('.json') ? null : filename } // yields null for versions.yml and *.json, the unchanged filename otherwise
+        ]
+        containerOptions = "--user root" // NOTE(review): presumably needed because the script writes under /project - confirm
+    }
+}
diff --git a/nextflow.config b/nextflow.config
@@ -27,6 +27,10 @@ params {
     multiqc_logo                 = null
     skip_multiqc                 = false
 
+    // Benchmark report options
+    generate_benchmark_report                = false
+    benchmark_aws_cur_report     = null
+
     // Boilerplate options
     outdir                       = 'results'
     publish_dir_mode             = 'copy'
@@ -38,7 +42,6 @@ params {
     // Schema validation default options
     validationFailUnrecognisedParams = false
     validationLenientMode            = false
-    validationSchemaIgnoreParams     = ''
     validationShowHiddenParams       = false
     validationSkipDuplicateCheck     = false
     validate_params                  = true
@@ -52,6 +55,7 @@ process {
     errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
     maxRetries    = 1
     maxErrors     = '-1'
+
 }
 
 profiles {
@@ -172,7 +176,7 @@ singularity.registry = 'quay.io'
 
 // Nextflow plugins
 plugins {
-    id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet
+    id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet
 }
 
 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -1,10 +1,10 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
     "$id": "https://raw.githubusercontent.com/seqeralabs/nf-aggregate/main/nextflow_schema.json",
     "title": "seqeralabs/nf-aggregate pipeline parameters",
     "description": "Minimal nf-core pipeline compatible with template",
     "type": "object",
-    "definitions": {
+    "$defs": {
         "input_output_options": {
             "title": "Input/output options",
             "type": "object",
@@ -26,7 +26,8 @@
                     "type": "string",
                     "format": "directory-path",
                     "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
-                    "fa_icon": "fas fa-folder-open"
+                    "fa_icon": "fas fa-folder-open",
+                    "default": "results"
                 }
             }
         },
@@ -67,6 +68,18 @@
                     "type": "boolean",
                     "description": "Skip MultiQC.",
                     "fa_icon": "fas fa-fast-forward"
+                },
+                "generate_benchmark_report": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-tachometer-alt",
+                    "description": "Compile a benchmarking report for Seqera Platform runs."
+                },
+                "benchmark_aws_cur_report": {
+                    "type": "string",
+                    "fa_icon": "fas fa-dollar-sign",
+                    "description": "AWS CUR report from data exports.",
+                    "pattern": "^\\S+\\.parquet$",
+                    "format": "file-path"
                 }
             },
             "required": ["seqera_api_endpoint"]
@@ -127,7 +140,8 @@
                 "modules_testdata_base_path": {
                     "type": "string",
                     "description": "Base path / URL for data used in the modules",
-                    "hidden": true
+                    "hidden": true,
+                    "default": "s3://ngi-igenomes/testdata/nf-core/modules/"
                 },
                 "validate_params": {
                     "type": "boolean",
@@ -169,13 +183,13 @@
     },
     "allOf": [
         {
-            "$ref": "#/definitions/input_output_options"
+            "$ref": "#/$defs/input_output_options"
         },
         {
-            "$ref": "#/definitions/pipeline_options"
+            "$ref": "#/$defs/pipeline_options"
         },
         {
-            "$ref": "#/definitions/generic_options"
+            "$ref": "#/$defs/generic_options"
         }
     ]
 }
diff --git a/subworkflows/local/utils_nf_aggregate/main.nf b/subworkflows/local/utils_nf_aggregate/main.nf
@@ -15,6 +15,7 @@ import java.nio.file.Paths
 include { UTILS_NEXTFLOW_PIPELINE   } from '../../nf-core/utils_nextflow_pipeline/main'
 include { getWorkflowVersion        } from '../../nf-core/utils_nextflow_pipeline/main'
 include { UTILS_NFVALIDATION_PLUGIN } from '../../nf-core/utils_nfvalidation_plugin/main.nf'
+include { samplesheetToList }         from 'plugin/nf-schema'
 
 /*
 ========================================================================================
@@ -53,9 +54,8 @@ workflow PIPELINE_INITIALISATION {
 
     // Read in ids from --input file
     Channel
-        .from(file(params.input))
-        .splitCsv(header:true, sep:',', strip:true)
-        .unique()
+        .fromList(samplesheetToList(params.input, "assets/schema_input.json"))
+        .flatten()
         .set { ch_ids }
 
     emit:
diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf
diff --git a/tower.yml b/tower.yml
@@ -3,3 +3,5 @@ reports:
     display: "MultiQC HTML report"
   "*_gantt.html":
     display: "GANTT plot of task execution in a run, grouped by 'instance-id' if available."
+  benchmark_report.html:
+    display: "Benchmarking HTML report"
diff --git a/workflows/nf_aggregate/main.nf b/workflows/nf_aggregate/main.nf
@@ -5,10 +5,11 @@
 include { SEQERA_RUNS_DUMP     } from '../../modules/local/seqera_runs_dump'
 include { PLOT_RUN_GANTT       } from '../../modules/local/plot_run_gantt'
 include { MULTIQC              } from '../../modules/nf-core/multiqc'
+include { BENCHMARK_REPORT     } from '../../modules/local/benchmark_report'
 include { paramsSummaryMultiqc } from '../../subworkflows/local/utils_nf_aggregate'
 include { getProcessVersions   } from '../../subworkflows/local/utils_nf_aggregate'
 include { getWorkflowVersions  } from '../../subworkflows/local/utils_nf_aggregate'
-include { paramsSummaryMap     } from 'plugin/nf-validation'
+include { paramsSummaryMap     } from 'plugin/nf-schema'
 
 workflow NF_AGGREGATE {
 
@@ -56,6 +57,20 @@ workflow NF_AGGREGATE {
     )
     ch_versions = ch_versions.mix(PLOT_RUN_GANTT.out.versions.first())
 
+    //
+    // MODULE: Generate benchmark report
+    //
+    if (params.generate_benchmark_report) {
+        aws_cur_report = params.benchmark_aws_cur_report ? Channel.fromPath(params.benchmark_aws_cur_report) : []
+
+        BENCHMARK_REPORT (
+            SEQERA_RUNS_DUMP.out.run_dump.collect{it[1]},
+            SEQERA_RUNS_DUMP.out.run_dump.collect{it[0].group},
+            aws_cur_report
+        )
+        ch_versions = ch_versions.mix(BENCHMARK_REPORT.out.versions.first())
+    }
+
     //
     // Collate software versions
     //
diff --git a/workflows/nf_aggregate/nextflow.config b/workflows/nf_aggregate/nextflow.config
@@ -1,3 +1,4 @@
 includeConfig '../../modules/local/seqera_runs_dump/nextflow.config'
 includeConfig '../../modules/local/plot_run_gantt/nextflow.config'
 includeConfig '../../modules/nf-core/multiqc/nextflow.config'
+includeConfig '../../modules/local/benchmark_report/nextflow.config'