Merge pull request nf-core#1063 from nf-core/nf-test-conversion

TCLamnidis · web-flow · commit 652b0c16b810 · 2025-05-09T11:35:56.000+02:00
Add nf-test
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,10 +1,13 @@
-name: nf-core CI
 # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
+name: nf-core CI
 on:
   push:
     branches:
-      - dev
+      - "dev"
   pull_request:
+    branches:
+      - "dev"
+      - "master"
   release:
     types: [published]
   workflow_dispatch:
@@ -15,16 +18,31 @@ env:
   NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity
 
 concurrency:
-  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 jobs:
+  # Publishes a JSON list of Nextflow versions as the `matrix` job output:
+  # only "latest-everything" for pull requests targeting `dev`, otherwise
+  # "latest-everything" plus "23.10.0".
+  define_nxf_versions:
+    name: Choose nextflow versions to test against depending on target branch
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.nxf_versions.outputs.matrix }}
+    steps:
+      - id: nxf_versions
+        run: |
+          # This job declares no strategy matrix, so the former
+          # `matrix.NXF_VER != "latest-everything"` clause expanded to an empty
+          # string and was always true; it has been removed.
+          if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.base_ref }}" == "dev" ]]; then
+            echo matrix='["latest-everything"]' | tee -a "$GITHUB_OUTPUT"
+          else
+            echo matrix='["latest-everything", "23.10.0"]' | tee -a "$GITHUB_OUTPUT"
+          fi
+
   test:
     name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})"
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/eager') }}"
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         NXF_VER:
           - "24.04.2"
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@ testing*
 *.pyc
 null/
 .nf-test*
+
diff --git a/conf/modules.config b/conf/modules.config
@@ -1067,19 +1067,6 @@ process {
         ext.args   = { "--profiler ${meta.profiler} --output ${meta.profiler}taxpasta_table.tsv" }
     }
 
-    //
-    // QUALIMAP
-    //
-
-    withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' {
-        tag        = { "${meta.reference}|${meta.sample_id}" }
-        publishDir = [
-            path: { "${params.outdir}/mapstats/qualimap/${meta.reference}/" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-        ]
-    }
-
     //
     // DAMAGE CALCULATION
     //
diff --git a/conf/test.config b/conf/test.config
@@ -44,6 +44,7 @@ params {
     bamfiltering_minreadlength      = 30
     bamfiltering_mappingquality     = 37
     deduplication_tool              = 'markduplicates'
+    bamfiltering_savefilteredbams   = true
 
     // PreSeq
     mapstats_preseq_mode            = 'c_curve'
diff --git a/nf-test.config b/nf-test.config
@@ -0,0 +1,14 @@
+config {
+
+    testsDir "tests"
+    workDir ".nf-test"
+    configFile "tests/nextflow.config"
+    profile ""
+
+    // load the necessary plugins
+    plugins {
+        load "nft-utils@0.0.3"
+        load "nft-vcf@1.0.7"
+    }
+
+}
diff --git a/tests/nextflow.config b/tests/nextflow.config
@@ -0,0 +1,5 @@
+/*
+========================================================================================
+    Nextflow config file for running tests
+========================================================================================
+*/
diff --git a/tests/test.nf.test b/tests/test.nf.test
@@ -0,0 +1,149 @@
+nextflow_pipeline {
+
+    name "Test pipeline: NFCORE_EAGER"
+    script "main.nf"
+    tag "pipeline"
+    tag "nfcore_eager"
+    tag "test"
+
+    test("test_profile") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+
+            ///////////////////
+            // DOCUMENTATION //
+            ///////////////////
+
+            // The contents of each top level results directory should be tested with individually named snapshots.
+            // Within each snapshot, there should be two to three distinct variables, that contain the files to be tested.
+            //    - stable_name_<dir> is for files with variable md5sums (i.e. content) so only names will be compared
+            //    - stable_content_<dir> is for files with stable md5sums (i.e. content) so md5sums will be compared
+            //    - bams_<dir> is for BAM files, where the headerMD5 is checked for stability (since the content can be unstable)
+            // If a directory is fully stable, you can drop `stable_name_*`
+            // If a directory contains no BAMs, you can drop `bams_*`
+
+            // Generate with: nf-test test --tag test --profile docker,test --update-snapshot
+            // Test with:     nf-test test --tag test --profile docker,test
+            // NOTE: BAMs are always only stable in name, because:
+            //   a) sharding breaks header since the shard that was first is named in the header (Fixed in https://github.com/nf-core/eager/pull/1112)
+            //   b) the order of the reads in the BAMs is not stable (sorted, but reads that share a start position can be in any order)
+            //   point b) also causes BAIs to be unstable.
+            //   c) Merging of multiple BAMs with duplicate @RG / @PG tags can cause the header to be unstable (particularly in the case of shards/lanes)
+
+            //////////////////////
+            // DEFINE VARIABLES //
+            //////////////////////
+
+            // Define exclusion patterns for files with unstable contents
+            // NOTE: When a section needs more than a couple of small patterns, consider adding a variable to store the patterns here
+            //       This is particularly important if the patterns excluded in the stable content section should be included in the stable name section
+            def unstable_patterns_auth = [
+                '**/mapped_reads_gc-content_distribution.txt',
+                '**/mapped_reads_nucleotide_content.txt',
+                '**/genome_gc_content_per_window.png',
+                '**/*.{svg,pdf,html,png}',
+                '**/DamageProfiler.log',
+                '**/3p_freq_misincorporations.txt',
+                '**/5p_freq_misincorporations.txt',
+                '**/DNA_comp_genome.txt',
+                '**/DNA_composition_sample.txt',
+                '**/misincorporation.txt',
+                '**/genome_results.txt',
+                ]
+
+            // Check that no files are missing/added
+            // Command legend:                                       Result directory to index      , includeDir: include dirs?, ignore: exclude patterns       , ignoreFile: exclude pattern list , include: include patterns
+            def stable_name_all                 = getAllFilesFromDir("$outputDir/"                  , includeDir: false         , ignore: ['pipeline_info/*']    , ignoreFile: null                 , include: ['*', '**/*'] )
+
+            // Authentication
+            def stable_content_authentication   = getAllFilesFromDir("$outputDir/authentication"    , includeDir: false         , ignore: unstable_patterns_auth , ignoreFile: null                 , include: ['*', '**/*'] )
+            def stable_name_authentication      = getAllFilesFromDir("$outputDir/authentication"    , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: unstable_patterns_auth)
+
+            // Deduplication - TODO -> snapshot both lists are empty!?
+            def stable_content_deduplication    = getAllFilesFromDir("$outputDir/deduplication"     , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.flagstat']  )
+            def stable_name_deduplication       = getAllFilesFromDir("$outputDir/deduplication"     , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.{bam,bai}'] )
+
+            // Final_bams
+            def stable_content_final_bams       = getAllFilesFromDir("$outputDir/final_bams"        , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.flagstat']  )
+            def stable_name_final_bams          = getAllFilesFromDir("$outputDir/final_bams"        , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.{bam,bai}'] )
+
+            // Mapping (incl. bam_input flagstat)
+            def stable_content_mapping          = getAllFilesFromDir("$outputDir/mapping"           , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.flagstat']  )
+            def stable_name_mapping             = getAllFilesFromDir("$outputDir/mapping"           , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.{bam,bai}'] )
+
+            // Preprocessing
+            // NOTE: FastQC html appears stable, but I worry it might just include a day timestamp instead of a full timestamp. To keep the expression simpler I removed both from checksum testing.
+            def stable_content_preprocessing    = getAllFilesFromDir("$outputDir/preprocessing"     , includeDir: false         , ignore: ['**/*.{zip,log,html}'], ignoreFile: null                 , include: ['**/*'] )
+            def stable_name_preprocessing       = getAllFilesFromDir("$outputDir/preprocessing"     , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.{zip,log,html}'] )
+
+            // Read filtering
+            def stable_content_readfiltering    = getAllFilesFromDir("$outputDir/read_filtering"    , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.flagstat']  )
+            def stable_name_readfiltering       = getAllFilesFromDir("$outputDir/read_filtering"    , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.{bam,bai}'] )
+
+            // Genotyping
+            def stable_content_genotyping       = getAllFilesFromDir("$outputDir/genotyping"        , includeDir: false         , ignore: ['**/*.{tbi,vcf.gz}']  , ignoreFile: null                 , include: ['**/*'] )
+            def stable_name_genotyping          = getAllFilesFromDir("$outputDir/genotyping"        , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.tbi'] )
+            // We need to collect the vcfs separately to run more specific md5sum checks on the header (contents are unstable due to the same reasons as BAMs, explained above).
+            def genotyping_vcfs                 = getAllFilesFromDir("$outputDir/genotyping"        , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.vcf.gz'] )
+
+            // Metagenomics
+            def stable_content_metagenomics    = getAllFilesFromDir("$outputDir/metagenomics"       , includeDir: false         , ignore: ['**/*.biom', '**/*table.tsv']          , ignoreFile: null                 , include: ['**/*'] )
+            def stable_name_metagenomics       = getAllFilesFromDir("$outputDir/metagenomics"       , includeDir: false         , ignore: null                   , ignoreFile: null                 , include: ['**/*.biom', '**/*table.tsv'] )
+
+            // MultiQC
+            def stable_name_multiqc             = getAllFilesFromDir("$outputDir/multiqc"           , includeDir: false         , ignore: null                    , ignoreFile: null                , include: ['*', '**/*'] )
+
+            ///////////////////////
+            // DEFINE ASSERTIONS //
+            ///////////////////////
+
+            assertAll(
+                { assert workflow.success },
+                // This checks that there are no missing or additional output files.
+                // Also a good starting point to look at all the files in the output folder that need to be checked in subsequent sections.
+                { assert snapshot( stable_name_all*.name             ).match("all_files") },
+
+                // Checking changes to contents of each section
+                // NOTE: Keep the order of the sections in the alphanumeric order of the output directories.
+                //    Each section should first check stable_content, stable_name second (if applicable).
+                { assert snapshot( stable_content_authentication     , stable_name_authentication*.name   ).match("authentication") },
+                { assert snapshot( stable_content_deduplication      , stable_name_deduplication*.name    ).match("deduplication") },
+                { assert snapshot( stable_content_final_bams         , stable_name_final_bams*.name       ).match("final_bams") },
+                // NOTE: The snapshot section for mapping cannot be named 'mapping'. See https://github.com/askimed/nf-test/issues/279
+                { assert snapshot( stable_content_mapping            , stable_name_mapping*.name          ).match("mapping_output") },
+                { assert snapshot( stable_content_preprocessing      , stable_name_preprocessing*.name    ).match("preprocessing") },
+                { assert snapshot( stable_content_readfiltering      , stable_name_readfiltering*.name    ).match("read_filtering") },
+                { assert snapshot( stable_content_genotyping         , stable_name_genotyping*.name       ).match("genotyping") },
+                // Additional checks on the genotyping VCFs for content. Specifically the md5sums of the header FORMAT, INFO, FILTER, CONTIG lines, and sample names
+                { assert snapshot(
+                    genotyping_vcfs.collect {
+                        file ->
+                        def vcf_head = path(file.toString()).vcf.header
+                        // The header contains lines in the "OTHER" category, which contain a timestamp and/or work dir paths, so we need to filter those out, then calculate md5sums.
+                        def header_md5 = [
+                            vcf_head.getFormatHeaderLines().toString(),
+                            vcf_head.getInfoHeaderLines().toString(),
+                            vcf_head.getFilterLines().toString(),
+                            vcf_head.getIDHeaderLines().toString(),
+                            vcf_head.getGenotypeSamples().toString(),
+                            vcf_head.getContigLines().toString(),
+                        ].join(' ').md5()
+                        file.getName() + ":header_md5," + header_md5
+                    }
+                ).match("genotyping_vcfs")},
+                { assert snapshot( stable_content_metagenomics       , stable_name_metagenomics*.name     ).match("metagenomics") },
+                { assert snapshot( stable_name_multiqc*.name         ).match("multiqc") },
+
+                // Versions
+                { assert new File("$outputDir/pipeline_info/nf_core_eager_software_mqc_versions.yml").exists() },
+
+            )
+        }
+    }
+}
diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap

-Original file line number
+Diff line change
 *.pyc
 null/
 .nf-test*
++