diff --git a/conf/modules.config b/conf/modules.config
index 4a653ed..9327925 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -18,11 +18,11 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
-    withName: SEQTK_SAMPLE {
+    withName: 'SEQTK_SAMPLE' {
         ext.args = '-s100'
     }
 
-    withName: FASTQC {
+    withName: 'FASTQC' {
         ext.args = '--quiet'
     }
 
@@ -35,6 +35,17 @@ process {
         ]
     }
 
+    withName: 'RUNDIRPARSER' {
+        publishDir = [
+            path: { "${params.outdir}/rundirparser" },
+            mode: params.publish_dir_mode,
+            // The process _mqc.txt outputs should have identical names for the same sequencing platforms
+            // in order to be grouped together in the MultiQC report, but here we need to enforce uniqueness
+            // to avoid overwriting results in the publishDir.
+            saveAs: { filename -> filename.equals('versions.yml') ? null : "${dir_meta.dirname}_$filename" }
+        ]
+    }
+
     withName: 'MULTIQC_GLOBAL' {
         ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
         publishDir = [
diff --git a/modules/local/rundirparser/environment.yml b/modules/local/rundirparser/environment.yml
new file mode 100644
index 0000000..046f88d
--- /dev/null
+++ b/modules/local/rundirparser/environment.yml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - pip
+  - pip:
+      - PyYAML==6.0.2
+      - xmltodict
diff --git a/modules/local/rundirparser/main.nf b/modules/local/rundirparser/main.nf
new file mode 100644
index 0000000..c702e57
--- /dev/null
+++ b/modules/local/rundirparser/main.nf
@@ -0,0 +1,43 @@
+process RUNDIRPARSER {
+    tag "$rundir.simpleName"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/41/412df2cdcf04e0a12971ba61b12cacaa5a49705442afe99ad96668bebbb8f880/data' :
+        'community.wave.seqera.io/library/pip_pyyaml_xmltodict:a4e48bd1ab4b6a53' }"
+
+    input:
+    tuple val(dir_meta), path(rundir)
+
+    output:
+    tuple val(dir_meta), path("*_mqc.*"), emit: multiqc
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    # TODO: check what kind of seq platform this is to decide which script to use
+    rundirparser.py ${rundir}
+    parse_illumina.py ${rundir}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        Python: \$(python --version |& sed '1!d ; s/Python //')
+        PyYAML: \$(python -c "import yaml; print(yaml.__version__)")
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch rundir_mqc.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        Python: stub_version
+        PyYAML: stub_version
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/rundirparser/meta.yml b/modules/local/rundirparser/meta.yml
new file mode 100644
index 0000000..7027a80
--- /dev/null
+++ b/modules/local/rundirparser/meta.yml
@@ -0,0 +1,51 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "rundirparser"
+description: Parse a sequencing run directory and emit instrument metadata as MultiQC custom content
+keywords:
+  - run directory
+  - sequencing
+  - metadata
+  - multiqc
+tools:
+  - "rundirparser":
+      description: "Local scripts that extract run metadata (e.g. from RunParameters.xml) and write MultiQC custom-content files"
+      homepage: ""
+      documentation: ""
+      tool_dev_url: ""
+      doi: ""
+      licence:
+      identifier:
+
+input:
+  - - dir_meta:
+        type: map
+        description: |
+          Groovy Map containing run directory information
+          e.g. `[ tags:['tag1'], dirname:'run1' ]`
+    - rundir:
+        type: directory
+        description: Sequencing run directory to parse
+        pattern: "*"
+
+output:
+  - multiqc:
+      - dir_meta:
+          type: map
+          description: |
+            Groovy Map containing run directory information
+            e.g. `[ tags:['tag1'], dirname:'run1' ]`
+      - "*_mqc.*":
+          type: file
+          description: MultiQC custom-content file with parsed run directory metadata
+          pattern: "*_mqc.{txt,yml}"
+  - versions:
+      - "versions.yml":
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+
+authors:
+  - "@kedhammar"
+maintainers:
+  - "@kedhammar"
diff --git a/modules/local/rundirparser/resources/usr/bin/parse_illumina.py b/modules/local/rundirparser/resources/usr/bin/parse_illumina.py
new file mode 100755
index 0000000..5d210ab
--- /dev/null
+++ b/modules/local/rundirparser/resources/usr/bin/parse_illumina.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+import sys
+from pathlib import Path
+
+import xmltodict
+import yaml
+
+
+def read_run_parameters(directory):
+    alt_1 = directory / "runParameters.xml"
+    alt_2 = directory / "RunParameters.xml"
+    if alt_1.exists():
+        with open(alt_1) as f:
+            return xmltodict.parse(f.read())
+    elif alt_2.exists():
+        with open(alt_2) as f:
+            return xmltodict.parse(f.read())
+    else:
+        raise Exception("[Rr]unParameters.xml not found!")
+
+
+def find(d, tag):
+    # Recursively yield every value stored under the given tag
+    if isinstance(d, dict):
+        if tag in d:
+            yield d[tag]
+        for v in d.values():
+            if isinstance(v, dict):
+                yield from find(v, tag)
+            if isinstance(v, list):
+                for i in v:
+                    yield from find(i, tag)
+
+
+def construct_data(run_parameters):
+    # Map platform-specific XML tags to harmonized report labels
+    run_parameters_tags = {
+        "RunId": "Run ID",
+        "RunID": "Run ID",
+        "InstrumentType": "Instrument type",
+        "ApplicationName": "Control software",
+        "Application": "Control software",
+        "ApplicationVersion": "Control software version",
+        "SystemSuiteVersion": "Control software version",
+        "Flowcell": "Flowcell type",
+        "FlowCellMode": "Flowcell type",
+        "ReagentKitVersion": "Reagent kit version",
+        "RTAVersion": "RTA Version",
+        "RtaVersion": "RTA Version",
+    }
+    data = {}
+    for key, value in run_parameters_tags.items():
+        info = list(find(run_parameters, key))
+        if info:
+            data[value] = info[0]
+    return data
+
+
+def construct_multiqc_yaml(directory):
+    directory_name = directory.name
+    run_parameters = read_run_parameters(directory)
+
+    data = construct_data(run_parameters)
+
+    # TODO: MultiQC currently ignores the data in this yaml
+    metadata = {
+        "custom_data": {
+            "my_data_type": {
+                "id": "mqc_seq_metadata",
+                "section_name": "Sequencing instrument metadata",
+                "description": directory_name,
+                "plot_type": "table",
+                "pconfig": {
+                    "id": "custom_table",
+                    "title": "Custom Table",
+                    "no_headers": True,
+                },
+                "data": data,
+            }
+        }
+    }
+
+    return metadata
+
+
+if __name__ == "__main__":
+    rundir_path = Path(sys.argv[1])
+    output_file = "illumina_mqc.yml"
+
+    multiqc_yaml = construct_multiqc_yaml(rundir_path)
+
+    with open(output_file, "w") as f:
+        yaml.dump(multiqc_yaml, f)
diff --git a/modules/local/rundirparser/resources/usr/bin/rundirparser.py b/modules/local/rundirparser/resources/usr/bin/rundirparser.py
new file mode 100755
index 0000000..b8db68d
--- /dev/null
+++ b/modules/local/rundirparser/resources/usr/bin/rundirparser.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import sys
+
+
+def parse_rundir(rundir):
+    # Dummy implementation, replace with actual logic
+
+    sequencing_platform = None
+
+    yml_contents = """# plot_type: 'table'
+# section_name: 'rundir stats'
+# description: 'dummy rundir stats'
+# pconfig:
+#     namespace: 'Cust Data'
+# headers:
+#     col1:
+#         title: '#Seqs'
+#         description: 'Number of sequences'
+#         format: '{:,.0f}'
+#     col2:
+#         title: 'Total bp'
+#         description: 'Total size of the dataset'
+#     col3:
+#         title: 'Avg'
+#         description: 'Average sequence length'
+#     col4:
+#         title: 'N50'
+#         description: '50% of the sequences are longer than this size'
+#     col5:
+#         title: 'N75'
+#         description: '75% of the sequences are longer than this size'
+#     col6:
+#         title: 'N90'
+#         description: '90% of the sequences are longer than this size'
+#     col7:
+#         title: 'Min'
+#         description: 'Length of the shortest sequence'
+#     col8:
+#         title: 'Max'
+#         description: 'Length of the longest sequence'
+#     col9:
+#         title: 'auN'
+#         description: 'Area under the Nx curve'
+#     col10:
+#         title: 'GC'
+#         description: 'Relative GC content (excluding Ns)'
+"""
+    tsv_contents = f"""Sample col1 col2 col3 col4 col5 col6 col7 col8 col9 col10
+{rundir} 10 147806 14780.6000000 22507 16573 15322 22801.9181765 344 33340 NaN
+"""
+
+    contents = yml_contents + tsv_contents
+
+    # File names should be unique between sequencing platforms, but otherwise identical,
+    # so multiple rundirs of the same platform will be written to the same table
+    # in the MultiQC report.
+    outname = f"{sequencing_platform or 'rundirparser'}_mqc.txt"
+
+    with open(outname, "w") as f:
+        f.write(contents)
+
+
+def main():
+    rundir = sys.argv[1]
+    parse_rundir(rundir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/local/rundirparser/tests/main.nf.test b/modules/local/rundirparser/tests/main.nf.test
new file mode 100644
index 0000000..0dc321d
--- /dev/null
+++ b/modules/local/rundirparser/tests/main.nf.test
@@ -0,0 +1,73 @@
+// TODO nf-core: Once you have added the required tests, please run the following command to build this file:
+// nf-core modules test rundirparser
+nextflow_process {
+
+    name "Test Process RUNDIRPARSER"
+    script "../main.nf"
+    process "RUNDIRPARSER"
+
+    tag "modules"
+    tag "modules_"
+    tag "rundirparser"
+
+    // TODO nf-core: Change the test name preferably indicating the test-data and file-format used
+    test("sarscov2 - bam") {
+
+        // TODO nf-core: If you are creating a test for a chained module
+        // (the module requires running more than one process to generate the required output)
+        // add the 'setup' method here.
+        // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules).
+
+        when {
+            process {
+                """
+                // TODO nf-core: define inputs of the process here. Example:
+
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true),
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+                //TODO nf-core: Add all required assertions to verify the test output.
+                // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples.
+            )
+        }
+
+    }
+
+    // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix.
+    test("sarscov2 - bam - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                // TODO nf-core: define inputs of the process here. Example:
+
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true),
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+                //TODO nf-core: Add all required assertions to verify the test output.
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/seqfu/stats/main.nf b/modules/nf-core/seqfu/stats/main.nf
index 0f8bb3e..2ea8e43 100644
--- a/modules/nf-core/seqfu/stats/main.nf
+++ b/modules/nf-core/seqfu/stats/main.nf
@@ -14,7 +14,7 @@ process SEQFU_STATS {
 
     output:
     tuple val(meta), path("*.tsv")    , emit: stats
-    tuple val(meta), path("*_mqc.txt"), emit: multiqc
+    tuple val(meta), path("*_mqc.txt"), emit: multiqc
     path "versions.yml"               , emit: versions
 
     when:
diff --git a/nextflow.config b/nextflow.config
index bb1cb27..caefca9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -6,6 +6,9 @@
 ----------------------------------------------------------------------------------------
 */
 
+// Enable use of module binaries for e.g. module-specific Python scripts
+nextflow.enable.moduleBinaries = true
+
 // Global default params, used in configs
 params {
diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf
index bd936fa..0472ab3 100644
--- a/workflows/seqinspector.nf
+++ b/workflows/seqinspector.nf
@@ -10,6 +10,7 @@ include { SEQTK_SAMPLE            } from '../modules/nf-core/seqtk/sample/
 include { FASTQC                  } from '../modules/nf-core/fastqc/main'
 include { SEQFU_STATS             } from '../modules/nf-core/seqfu/stats'
 include { FASTQSCREEN_FASTQSCREEN } from '../modules/nf-core/fastqscreen/fastqscreen/main'
+include { RUNDIRPARSER            } from '../modules/local/rundirparser/main'
 include { MULTIQC as MULTIQC_GLOBAL  } from '../modules/nf-core/multiqc/main'
 include { MULTIQC as MULTIQC_PER_TAG } from '../modules/nf-core/multiqc/main'
 
@@ -38,6 +39,37 @@ workflow SEQINSPECTOR {
     ch_multiqc_extra_files = Channel.empty()
     ch_multiqc_reports     = Channel.empty()
 
+    //
+    // MODULE: Parse rundir info
+    //
+    if (!("rundirparser" in skip_tools)) {
+
+        // From samplesheet channel serving (sampleMetaObj, sampleReadsPath) tuples:
+        // --> Create new rundir channel serving (rundirMetaObj, rundirPath) tuples
+        ch_rundir = ch_samplesheet
+            // Group by rundir
+            .map { meta, _reads -> [meta.rundir, meta] }
+            .groupTuple()
+            // From all meta objects associated with a given rundir...
+            .map { rundir, metas ->
+                // Collect all unique tags into a list
+                def all_tags = metas.collect { it.tags }.flatten().unique()
+                // Create a new meta object whose attributes are...
+                // 1. tags: The list of merged tags, used for grouping MultiQC reports
+                // 2. dirname: The simple name of the rundir, used for setting unique output names in publishDir
+                def dir_meta = [tags: all_tags, dirname: rundir.simpleName]
+                // Return the new structure, to...
+                // 1. Feed into rundir specific processes
+                // 2. Mix with the ch_multiqc_files channel downstream
+                [dir_meta, rundir]
+            }
+
+        RUNDIRPARSER( ch_rundir )
+
+        ch_multiqc_files = ch_multiqc_files.mix(RUNDIRPARSER.out.multiqc)
+        ch_versions = ch_versions.mix(RUNDIRPARSER.out.versions.first())
+    }
+
     //
     // MODULE: Run Seqtk sample to perform subsampling
     //
@@ -125,14 +157,14 @@
         Channel.fromPath(params.multiqc_logo, checkIfExists: true) :
         Channel.empty()
 
-    summary_params = paramsSummaryMap(
+    summary_params = paramsSummaryMap(
         workflow, parameters_schema: "nextflow_schema.json")
-    ch_workflow_summary = Channel.value(
+    ch_workflow_summary = Channel.value(
         paramsSummaryMultiqc(summary_params))
     ch_multiqc_custom_methods_description = params.multiqc_methods_description ?
         file(params.multiqc_methods_description, checkIfExists: true) :
         file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
-    ch_methods_description = Channel.value(
+    ch_methods_description = Channel.value(
         methodsDescriptionText(ch_multiqc_custom_methods_description))
 
     ch_multiqc_extra_files = ch_multiqc_extra_files.mix(
@@ -147,7 +179,7 @@
 
     MULTIQC_GLOBAL (
         ch_multiqc_files
-            .map { meta, file -> file }
+            .map { _meta, file -> file }
            .mix(ch_multiqc_extra_files)
            .collect(),
         ch_multiqc_config.toList(),
@@ -158,7 +190,7 @@
     )
 
     ch_tags = ch_multiqc_files
-        .map { meta, sample -> meta.tags }
+        .map { meta, _sample -> meta.tags }
         .flatten()
         .unique()
 
@@ -168,13 +200,13 @@
     // Group samples by tag
     tagged_mqc_files = ch_tags
         .combine(ch_multiqc_files)
-        .filter { sample_tag, meta, sample -> sample_tag in meta.tags }
-        .map { sample_tag, meta, sample -> [sample_tag, sample] }
+        .filter { sample_tag, meta, _sample -> sample_tag in meta.tags }
+        .map { sample_tag, _meta, sample -> [sample_tag, sample] }
         .mix(multiqc_extra_files_per_tag)
         .groupTuple()
         .tap { mqc_by_tag }
         .collectFile {
-            sample_tag, samples ->
+            sample_tag, _samples ->
             def prefix_tag = "[TAG:${sample_tag}]"
             [
                 "${prefix_tag}_multiqc_extra_config.yml",
@@ -187,7 +219,7 @@
         }
         .map { file -> [ (file =~ /\[TAG:(.+)\]/)[0][1], file ] }
         .join(mqc_by_tag)
-        .multiMap { sample_tag, config, samples ->
+        .multiMap { _sample_tag, config, samples ->
             samples_per_tag: samples.flatten()
             config: config
         }