From 96b830a566c3c2785006e19ac27982187513e7ae Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:53:24 -0700 Subject: [PATCH 01/33] Add files via upload --- .../Remove_Human_Reads.nf | 161 ++++++++++++++++++ .../Remove_Human_reads.config | 29 ++++ .../example-reads_PE/Sample-1_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-1_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-2_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-2_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-3_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-3_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-4_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-4_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-1_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-2_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-3_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-4_raw.fastq.gz | Bin 0 -> 361 bytes .../unique_sample_ids.txt | 3 + 15 files changed, 193 insertions(+) create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R1.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R2.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R1.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R2.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R1.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R2.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R1.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R2.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-1_raw.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-2_raw.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-3_raw.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-4_raw.fastq.gz create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf 
b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf new file mode 100644 index 00000000..b805e5ba --- /dev/null +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf @@ -0,0 +1,161 @@ + + +log.info """\ + REMOVING HUMAN READS + =================================== + Download DB: ${params.DL_kraken} + Single end reads: ${params.single_end} + Use SampleID file: ${params.specify_reads} + Outputs: ${params.human_db_path} + """ + .stripIndent() + +// Process to set up the human reads database +process set_up_human_db { + tag "Downloading human reads database to ${params.human_db_path}\n" + publishDir path: "$projectDir" + + output: + path "${params.human_db_name}/" + + script: + """ + curl -L -o ${params.human_db_name}.tar.gz https://ndownloader.figshare.com/files/25627058 + + tar -xzvf ${params.human_db_name}.tar.gz + rm ${params.human_db_name}.tar.gz + + """ + +} + +// Process for paired-end (PE) read analysis with Kraken2 +process PE_kraken2 { + + container params.kraken2container + tag "$sample_id" + publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}" + publishDir "$params.kraken_output_dir/reads", pattern: "*.fastq.gz" + + input: + path database + tuple val(sample_id), path(reads_ch) + + + output: + path "${sample_id}-kraken2-output.txt" + path "${sample_id}-kraken2-report.tsv" + path "${sample_id}*.gz" + + script: + """ + kraken2 --db $database --gzip-compressed \ + --threads 2 --use-names --paired \ + --output ${sample_id}-kraken2-output.txt \ + --report ${sample_id}-kraken2-report.tsv \ + --unclassified-out "${sample_id}${params.PE_reads_out_suffix}" \ + ${reads_ch[0]} ${reads_ch[1]} + + gzip ${sample_id}*.fastq + """ +} + +// Process for single-end (SE) read analysis with Kraken2 +process SE_kraken2 { + + container params.kraken2container + tag "$sample_id" + publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}" + publishDir "$params.kraken_output_dir/reads", pattern: "*.fastq.gz" + + input: + path database + tuple val(sample_id), path(reads_ch) + + output: + path "${sample_id}-kraken2-output.txt" + path "${sample_id}-kraken2-report.tsv" + path "${sample_id}${params.SE_reads_out_suffix}.gz" + + script: + """ + kraken2 --db $database --gzip-compressed --threads 2 --use-names \ + --output ${sample_id}-kraken2-output.txt \ + --report ${sample_id}-kraken2-report.tsv \ + --unclassified-out "${sample_id}${params.SE_reads_out_suffix}" \ + ${reads_ch[0]} + + gzip ${sample_id}${params.SE_reads_out_suffix} + """ +} + + +// Main workflow logic +workflow { + +// Conditionally download the human reads database if requested +if(params.DL_kraken == true){ + log.info "\nPreparing to download new human reads database" + database_ch = set_up_human_db() + database_ch.view{"database path: ${it}"} +} + +else { + log.info "\nAccessing previous human reads database" + database_ch = Channel.value(params.human_db_path) + database_ch.view{"database path: ${it}"} +} + +// Process reads based on whether they are single-end or paired-end +if(params.single_end == true) { + log.info "\nReading Single-end data from ${params.reads_dir}\n" + + // Specified reads handling (mimics Channel.fromFilePairs() method's output, but with SE) + if (params.specify_reads) { + reads_ch = Channel + .fromPath("${params.sample_id_list}") + .splitText() + .map { it.trim() } + .map { sample_id -> + def files = file("${params.reads_dir}${sample_id}${params.SE_reads_suffix}") + return 
[sample_id, files] + } + } + else { + reads_ch = Channel + .fromPath("${params.reads_dir}/*${params.SE_reads_suffix}", checkIfExists: true) + .map { readfile -> + def sampleId = readfile.name.replaceAll("${params.SE_reads_suffix}\$", "") + return tuple(sampleId, readfile) + } + } + reads_ch.view{"reads: ${it}"} + output_ch = SE_kraken2(database_ch, reads_ch) +} +else { + log.info "\nReading Paired-end data from ${params.reads_dir}\n" + + // Specified reads handling (mimics Channel.fromFilePairs() method's output) + if (params.specify_reads) { + reads_ch = Channel + .fromPath("${params.sample_id_list}") + .splitText() + .map { it.trim() } + .map { sample_id -> + def files = file("${params.reads_dir}${sample_id}${params.PE_reads_suffix}").toList().sort() + return [sample_id, files] + } + } + else { + reads_ch = Channel.fromFilePairs(params.reads_dir + "*" + params.PE_reads_suffix, checkIfExists: true) + } + reads_ch.view{"reads: ${it}"} + output_ch = PE_kraken2(database_ch, reads_ch) +} + +// Calculate and log the final result +final_percent = output_ch[1] + .collect{(it.text[0..5]).toFloat()} + .average().trunc(2) + .view{"\nRESULT: ${it}% of input reads were unclassified, available in ${params.kraken_output_dir}/reads "} +} \ No newline at end of file diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config new file mode 100644 index 00000000..00fa6fa9 --- /dev/null +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config @@ -0,0 +1,29 @@ +params.DL_kraken = false + +params.single_end = false + +params.specify_reads = true + +params.sample_id_list = "/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt" + +params.reads_dir = "$projectDir/example-reads_PE/" + +params.PE_reads_suffix = "_R{1,2}.fastq.gz" +params.PE_reads_out_suffix = "_R#_raw_hrRemoved.fastq" + + +params.SE_reads_suffix = "_raw.fastq.gz" +params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" + +params.num_threads = 2 + + + +params.kraken_output_dir = "$projectDir/kraken2-outputs" +params.human_db_name = 'kraken2-human-db' +params.human_db_path = "$projectDir/${params.human_db_name}" + +docker { + enabled = true +} +params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' \ No newline at end of file diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R2.fastq.gz 
b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE 
zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-1_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-1_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-2_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-2_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git 
a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-3_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-3_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-4_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-4_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt new file mode 100644 index 00000000..1793c123 --- /dev/null +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt @@ -0,0 +1,3 @@ +Sample-1 +Sample-2 +Sample-3 \ No newline at end of file From 919d2d2fed01cc2d930de2be30d77965ecd0613f Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Thu, 4 Apr 2024 09:59:42 -0700 Subject: [PATCH 02/33] Update Remove_Human_reads.config --- .../Remove_Human_reads.config | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config index 00fa6fa9..5781bc20 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config @@ -1,29 +1,33 @@ -params.DL_kraken = false +//Variables to set: -params.single_end = false +params.DL_kraken = false //whether or not to download the human reads database as the first step -params.specify_reads = true +params.single_end = false // single-end reads (false if paired-end) -params.sample_id_list = "/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt" +params.specify_reads = true //if true, only process reads specified by the sample_id_list -params.reads_dir = "$projectDir/example-reads_PE/" +params.sample_id_list = 
"/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt" //list of sample IDs to proccess if specify_reads is true -params.PE_reads_suffix = "_R{1,2}.fastq.gz" -params.PE_reads_out_suffix = "_R#_raw_hrRemoved.fastq" +params.reads_dir = "$projectDir/example-reads_PE/" //directory to find sample reads +params.PE_reads_suffix = "_R{1,2}.fastq.gz" //raw read suffixes (region following the unique part of the sample names) + //e.g. for "Sample-1_R1/2_raw.fastq.gz" would be "_R1_raw.fastq.gz" -params.SE_reads_suffix = "_raw.fastq.gz" -params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" +params.PE_reads_out_suffix = "_R#_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files -params.num_threads = 2 + +params.SE_reads_suffix = "_raw.fastq.gz" //if single-end, set this. raw read suffixes which follow the unique part of sample name + +params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files -params.kraken_output_dir = "$projectDir/kraken2-outputs" -params.human_db_name = 'kraken2-human-db' -params.human_db_path = "$projectDir/${params.human_db_name}" -docker { - enabled = true -} -params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' \ No newline at end of file +//Only change if desired: + +params.num_threads = 2 +params.kraken_output_dir = "$projectDir/kraken2-outputs" //location to output files, relative to wd or full path +params.human_db_name = 'kraken2-human-db' // +params.human_db_path = "$projectDir/${params.human_db_name}" +docker {enabled = true} +params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' From 4259bc61ee7cd7509c3ca18645b7ead5fa51f6f7 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:25:03 -0700 Subject: [PATCH 03/33] Create README.md would appreciate input on this! --- .../workflow_code/README.md | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md new file mode 100644 index 00000000..c34d76da --- /dev/null +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md @@ -0,0 +1,91 @@ +# NF_MGRemoveHumanReads-B Workflow Information and Usage Instructions + + +## General workflow info +The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (SW_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. 
The workflow can be used even if you are unfamiliar with Nextflow and conda, but if you want to learn more about those, [Nextflow's documentation](https://www.nextflow.io/docs/latest/index.html) is a good place to start, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro).
+
+## Utilizing the workflow
+
+1. [Install conda and NextFlow](#1-install-conda-and-nextflow)
+2. [Download the workflow template files](#2-download-the-workflow-template-files)
+3. [Modify the variables in the Remove_Human_reads.config file](#3-modify-the-variables-in-the-remove_human_readsconfig-file)
+4. [Run the workflow](#4-run-the-workflow)
+
+### 1. Install conda and NextFlow
+We recommend installing Miniconda, with a Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda).
+
+Once conda is installed, you can install NextFlow into your specified directory using the following code:
+
+```bash
+curl -s https://get.nextflow.io | bash
+
+sudo mv nextflow /usr/local/bin
+```
+
+### 2. Download the workflow template files
+All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest NF_MGRemoveHumanReads-B version onto your system, run the following command:
+
+```bash
+GL-get-workflow MG-remove-human-reads-B
+```
+
+This downloads the workflow into a directory called `NF_MGRemoveHumanReads-*/`, with the workflow version number at the end.
+
+> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so:
+> ```bash
+> GL-get-workflow MG-remove-human-reads-B --wanted-version 1.0.0
+> ```
+
+### 3. Modify the variables in the Remove_Human_reads.config file
+Once you've downloaded the workflow template, you can modify the variables in the [Remove_Human_reads.config](workflow_code/Remove_Human_reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique_sample_ids.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be set up automatically). Additionally, if necessary, you'll need to modify each variable in the config file to be consistent with the study you want to process and the machine you're using.
+
+> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure).
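+
+If you have many samples, the sample ID file can also be generated directly from the read file names instead of being written by hand. Below is a minimal sketch of one way to do this; it assumes paired-end files ending in the default `_R1.fastq.gz`/`_R2.fastq.gz` suffixes, so adjust the `sed` pattern if your `PE_reads_suffix` differs. A fully worked manual example follows.
+
+```bash
+# list the read files, strip the R1/R2 suffix, and keep one entry per sample
+ls ../Raw_Sequence_Data/*.fastq.gz | xargs -n 1 basename | \
+    sed 's/_R[12].fastq.gz//' | sort -u > unique_sample_ids.txt
+```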
+
+**Example for how to create a single-column list of unique sample identifiers from your raw data file names**
+
+For example, if you only want to process a subset of the read files within the reads directory and have paired-end read data for 2 samples located in `../Raw_Sequence_Data/` relative to your workflow directory, that would look like this:
+
+```bash
+ls ../Raw_Sequence_Data/
+```
+
+```
+Sample-1_R1.fastq.gz
+Sample-1_R2.fastq.gz
+Sample-2_R1.fastq.gz
+Sample-2_R2.fastq.gz
+```
+
+You would set up your `unique_sample_ids.txt` file as follows:
+
+```bash
+cat unique_sample_ids.txt
+```
+
+```
+Sample-1
+Sample-2
+```
+
+### 4. Run the workflow
+
+While in the directory holding the NextFlow file, .config file, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow:
+
+```bash
+nextflow run Remove_Human_Reads.nf -c Remove_Human_reads.config -ansi-log false
+```
+
+* `-c Remove_Human_reads.config` – specifies the configuration file holding the workflow's settings (Nextflow only auto-loads a file named `nextflow.config`)
+* `-ansi-log false` – turns off the dynamic ANSI console output so each process execution is printed to the screen on its own line
+* `-resume` – continues to run the workflow using cached data from the previous run
+
+
+See `nextflow -h` and [NextFlow's documentation](https://www.nextflow.io/docs/latest/index.html) for more options and details.
+
+A quick example can be run with the files included in the [workflow_code](workflow_code) directory after specifying a location for the reference database in the [Remove_Human_reads.config](workflow_code/Remove_Human_reads.config) file.
+
+---
+
+## Reference database info
+The database we use was built with kraken2 v2.1.1 and can be downloaded to run with this workflow (it's ~4.3 GB uncompressed); setting `params.DL_kraken = true` in the config will download and unpack it automatically as the first step.
+
+---
From 1e3a7b35c7321e787c4a8f39f9a0931441acb934 Mon Sep 17 00:00:00 2001
From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com>
Date: Thu, 4 Apr 2024 13:26:13 -0700
Subject: [PATCH 04/33] moved
---
 .../{ => workflow_code}/Remove_Human_Reads.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/{ => workflow_code}/Remove_Human_Reads.nf (96%)

diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_Reads.nf
similarity index 96%
rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf
rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_Reads.nf
index b805e5ba..b3f0b381 100644
--- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_Reads.nf
+++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_Reads.nf
@@ -158,4 +158,4 @@ final_percent = output_ch[1]
 .collect{(it.text[0..5]).toFloat()}
 .average().trunc(2)
 .view{"\nRESULT: ${it}% of input reads were unclassified, available in ${params.kraken_output_dir}/reads "}
-}
\ No newline at end of file
+}
From 4deeeae05e306c8f60b17284f52606082d8f7655 Mon Sep 17 00:00:00 2001
From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com>
Date: Thu, 4 Apr 2024 13:26:44 -0700
Subject: [PATCH 05/33] moved
---
 .../{ => workflow_code}/Remove_Human_reads.config | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename
Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/{ => workflow_code}/Remove_Human_reads.config (100%) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_reads.config similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/Remove_Human_reads.config rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_reads.config From 9acfc4b0bdcdd8c04ceed68abede5d1924b7fd17 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:27:56 -0700 Subject: [PATCH 06/33] moved --- .../{ => workflow_code}/unique_sample_ids.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/{ => workflow_code}/unique_sample_ids.txt (66%) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/unique_sample_ids.txt similarity index 66% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/unique_sample_ids.txt index 1793c123..5537ff9f 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/unique_sample_ids.txt +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/unique_sample_ids.txt @@ -1,3 +1,3 @@ Sample-1 Sample-2 -Sample-3 \ No newline at end of file +Sample-3 From 63829378b3df75e587a837cdbc8d51e287ac90bd Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:57:26 -0700 Subject: [PATCH 07/33] Create Estimate_Host_Reads.nf --- .../Estimate_Host_Reads.nf | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf new file mode 100644 index 00000000..4606b3eb --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf @@ -0,0 +1,124 @@ +// Initial logging of the workflow's parameters for tracking and debug purposes +log.info """\ + ESTIMATE HOST READS + =================================== + Download DB: ${params.DL_kraken} + Single end reads: ${params.single_end} + projectDir: ${projectDir}""" + .stripIndent() + + +// Process for paired-end reads using Kraken2 +process PE_kraken2 { + container params.kraken2container + tag "$sample_id" + publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}" + + input: + path database + tuple val(sample_id), path(reads_ch) + + + output: + path 
"${sample_id}-kraken2-output.txt" + path "${sample_id}-kraken2-report.tsv" + + script: + """ + kraken2 --db $database --gzip-compressed \ + --threads 2 --use-names --paired \ + --output ${sample_id}-kraken2-output.txt \ + --report ${sample_id}-kraken2-report.tsv \ + ${reads_ch[0]} ${reads_ch[1]} + + """ +} + +// Process for single-end reads using Kraken2 +process SE_kraken2 { + + container params.kraken2container + tag "$sample_id" + publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}" + + input: + path database + tuple val(sample_id), path(reads_ch) + + output: + path "${sample_id}-kraken2-output.txt" + path "${sample_id}-kraken2-report.tsv" + path "${sample_id}${params.SE_reads_out_suffix}.gz" + + script: + """ + kraken2 --db $database --gzip-compressed --threads 2 --use-names \ + --output ${sample_id}-kraken2-output.txt \ + --report ${sample_id}-kraken2-report.tsv \ + ${reads_ch[0]} + + """ +} + + + +workflow { + + + // Log the database path being used + log.info "\nAccessing previous host reads database" + database_ch = Channel.value(params.host_db_path) + database_ch.view{"database path: ${it}"} + + // Conditional execution for single-end or paired-end data + if(params.single_end == true) { + log.info "\nReading Single-end data from ${params.reads_dir}\n" + + if (params.specify_reads) { + reads_ch = Channel + .fromPath("${params.sample_id_list}") + .splitText() + .map { it.trim() } + .map { sample_id -> + def files = file("${params.reads_dir}${sample_id}${params.SE_reads_suffix}") + return [sample_id, files] + } + } + else { + reads_ch = Channel + .fromPath("${params.reads_dir}/*${params.SE_reads_suffix}", checkIfExists: true) + .map { readfile -> + def sampleId = readfile.name.replaceAll("${params.SE_reads_suffix}\$", "") + return tuple(sampleId, readfile) + } + } + reads_ch.view{"reads: ${it}"} + output_ch = SE_kraken2(database_ch, reads_ch) + + } + else { + log.info "\nReading Paired-end data from ${params.reads_dir}\n" + // Load specific reads if specified + if (params.specify_reads) { + reads_ch = Channel + .fromPath("${params.sample_id_list}") + .splitText() + .map { it.trim() } + .map { sample_id -> + def files = file("${params.reads_dir}${sample_id}${params.PE_reads_suffix}").toList().sort() + return [sample_id, files] + } + } + else { + reads_ch = Channel.fromFilePairs(params.reads_dir + "*" + params.PE_reads_suffix, checkIfExists: true) + } + reads_ch.view{"reads: ${it}"} + output_ch = PE_kraken2(database_ch, reads_ch) + } + // Calculate and log the final percentage of unclassified reads + final_percent = output_ch[1] + .collect{(it.text[0..5]).toFloat()} + .average().trunc(2) + .view{"\nRESULT: ${it}% of input reads were unclassified, available in ${params.kraken_output_dir}/reads "} + +} From f33b5a029d9f37e74a069a2e33826542b02dffb9 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:03:08 -0700 Subject: [PATCH 08/33] Add files via upload --- .../Estimate_Host_reads.config | 24 +++++++++ .../reference-database-info.md | 53 +++++++++++++++++++ .../SW_MGEstHostReads-B/unique_sample_ids.txt | 3 ++ 3 files changed, 80 insertions(+) create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md create mode 100644 
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config
new file mode 100644
index 00000000..89d028fb
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config
@@ -0,0 +1,24 @@
+params.single_end = false
+
+params.specify_reads = true
+
+params.sample_id_list = "/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt"
+
+params.reads_dir = "$projectDir/example-reads_PE/"
+
+params.PE_reads_suffix = "_R{1,2}.fastq.gz"
+
+
+params.SE_reads_suffix = "_raw.fastq.gz"
+
+params.host_db_name = 'kraken2-host-db'
+params.host_db_path = "$projectDir/${params.host_db_name}"
+
+params.num_threads = 2
+
+
+
+params.kraken_output_dir = "$projectDir/kraken2-outputs"
+
+docker {enabled = true}
+params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0'
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md
new file mode 100644
index 00000000..2cff1e92
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md
@@ -0,0 +1,53 @@
+# Reference database info
+The database used will depend on the host. The ones that have been created thus far are detailed and available below.
+
+
+## Mouse ([GRCm39 | GCF_000001635.27](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.27)) database build
+This database was built with kraken2 v2.1.1 on 26-Jan-2022.
+
+**Download NCBI taxonomy info needed (takes ~10 minutes):**
+
+```bash
+kraken2-build --download-taxonomy --db kraken2-mouse-db/
+```
+
+**Downloading mouse reference genome:**
+
+```bash
+curl -LO https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz
+
+gunzip GCF_000001635.27_GRCm39_genomic.fna.gz
+```
+
+
+**Adding mouse fasta to database:**
+
+```bash
+kraken2-build --add-to-library GCF_000001635.27_GRCm39_genomic.fna --no-masking --db kraken2-mouse-db/
+```
+
+**Build the database (takes ~20 minutes as run here):**
+
+```bash
+kraken2-build --build --db kraken2-mouse-db/ --threads 30 --no-masking
+```
+
+**Remove intermediate files:**
+
+```bash
+kraken2-build --clean --db kraken2-mouse-db/
+```
+
+### Download mouse kraken2 db
+
+
+The reference database is ~2.6GB compressed and ~3.8GB uncompressed.
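+
+Before downloading, it may be worth confirming you have enough free space in the target directory (a quick check, assuming standard coreutils):
+
+```bash
+# show available space on the filesystem holding the current directory
+df -h .
+```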
It can be downloaded and unpacked with the following: + +```bash +curl -L -o kraken2-mouse-db.tar.gz https://figshare.com/ndownloader/files/33900572 + +tar -xzvf kraken2-mouse-db.tar.gz +``` + +--- + diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt new file mode 100644 index 00000000..1793c123 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt @@ -0,0 +1,3 @@ +Sample-1 +Sample-2 +Sample-3 \ No newline at end of file From cc40dd0ee49d73454d1b907fe0bc577fe7da6c0e Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:16:00 -0700 Subject: [PATCH 09/33] Rename Estimate_Host_Reads.nf to Estimate_Host_Reads.nf --- .../Estimate_Host_Reads.nf | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/{SW_MGEstHostReads-B => NF_MGEstHostReads-B}/Estimate_Host_Reads.nf (100%) diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.nf similarity index 100% rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_Reads.nf rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.nf From 9e0f585a8e66a665ea1d47700bbda871f19aaffc Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:17:38 -0700 Subject: [PATCH 10/33] Update and rename Estimate_Host_reads.config to Estimate_Host_Reads.config --- .../Estimate_Host_Reads.config} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/{SW_MGEstHostReads-B/Estimate_Host_reads.config => NF_MGEstHostReads-B/Estimate_Host_Reads.config} (92%) diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config similarity index 92% rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config index 89d028fb..c5038298 100644 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/Estimate_Host_reads.config +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config @@ -21,4 +21,4 @@ params.num_threads = 2 params.kraken_output_dir = "$projectDir/kraken2-outputs" docker {enabled = true} -params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' \ No newline at end of file +params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' From dd3e06baac5b7137a0cff4d88c3c44b754f08bb7 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:18:09 -0700 Subject: [PATCH 11/33] Rename 
reference-database-info.md to reference-database-info.md
---
 .../reference-database-info.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/{SW_MGEstHostReads-B => NF_MGEstHostReads-B}/reference-database-info.md (100%)

diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md
similarity index 100%
rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/reference-database-info.md
rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md
From 44b755dad356f086f06bd2fbcee964253768f389 Mon Sep 17 00:00:00 2001
From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com>
Date: Tue, 16 Apr 2024 12:18:33 -0700
Subject: [PATCH 12/33] Rename unique_sample_ids.txt to unique_sample_ids.txt
---
 .../unique_sample_ids.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/{SW_MGEstHostReads-B => NF_MGEstHostReads-B}/unique_sample_ids.txt (66%)

diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt
similarity index 66%
rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt
rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt
index 1793c123..5537ff9f 100644
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads-B/unique_sample_ids.txt
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt
@@ -1,3 +1,3 @@
 Sample-1
 Sample-2
-Sample-3
\ No newline at end of file
+Sample-3
From 89ed0f4462abb56a0b852f609f9746885cd7aca5 Mon Sep 17 00:00:00 2001
From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com>
Date: Tue, 16 Apr 2024 12:35:15 -0700
Subject: [PATCH 13/33] Create README.md
---
 .../NF_MGEstHostReads-B/README.md | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md

diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md
new file mode 100644
index 00000000..bd765ab7
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md
@@ -0,0 +1,70 @@
+# Estimate Host Reads Workflow Information and Usage Instructions
+
+## General Workflow Information
+The workflow for estimating host DNA in Illumina metagenomics sequencing data (NF_MGEstHostReads-B) is implemented using Nextflow. It is intended to be run on any Unix-based system and uses Docker containers to ensure consistency and reproducibility of the computational environment.
+
+## Utilizing the Workflow
+
+1. **Install Docker**
+2. **Download the workflow template files**
+3. **Modify the variables in the Estimate_Host_Reads.config file**
+4. **Run the workflow**
+
+### 1. Install Docker
+We recommend installing Docker to handle all dependencies within containers. This simplifies the setup on any system and avoids compatibility issues:
+```bash
+# Install Docker following the official guidelines at:
+# https://docs.docker.com/get-docker/
+```
+
+### 2. Download the Workflow Template Files
+Clone the repository or download the workflow files from the designated repository or link. Ensure you have all required files, including the Nextflow script (.nf) and the associated configuration files.
+
+### 3. Modify the Variables in the Estimate_Host_Reads.config File
+Adjust the variables in the `Estimate_Host_Reads.config` file to match your specific needs. This includes paths to input data, the Docker container, and output directories.
+
+Once you've downloaded the workflow template, you can modify the variables in the [Estimate_Host_Reads.config](workflow_code/Estimate_Host_Reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique_sample_ids.txt)). You will also need to indicate the path to your input data (raw reads) and the directory where the kraken2 reference database is stored (see [reference-database-info.md](reference-database-info.md) for how to download or build one). Additionally, if necessary, you'll need to modify each variable in the config file to be consistent with the study you want to process and the machine you're using.
+
+> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure).
+
+**Example for how to create a single-column list of unique sample identifiers from your raw data file names**
+
+For example, if you only want to process a subset of the read files within the reads directory and have paired-end read data for 2 samples located in `../Raw_Sequence_Data/` relative to your workflow directory, that would look like this:
+
+```bash
+ls ../Raw_Sequence_Data/
+```
+
+```
+Sample-1_R1.fastq.gz
+Sample-1_R2.fastq.gz
+Sample-2_R1.fastq.gz
+Sample-2_R2.fastq.gz
+```
+
+You would set up your `unique_sample_ids.txt` file as follows:
+
+```bash
+cat unique_sample_ids.txt
+```
+
+```
+Sample-1
+Sample-2
+```
+
+### 4. Run the Workflow
+Navigate to the directory containing the Nextflow script and config file. Here is an example command to run the workflow:
+```bash
+nextflow run Estimate_Host_Reads.nf -c Estimate_Host_Reads.config
+```
+- `-c Estimate_Host_Reads.config` specifies the configuration file to use; Docker is enabled there (`docker {enabled = true}`), so the kraken2 container will be used to run the tools.
+
+See `nextflow run -help` and [Nextflow's documentation](https://www.nextflow.io/docs/latest/index.html) for more options and detailed information.
+
+## Reference Database Information
+The database used will depend on the host organism. Build commands and download links for the databases created so far (currently mouse) are provided in [reference-database-info.md](reference-database-info.md).
+
+---
+
+This workflow is designed for flexibility and can be adapted for various datasets and research needs. For any additional information or support, refer to the official Nextflow documentation or contact the support team.
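+
+Once a run completes, the per-sample estimate can also be pulled straight from the kraken2 reports. The snippet below is a minimal sketch assuming the default `kraken2-outputs` directory and the standard kraken2 report format, in which the first line holds the percentage of unclassified (here, non-host) reads in column 1; this mirrors the percentage the workflow itself logs at the end of a run.
+
+```bash
+# print each sample's percentage of reads that did NOT match the host database
+for report in kraken2-outputs/*-kraken2-report.tsv; do
+    printf "%s\t%s%%\n" "$(basename "$report" -kraken2-report.tsv)" "$(head -n 1 "$report" | cut -f 1 | tr -d ' ')"
+done
+```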
From 96e55ba86fd6b184e7b12e10ab3fa8b6f2ca5fcd Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:35:39 -0700 Subject: [PATCH 14/33] Update and rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config --- .../{ => workflow_code}/Estimate_Host_Reads.config | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/{ => workflow_code}/Estimate_Host_Reads.config (100%) diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config similarity index 100% rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.config rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config From eb877e6e69cb4ffca13c1f12f6301630a472aec4 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:35:53 -0700 Subject: [PATCH 15/33] Rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.nf to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf --- .../{ => workflow_code}/Estimate_Host_Reads.nf | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/{ => workflow_code}/Estimate_Host_Reads.nf (100%) diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf similarity index 100% rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/Estimate_Host_Reads.nf rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf From 444663b6a0214d2448913780cbdb7b7865f40997 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:39:48 -0700 Subject: [PATCH 16/33] Add files via upload --- .../example-reads_PE/Sample-1_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-1_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-2_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-2_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-3_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-3_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-4_R1.fastq.gz | Bin 0 -> 361 bytes .../example-reads_PE/Sample-4_R2.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-1_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-2_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-3_raw.fastq.gz | Bin 0 -> 361 bytes .../example-reads_SE/Sample-4_raw.fastq.gz | Bin 0 -> 361 bytes 12 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 
Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R2.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R1.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R2.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R1.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R2.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R1.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R2.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-1_raw.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-2_raw.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-3_raw.fastq.gz create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-4_raw.fastq.gz diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R2.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R1.fastq.gz 
b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R2.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R1.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R2.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R1.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 
zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R2.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-1_raw.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-1_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-2_raw.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-2_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-3_raw.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-3_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-4_raw.fastq.gz 
b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-4_raw.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..eb20c0c9ca408806a7bc7e0d50f860a385a80b3d GIT binary patch literal 361 zcmV-v0hazBiwFq7X2f3r19M?*aBO8QK`v%-0L@a%Zi7G!yz>=%Z;Jv-irP~ww{lPE zp+5A65TXG!1VqyRug6Bc^ap|vyZAAlnce+?>&>RvmStWfo=(HuH^(l`{qXQvRNMR1 zPUC#)u3b8I=V|ORAyIosSy+{sRLHg@gai?jg)J#$HMX&|qKB=(9`95H?FVh*F~$o( zfD=cM$h$u(%zx zDx@&eN2s}>@DE|P(OP33VX%Fn5MHyL#LZLOxBWENU&}Ps{m8Z&?cc7SB$ zY`Sf1_WB|GsW0rOud%Sp!-WXOt`+wz${a38%8Dz0;GS_rTv86f?ufwDo2U2-vj4>7 HR0IG3(U7%n literal 0 HcmV?d00001 From 0aae6d57475fb47efbfe6860cd818c2b54ab6a87 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:38:37 -0700 Subject: [PATCH 17/33] Update and rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config --- .../workflow_code/{ => config}/Estimate_Host_Reads.config | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/{ => config}/Estimate_Host_Reads.config (100%) diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config similarity index 100% rename from Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.config rename to Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config From 86e371df3fae0bf5776a7fadddc7bdb401b7c8d1 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:39:01 -0700 Subject: [PATCH 18/33] Add files via upload --- .../workflow_code/config/checks.py | 1537 +++++++++++++++++ .../workflow_code/config/config.yaml | 150 ++ .../workflow_code/config/protocol.py | 997 +++++++++++ .../workflow_code/config/schemas.py | 62 + 4 files changed, 2746 insertions(+) create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py new file mode 100644 index 00000000..885d2160 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py @@ -0,0 +1,1537 @@ +from collections import defaultdict +import copy +import enum +import gzip +import 
itertools +import logging +import math +from pathlib import Path +from statistics import mean +import string +import subprocess +from typing import Callable, Dict, Union +from importlib.metadata import files + +import pandas as pd + +from dp_tools.core.entity_model import Dataset, Sample, multiqc_run_to_dataframes + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers + + +def r_style_make_names(s: str) -> str: + """Recreates R's make.names function for individual strings. + This function is often used to create syntactically valid names in R which are then saved in R outputs. + Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names + + Args: + s (str): A string to convert + + Returns: + str: A string converted in the same way as R's make.names function + """ + EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ" # Note: there are two "μμ" like characters one is greek letter mu, the other is the micro sign + VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS + REPLACEMENT_CHAR = "." + new_string_chars = list() + for char in s: + if char in VALID_CHARACTERS: + new_string_chars.append(char) + else: + new_string_chars.append(REPLACEMENT_CHAR) + return "".join(new_string_chars) + + +# adapted from reference: https://stackoverflow.com/questions/56048627/round-floats-in-a-nested-dictionary-recursively +# used to round values for easier to read messages +def formatfloat(x): + return "%.3g" % float(x) + + +def pformat(original_dictionary, function): + dictionary = copy.deepcopy( + original_dictionary + ) # we don't want to override original values + if isinstance(dictionary, dict): + new_dict = dict() + for k, v in dictionary.items(): + new_dict[k] = function(v) if isinstance(v, float) else pformat(v, function) + return new_dict + return dictionary + + +def convert_nan_to_zero(input: Dict[str, Union[float, int]]) -> Dict: + """Convert any Nan into zero""" + output = dict() + for key, value in input.items(): + output[key] = value if not math.isnan(value) else 0 + return output + + +## Functions that use the following syntax to merge values from general stats: +# "stat1 + stat2" should search and sum the stats +# TODO: refine dict typehint +def stat_string_to_value(stat_string: str, mqcData: dict) -> float: + """ "stat1 + stat2" should search and sum the stats""" + sum = float(0) + direct_keys = stat_string.split(" + ") + for direct_key in direct_keys: + print(direct_key) + sum += mqcData[direct_key] + return float(sum) + + +## Dataframe and Series specific helper functions +def nonNull(df: pd.DataFrame) -> bool: + # negation since it checks if any are null + return ~df.isnull().any(axis=None) + + +def nonNegative(df: pd.DataFrame) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df >= 0) | (df.isnull())).all(axis=None) + + +def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df.isin(allowed_values)) | (df.isnull())).all(axis=None) + + +def check_forward_and_reverse_reads_counts_match( + sample: Sample, reads_key_1: str, reads_key_2: str +) -> FlagEntry: + # data specific preprocess + count_fwd_reads = float( + sample.compile_multiqc_data([reads_key_1])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + count_rev_reads = float( + 
sample.compile_multiqc_data([reads_key_2])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + + # check logic + if count_fwd_reads == count_rev_reads: + code = FlagCode.GREEN + message = ( + f"Forward and reverse read counts match at " + f"{int(count_rev_reads)} sequences " + ) + else: + code = FlagCode.HALT + message = ( + f"Forward and reverse read counts do not " + f"match: forward_Count:{int(count_fwd_reads)}, " + f"reverse_Count:{int(count_rev_reads)}" + ) + + return {"code": code, "message": message} + + +def check_file_exists(file: Path) -> FlagEntry: + # check logic + if file.is_file(): + code = FlagCode.GREEN + message = f"File exists: {file.name} " + else: + code = FlagCode.HALT + message = f"Missing file: {file.name} expected at {str(file)} " + + return {"code": code, "message": message} + + +def check_fastqgz_file_contents(file: Path, count_lines_to_check: int) -> FlagEntry: + """Check fastqgz by: + 1. Decompressing as a stream of lines. + 2. Affirming expected headers (every 4th line) look correct. + + :param file: Input fastqGZ file path + :type file: Path + :param count_lines_to_check: Maximum number of lines to check. Setting this to a negative value will remove the limit + :type count_lines_to_check: int + :return: A required fields-only flag entry dictionary + :rtype: FlagEntry + """ + + lines_with_issues: list[int] = list() + + # check logic + # truncated files raise EOFError + # catch this and flag as HALT + try: + with gzip.open(file, "rb") as f: + for i, byte_line in enumerate(f): + # stop once the configured line-check limit is reached + if i + 1 == count_lines_to_check: + log.debug( + f"Reached {count_lines_to_check} lines, ending line check" + ) + break + + line = byte_line.decode() + # every fourth line should be an identifier + expected_identifier_line = i % 4 == 0 + # check if line is actually an identifier line + if expected_identifier_line and line[0] != "@": + lines_with_issues.append(i + 1) + # update every 2,000,000 reads + if i % 2_000_000 == 0: + log.debug(f"Checked {i} lines for {file}") + + if lines_with_issues: + code = FlagCode.HALT + message = ( + f"The following decompressed fastqGZ lines have issues: {lines_with_issues}" + ) + else: + code = FlagCode.GREEN + message = f"First {count_lines_to_check} lines checked found no issues. This means header lines were identifiable and no decompression errors occurred." + except (EOFError, gzip.BadGzipFile): + code = FlagCode.HALT + message = ( + f"Error during decompression, likely a compression or truncation issue."
+ ) + + return {"code": code, "message": message} + +def check_gzip_file_integrity(file: Path, gzip_bin: Path = Path("gzip")) -> FlagEntry: + """ Check gzip file integrity using 'gzip -t' as per https://www.gnu.org/software/gzip/manual/gzip.html """ + # gzip -t reports corruption via a non-zero exit status and stderr, not stdout + output = subprocess.run( + [str(gzip_bin), "-t", str(file)], capture_output=True + ) + if output.returncode == 0: + code = FlagCode.GREEN + message = f"Gzip integrity test raised no issues" + else: + code = FlagCode.HALT + message = ( + f"Gzip integrity test failed on this file with output: {output.stderr.decode()}" + ) + return {"code": code, "message": message} + +def check_bam_file_integrity( + file: Path, samtools_bin: Path = Path("samtools") +) -> FlagEntry: + """Uses http://www.htslib.org/doc/samtools-quickcheck.html""" + # data specific preprocess + + # check logic + output = subprocess.run( + [str(samtools_bin), "quickcheck", "-v", str(file)], capture_output=True + ) + stdout_string = output.stdout.decode() + if stdout_string == "": + code = FlagCode.GREEN + message = f"Samtools quickcheck raised no issues" + else: + code = FlagCode.HALT + message = ( + f"Samtools quickcheck failed on this file with output: {stdout_string}" + ) + return {"code": code, "message": message} + + +def check_thresholds( + multiqc_inputs: list[Path], mqc_key: str, stat_string: str, thresholds: list[dict] +) -> FlagEntry: + # data specific preprocess + data = multiqc_run_to_dataframes(multiqc_inputs) + value = stat_string_to_value(stat_string, data["general_stats"][mqc_key]) + + # check logic + # Assuming GREEN unless reassigned + code = FlagCode.GREEN + for threshold in thresholds: + match threshold["type"]: + case "lower": + if value < threshold["value"]: + code = ( + FlagCode[threshold["code"]] + if code < FlagCode[threshold["code"]] + else code + ) + + if code == FlagCode.GREEN: + message = f"Value: ({value}) did not breach any configured thresholds" + else: + message = f"Value: ({value}) breached configured thresholds" + return {"code": code, "message": message} + + +def check_metadata_attributes_exist( + dataset: Dataset, expected_attrs: list[str] +) -> FlagEntry: + missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata)) + + # check if any expected metadata fields are missing + # check logic + if not missing_metadata_fields: + code = FlagCode.GREEN + message = f"All expected metadata keys found: Expected {expected_attrs}, Found {set(dataset.metadata)}" + else: + code = FlagCode.HALT + message = f"Missing dataset metadata (source from Runsheet): {missing_metadata_fields}" + return {"code": code, "message": message} + + +def check_for_outliers( + dataset: Dataset, + data_asset_keys: list[str], + mqc_module: str, + mqc_plot: str, + mqc_keys: list[str], + thresholds: list[dict], +) -> FlagEntryWithOutliers: + # assume code is GREEN until outliers detected + code = FlagCode.GREEN + # dataframe extraction + compiled_mqc_data = dataset.compile_multiqc_data(data_asset_keys=data_asset_keys) + + if mqc_plot == "general_stats": + df = compiled_mqc_data["general_stats"][mqc_module] + else: + df = compiled_mqc_data["plots"][mqc_module][mqc_plot] + + def default_to_regular(d): + if isinstance(d, defaultdict): + d = {k: default_to_regular(v) for k, v in d.items()} + return d + + # track for outliers + outliers: dict[str, dict[str, dict[str, str]]] = defaultdict( + lambda: defaultdict(dict) + ) + + # override if mqc_keys is a special value + if mqc_keys == ["_ALL"]: + mqc_keys = df.columns + + for mqc_key in mqc_keys: +
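# For each metric, measure every sample's deviation from the chosen middle (mean or median) in units of standard deviation; any sample whose absolute deviation exceeds a threshold's stdev_threshold is recorded as an outlier and the flag code is escalated. +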
for threshold in thresholds: + if threshold["middle_fcn"] == "mean": + middle = df[mqc_key].mean() + elif threshold["middle_fcn"] == "median": + middle = df[mqc_key].median() + else: + raise ValueError( + f"Cannot compute middle from supplied middle_fcn name: {threshold['middle_fcn']}. Must supply either 'median' or 'mean'" + ) + + # bail if standard deviation == 0 + # e.g. if all values are identical (and thus has no outliers) + if df[mqc_key].std() == 0: + continue + + # compute difference + df_diffs = df[mqc_key] - middle + + # compute as number of standard deviations + df_diffs_in_std = df_diffs / df[mqc_key].std() + + # add to outlier tracker if over the threshold + for key, value in df_diffs_in_std.iteritems(): + # if an outlier + if abs(value) > threshold["stdev_threshold"]: + # track it + outliers[key][mqc_module][mqc_key] = value + # elevate code if current code is lower severity + if code < FlagCode[threshold["code"]]: + code = FlagCode[threshold["code"]] + + # convert defaultdict to regular for all reporting + outliers = default_to_regular(outliers) + # check logic + if code == FlagCode.GREEN: + message = f"No outliers found for {mqc_keys} in {mqc_plot} part of {mqc_module} multiQC module" + else: + message = ( + f"Outliers found in {mqc_module} multiQC module as follows: {outliers}" + ) + return {"code": code, "message": message, "outliers": outliers} + + +def _check_expected_files_exist( + input_dir: Path, expected_extensions: list[str], parent_dir_is_filename: bool = True +): + if parent_dir_is_filename: + fname = input_dir.name + expected_files = [input_dir / f"{fname}{ext}" for ext in expected_extensions] + missing_files = list() + for expected_file in expected_files: + if not expected_file.is_file(): + missing_files.append(str(expected_file)) + + expected_file_str = [str(f) for f in expected_files] + return missing_files, expected_file_str + + +def check_genebody_coverage_output(input_dir: Path): + EXPECTED_EXTENSIONS = [ + ".geneBodyCoverage.r", + ".geneBodyCoverage.txt", + ".geneBodyCoverage.curves.pdf", + ] + + missing_files, expected_file_str = _check_expected_files_exist( + input_dir, expected_extensions=EXPECTED_EXTENSIONS + ) + + if not missing_files: + code = FlagCode.GREEN + message = f"All output from geneBody coverage found: {expected_file_str}" + else: + code = FlagCode.HALT + message = f"Missing output from geneBody coverage: {missing_files}. Expected: {expected_file_str}" + return {"code": code, "message": message} + + +def check_inner_distance_output(input_dir: Path): + EXPECTED_EXTENSIONS = [ + ".inner_distance_plot.r", + ".inner_distance_freq.txt", + ".inner_distance.txt", + ".inner_distance_plot.pdf", + ] + + missing_files, expected_file_str = _check_expected_files_exist( + input_dir, expected_extensions=EXPECTED_EXTENSIONS + ) + + if not missing_files: + code = FlagCode.GREEN + message = f"All output from inner distance found: {expected_file_str}" + else: + code = FlagCode.HALT + message = f"Missing output from inner distance: {missing_files}. 
Expected: {expected_file_str}" + return {"code": code, "message": message} + + +def check_strandedness_assessable_from_infer_experiment( + dataset: Dataset, + stranded_assessment_range: dict[str, float], + unstranded_assessment_range: dict[str, float], + valid_dominant_strandedness_assessments: list[str], +) -> FlagEntry: + # data specific preprocess + def get_median_strandedness( + dataset: Dataset, + ) -> dict[str, float]: + + df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + median_strandedness = df.median().to_dict() + + return median_strandedness + + median_strandedness = get_median_strandedness(dataset) + + # check if dominant assessment is valid + strand_assessment: str = max( + median_strandedness, key=lambda k: median_strandedness[k] + ) + + # flag based on thresholds + assessment_value: float = median_strandedness[strand_assessment] + + is_stranded: bool = ( + stranded_assessment_range["max"] + > assessment_value + > stranded_assessment_range["min"] + ) + is_unstranded: bool = ( + unstranded_assessment_range["max"] + > assessment_value + > unstranded_assessment_range["min"] + ) + + def determine_samples_outside_range( + dataset: Dataset, min: float, max: float + ) -> list[str]: + df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + return df.index[df[strand_assessment].between(min, max) == False].to_list() + + # Catalog and flag any samples outside of range + # flags based on samples that are out of the assessment range + samples_outside_range: list[str] + if is_stranded: + samples_outside_range = determine_samples_outside_range( + dataset, + stranded_assessment_range["min"], + stranded_assessment_range["max"], + ) + elif is_unstranded: + samples_outside_range = determine_samples_outside_range( + dataset, + unstranded_assessment_range["min"], + unstranded_assessment_range["max"], + ) + else: # this means that the strandedness is ambiguous + samples_outside_range = list() + + # check logic + if strand_assessment not in valid_dominant_strandedness_assessments: + code = FlagCode.HALT + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is invalid for processing. 
Valid assessments: {valid_dominant_strandedness_assessments}" + elif not samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.GREEN + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with no individual samples outside the assessment range" + elif samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.RED + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with samples outside the assessment range: {samples_outside_range}" + else: + code = FlagCode.HALT + message = ( + f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is ambiguous due to being inside range " + f"({stranded_assessment_range['min']}-{unstranded_assessment_range['max']})" + ) + + return {"code": code, "message": message} + + +def check_rsem_counts_and_unnormalized_tables_parity( + rsem_table_path: Path, deseq2_table_path: Path +) -> FlagEntry: + # data specific preprocess + df_rsem = pd.read_csv(rsem_table_path) + df_deseq2 = pd.read_csv(deseq2_table_path) + + # return halt flag if column labels not conserved + if not set(df_deseq2.columns) == set(df_rsem.columns): + unique_to_deseq2 = set(df_deseq2.columns) - set(df_rsem.columns) + unique_to_rsem = set(df_rsem.columns) - set(df_deseq2.columns) + return { + "code": FlagCode.HALT, + "message": f"Columns do not match: unique to rsem: {unique_to_rsem}. unique to deseq2: {unique_to_deseq2}.", + } + + # rearrange columns to the same order + df_deseq2 = df_deseq2[df_rsem.columns] + + # check logic + if df_deseq2.equals(df_rsem): + code = FlagCode.GREEN + message = f"Tables of unnormalized counts match." + else: + code = FlagCode.HALT + message = ( + f"Tables of unnormalized counts have same columns but values do not match." + ) + return {"code": code, "message": message} + + +def check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + STAR_COUNT_MODES = ["unstranded", "sense", "antisense"] + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # all columns must match with the same strand column + strand_assessment: str = None # type: ignore + samples_with_issues: dict[str, list[str]] = { + "Not in aggregate table": list(), + "Sample counts mismatch": list(), + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv( + path, sep="\t", names=STAR_COUNT_MODES, index_col=0 + ).filter( + regex="^(?!N_.*).*", axis="rows" + ) # filter out N_* entries + + # check if the values match for any of the count modes + # unstranded, sense, antisense + # for remaining samples, only check the match for the first count mode + # TODO: Fix rare false postive related to zero counts, in those cases the strand_assessment can be prematurely determined which causes other samples to be compared with an inappropriate assessment + for count_mode in STAR_COUNT_MODES: + # make sure to sort indicies + if df_agg[sample].sort_index().equals(df_samp[count_mode].sort_index()): + # assign strand assessment if first sample + if strand_assessment is None: + strand_assessment = count_mode + + if strand_assessment == count_mode: + # no issues found (i.e. 
counts match with a consistent count mode column), break out + break + else: # no break + samples_with_issues["Sample counts mismatch"].append(sample) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = ( + f"All samples accounted for and with matching counts " + f"between samplewise and aggregate table using strand assessment: '{strand_assessment}'" + ) + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # TODO: LOW PRIORITY, fix this typehint + samples_with_issues: dict[str, Union[list[str], list[tuple[str, list[str]]]]] = { + "Not in aggregate table": list(), # type: ignore + "Sample counts mismatch": list(), # type: ignore + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv(path, sep="\t", index_col=0) + + # check if values match + if geneID_with_mismatched_counts := ( + list(df_agg.loc[df_agg[sample] != df_samp["expected_count"]].index) + ): + samples_with_issues["Sample counts mismatch"].append( + (sample, geneID_with_mismatched_counts) + ) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = f"All samples accounted for and with matching counts between samplewise and aggregate table" + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def check_sample_table_against_runsheet( + runsheet: Path, sampleTable: Path, all_samples_required: bool +) -> FlagEntry: + """Check the sample table includes all samples as denoted in the runsheet. + + Args: + runsheet (Path): csv file used for processing, the index denotes all samples + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay. 
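Note: samples unique to the sample table always raise a HALT flag; samples unique to the runsheet raise a HALT flag only when all_samples_required is true.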
+ + Returns: + FlagEntry: A check result + """ + # data specific preprocess + df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index() + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + + extra_samples: dict[str, set[str]] = { + "unique_to_runsheet": set(df_rs.index) - set(df_sample.index), + "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index), + } + + # check logic + if any( + [ + (extra_samples["unique_to_runsheet"] and all_samples_required), + (extra_samples["unique_to_sampleTable"]), + ] + ): + code = FlagCode.HALT + message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}" + else: + code = FlagCode.GREEN + message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})" + return {"code": code, "message": message} + + +class GroupFormatting(enum.Enum): + r_make_names = enum.auto() + ampersand_join = enum.auto() + + +def utils_runsheet_to_expected_groups( + runsheet: Path, + formatting: GroupFormatting = GroupFormatting.ampersand_join, + limit_to_samples: list = None, + map_to_lists: bool = False, +) -> Union[dict[str, str], dict[str, list[str]]]: + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) + .filter(regex="^Factor Value\[.*\]") + .sort_index() + ) # using only Factor Value columns + + if limit_to_samples: + df_rs = df_rs.filter(items=limit_to_samples, axis="rows") + + match formatting: + case GroupFormatting.r_make_names: + expected_conditions_based_on_runsheet = ( + df_rs.apply(lambda x: "...".join(x), axis="columns") + .apply(r_style_make_names) # join factors with '...' + .to_dict() + ) # reformat entire group in the R style + case GroupFormatting.ampersand_join: + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: f"({' & '.join(x)})", axis="columns" + ).to_dict() + case _: + raise ValueError( + f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}" + ) + + # convert from {sample: group} dict + # to {group: [samples]} dict + if map_to_lists: + unique_groups = set(expected_conditions_based_on_runsheet.values()) + reformatted_dict: dict[str, list[str]] = dict() + for query_group in unique_groups: + reformatted_dict[query_group] = [ + sample + for sample, group in expected_conditions_based_on_runsheet.items() + if group == query_group + ] + expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict + + return expected_conditions_based_on_runsheet + + +def check_sample_table_for_correct_group_assignments( + runsheet: Path, sampleTable: Path +) -> FlagEntry: + """Check the sample table is assigned to the correct experimental group. + An experimental group is defined by the Factor Value columns found in the runsheet. 
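For example (hypothetical factor values), a sample with Factor Value entries 'Space Flight' and '7 days' is expected to appear in the sample table with condition 'Space.Flight...7.days', i.e. the factors joined with '...' and then passed through r_style_make_names.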
+ + Args: + runsheet (Path): csv file used for processing, includes metadata used for experimental group designation + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + + Returns: + FlagEntry: A check result + """ + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + # data specific preprocess + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric + .filter(regex="^Factor Value\[.*\]") + .loc[df_sample.index] # ensure only sampleTable groups are checked + .sort_index() + ) # using only Factor Value columns + + # TODO: refactor with utils_runsheet_to_expected_groups + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: "...".join(x), axis="columns" + ).apply( # join factors with '...' + r_style_make_names + ) # reformat entire group in the R style + + mismatched_rows = expected_conditions_based_on_runsheet != df_sample["condition"] + + # check logic + if not any(mismatched_rows): + code = FlagCode.GREEN + message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}" + else: + code = FlagCode.HALT + mismatch_description = ( + df_sample[mismatched_rows]["condition"] + + " <--SAMPLETABLE : RUNSHEET--> " + + expected_conditions_based_on_runsheet[mismatched_rows] + ).to_dict() + message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}" + return {"code": code, "message": message} + + +def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + # check logic + differences = set(expected_comparisons).symmetric_difference( + set(df_contrasts.columns) + ) + if not differences: + code = FlagCode.GREEN + message = f"Contrasts header includes expected comparisons as determined by runsheet Factor Value Columns: {set(expected_comparisons)}" + else: + code = FlagCode.HALT + message = f"Contrasts header does not match expected comparisons as determined by runsheet Factor Value Columns: {differences}" + return {"code": code, "message": message} + + +def check_contrasts_table_rows(contrasts_table: Path, **_) -> FlagEntry: + # data specific preprocess + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + def _get_groups_from_comparisons(s: str) -> set[str]: + """Converts '(G1)v(G2)' + into the set {G1, G2}, where G1 and G2 are renamed as per the r make names function + + Args: + s (str): Input that fits this format: '(G1)v(G2)' + + Returns: + set[str]: The two reformatted group names + """ + g1, g2 = s.split(")v(") + # remove parens and reformat with r make names style + g1 = r_style_make_names(g1[1:].replace(" & ", "...")) + g2 = r_style_make_names(g2[:-1].replace(" & ", "...")) + return {g1, g2} + + bad_columns: dict[str, dict[str, set]] = dict() + for (col_name, col_series) in df_contrasts.iteritems(): + expected_values = _get_groups_from_comparisons(col_name) + if not expected_values == set(col_series): + bad_columns[col_name] = { + "expected": expected_values, + "actual": set(col_series), + } + + # check logic + if not bad_columns: + code = FlagCode.GREEN + message = f"Contrasts column and rows match 
expected formatting" + else: + code = FlagCode.HALT + message = f"Contrasts columns {bad_columns} have unexpected values" + return {"code": code, "message": message} + + +def check_dge_table_annotation_columns_exist( + dge_table: Path, organism: str, **_ +) -> FlagEntry: + REQUIRED_ANNOTATION_KEYS = { + "SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS", + } + MASTER_ANNOTATION_KEY = {"_DEFAULT": "ENSEMBL", "Arabidopsis thaliana": "TAIR"} + + df_dge = pd.read_csv(dge_table) + + required_columns = REQUIRED_ANNOTATION_KEYS.union( + {MASTER_ANNOTATION_KEY.get(organism, MASTER_ANNOTATION_KEY["_DEFAULT"])} + ) + + missing_columns = required_columns - set(df_dge.columns) + # check logic + if not missing_columns: + code = FlagCode.GREEN + message = f"Found all required annotation columns: {required_columns}" + else: + code = FlagCode.HALT + message = ( + f"Missing the following required annotation columns: {missing_columns}" + ) + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_exist( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + # data specific preprocess + df_dge = pd.read_csv(dge_table) + + missing_sample_columns = samples - set(df_dge.columns) + + # check logic + if not missing_sample_columns: + code = FlagCode.GREEN + message = f"All samplewise columns present" + else: + code = FlagCode.HALT + message = f"Missing these sample count columns: {missing_sample_columns}" + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_constraints( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + MINIMUM_COUNT = 0 + # data specific preprocess + df_dge = pd.read_csv(dge_table)[samples] + + column_meets_constraints = df_dge.apply( + lambda col: all(col >= MINIMUM_COUNT), axis="rows" + ) + + # check logic + contraint_description = f"All counts are greater or equal to {MINIMUM_COUNT}" + if all(column_meets_constraints): + code = FlagCode.GREEN + message = ( + f"All values in columns: {samples} met constraint: {contraint_description}" + ) + else: + code = FlagCode.HALT + message = ( + f"These columns {list(column_meets_constraints.index[~column_meets_constraints])} " + f"fail the contraint: {contraint_description}." + ) + return {"code": code, "message": message} + + +def check_dge_table_group_columns_exist( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet) + expected_columns = { + "".join(comb) + for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) + } + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All group summary statistic columns (Prefixes: {GROUP_PREFIXES}) present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these group summary statistic columns (Prefixes: {GROUP_PREFIXES}): {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def check_dge_table_group_columns_constraints( + dge_table: Path, runsheet: Path, samples: set[str], **_ +) -> FlagEntry: + FLOAT_TOLERANCE = ( + 0.001 # Percent allowed difference due to float precision differences + ) + # data specific preprocess + GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet) + query_columns = { + "".join(comb) + for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) + } + + expected_group_lists = utils_runsheet_to_expected_groups( + runsheet, map_to_lists=True, limit_to_samples=samples + ) + df_dge = pd.read_csv(dge_table) + + # issue trackers + issues: dict[str, list[str]] = { + f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [], + f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [], + } + + group: str + sample_set: list[str] + for group, sample_set in expected_group_lists.items(): + abs_percent_differences = abs( + (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns")) + / df_dge[sample_set].mean(axis="columns") + * 100 + ) + if any(abs_percent_differences > FLOAT_TOLERANCE): + issues[ + f"mean computation deviates by more than {FLOAT_TOLERANCE} percent" + ].append(group) + + abs_percent_differences = abs( + (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns")) + / df_dge[sample_set].mean(axis="columns") + * 100 + ) + if any(abs_percent_differences > FLOAT_TOLERANCE): + issues[ + f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent" + ].append(group) + + # check logic + contraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accomodate minor float related differences )" + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns: {query_columns} met constraint: {contraint_description}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" + f"fail the contraint: {contraint_description}." + ) + return {"code": code, "message": message} + + +def check_dge_table_comparison_statistical_columns_exist( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + COMPARISON_PREFIXES = ["Log2fc_", "Stat_", "P.value_", "Adj.p.value_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + expected_columns = { + "".join(comb) + for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons) + } + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def utils_common_constraints_on_dataframe( + df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...] +) -> dict: + + issues: dict[str, list[str]] = { + "Failed non null constraint": list(), + "Failed non negative constraint": list(), + } + + for (col_set, col_constraints) in constraints: + # this will avoid overriding the original constraints dictionary + # which is likely used in the check message + col_constraints = col_constraints.copy() + + # limit to only columns of interest + query_df = df[col_set] + for (colname, colseries) in query_df.iteritems(): + # check non null constraint + if col_constraints.pop("nonNull", False) and nonNull(colseries) == False: + issues["Failed non null constraint"].append(colname) + # check non negative constraint + if ( + col_constraints.pop("nonNegative", False) + and nonNegative(colseries) == False + ): + issues["Failed non negative constraint"].append(colname) + # check allowed values constraint + if allowedValues := col_constraints.pop("allowedValues", False): + if onlyAllowedValues(colseries, allowedValues) == False: + issues["Failed non negative constraint"].append(colname) + + # raise exception if there are unhandled constraint keys + if col_constraints: + raise ValueError(f"Unhandled constraint types: {col_constraints}") + + return issues + + +def check_dge_table_group_statistical_columns_constraints( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + + resolved_constraints = ( + ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}), + ({f"Stat_{comp}" for comp in expected_comparisons}, {"nonNull": True}), + # can be removed from analysis before p-value and adj-p-value assessed + # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na + ( + {f"P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": True, "nonNull": False}, + ), + ( + {f"Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": True, "nonNull": False}, + ), + ) + + df_dge = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe(df_dge, resolved_constraints) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns met constraint: {resolved_constraints}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" f"fail the contraint: {resolved_constraints}." 
+ ) + return {"code": code, "message": message} + + +def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry: + # data specific preprocess + fixed_stats_columns = { + "All.mean": {"nonNull": True, "nonNegative": True}, + "All.stdev": {"nonNull": True, "nonNegative": True}, + "LRT.p.value": {"nonNull": False, "nonNegative": True}, + } + expected_columns = set(fixed_stats_columns) + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All dataset summary stat columns present. {sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = ( + f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}" + ) + return {"code": code, "message": message} + + +def check_dge_table_fixed_statistical_columns_constraints( + dge_table: Path, **_ +) -> FlagEntry: + # data specific preprocess + fixed_stats_columns = ( + ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}), + ({"LRT.p.value"}, {"nonNull": False, "nonNegative": True}), + ) + + df_dge = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe(df_dge, fixed_stats_columns) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns met constraint: {fixed_stats_columns}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that " f"fail the constraint: {fixed_stats_columns}." + ) + return {"code": code, "message": message} + + +def check_dge_table_log2fc_within_reason( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 10 # Percent + LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT = 50 # Percent + + # TODO: discuss, this might even be fine to lower quite a bit + # e.g. THRESHOLD_PERCENT_MEANS_DIFFERENCE = 1 # percent + THRESHOLD_PERCENT_MEANS_DIFFERENCE = 50 # percent + + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + df_dge = pd.read_csv(dge_table) + + # Track error messages + err_msg_yellow = "" + all_suspect_signs: dict[int, dict[str, float]] = dict() + for comparison in expected_comparisons: + query_column = f"Log2fc_{comparison}" + group1_mean_col = ( + "Group.Mean_" + comparison.split(")v(")[0] + ")" + ) # Uses parens and adds them back to prevent slicing on 'v' within factor names + group2_mean_col = "Group.Mean_" + "(" + comparison.split(")v(")[1] + computed_log2fc = (df_dge[group1_mean_col] / df_dge[group2_mean_col]).apply( + math.log, args=[2] + ) + abs_percent_difference = abs( + ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100 + ) + percent_within_tolerance = ( + mean( + abs_percent_difference + < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD + ) + * 100 + ) + # flag if not enough within tolerance + if percent_within_tolerance < LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT: + err_msg_yellow += ( + f"For comparison: '{comparison}' {percent_within_tolerance:.2f} % of genes have absolute percent differences " + f"(between log2fc direct computation and DESeq2's approach) " + f"less than {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD} % which does not meet the 
minimum percentage " + f"({LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT} %) of genes required. " + f"This may indicate misassigned or misaligned columns. " + ) + + #### sign based checks + + # filter to genes with based on groups means + abs_percent_differences = ( + abs( + (df_dge[group1_mean_col] - df_dge[group2_mean_col]) + / df_dge[group2_mean_col] + ) + * 100 + ) + df_dge_filtered = df_dge.loc[ + abs_percent_differences > THRESHOLD_PERCENT_MEANS_DIFFERENCE + ] + + df_dge_filtered["positive_sign_expected"] = ( + df_dge[group1_mean_col] - df_dge[group2_mean_col] > 0 + ) + + df_dge_filtered["matches_expected_sign"] = ( + (df_dge[query_column] > 0) & df_dge_filtered["positive_sign_expected"] + ) | ((df_dge[query_column] < 0) & ~df_dge_filtered["positive_sign_expected"]) + + all_suspect_signs = all_suspect_signs | df_dge_filtered.loc[ + df_dge_filtered["matches_expected_sign"] == False + ][[group1_mean_col, group2_mean_col, query_column]].to_dict("index") + + if all_suspect_signs: + code = FlagCode.RED + message = f"At least one log2fc sign is suspect, the following log2fc compared to actual group means: {all_suspect_signs}" + elif err_msg_yellow: + code = FlagCode.YELLOW + message = ( + f"All log2fc not within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% " + f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than " + f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. " + ) + else: + code = FlagCode.GREEN + message = ( + f"All log2fc within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% " + f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than " + f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. Additionally, for comparisons with mean differences " + f"greater than {THRESHOLD_PERCENT_MEANS_DIFFERENCE}% all have reasonable log2fc signs" + ) + + return {"code": code, "message": message} + + +def check_viz_table_columns_exist(dge_table: Path, runsheet: Path, **_) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + viz_pairwise_columns_prefixes = ( + ( + {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNull": False}, + ), + ( + {f"Sig.1_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Sig.05_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Log2_P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": False, "nonNull": False}, + ), + ( + {f"Updown_{comp}" for comp in expected_comparisons}, + {"allowedValues": [1, 0, -1], "nonNull": True}, + ), + ) + + expected_columns = set( + itertools.chain(*[c1 for c1, _ in viz_pairwise_columns_prefixes]) + ) + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All viz specific comparison columns present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these viz specific comparison columns: {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def check_viz_table_columns_constraints( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + viz_pairwise_columns_constraints = ( + ( + {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNull": False}, + ), + ( + {f"Sig.1_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Sig.05_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Log2_P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": False, "nonNull": False}, + ), + ( + {f"Updown_{comp}" for comp in expected_comparisons}, + {"allowedValues": [1, 0, -1], "nonNull": True}, + ), + ) + + df_viz = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe( + df_viz, viz_pairwise_columns_constraints + ) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = ( + f"All values in columns met constraint: {viz_pairwise_columns_constraints}" + ) + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" + f"fail the contraint: {viz_pairwise_columns_constraints}." + ) + return {"code": code, "message": message} + + +def check_viz_pca_table_index_and_columns_exist( + pca_table: Path, samples: set[str] +) -> FlagEntry: + EXPECTED_VIS_PCA_COLUMNS = {"PC1", "PC2", "PC3"} + err_msg = "" + # data specific preprocess + df = pd.read_csv(pca_table, index_col=0) + + # check all samples included + if missing_samples := samples - set(df.index): + err_msg += f"Missing samples in index: {missing_samples}" + + # check all expected columns exist + if missing_cols := EXPECTED_VIS_PCA_COLUMNS - set(df.columns): + err_msg += f"Missing expected columns: {missing_cols}" + + if not err_msg: + code = FlagCode.GREEN + message = f"PCA Table has all the samples in the index and these columns exist: {EXPECTED_VIS_PCA_COLUMNS}" + else: + code = FlagCode.HALT + message = err_msg + + return {"code": code, "message": message} + + +def utils_formatting_list(l: list[str], spaces: int = 2) -> str: + """Reformats list to print friendly multi line string. 
+ + Example: + Reformatting a list of samples:: + + l = ['groundControl_1','groundControl_2','spaceFlight_1','spaceFlight-2'] + print(f"Samples: \n{utils_formatting_list(l)}") + + Args: + l (list): A list of strings to format + spaces (int): Number of leading spaces per line + + Returns: + str: Print friendly multiline string + """ + leading_spaces = " " * spaces + return "\n".join([f"{leading_spaces}- {item}" for item in l]) + + +def utils_rsem_counts_table_to_dataframe( + counts_table: Path, describe: bool = True +) -> pd.DataFrame: + df = pd.read_csv(counts_table, index_col=0).rename_axis("geneID") + if describe: + print(f"Loaded rsem counts table:") + print(f" Samples: \n{utils_formatting_list(list(df.columns), spaces = 4)}") + print(f" Number of Genes: {len(df)}") + return df + + +def utils_get_asset(asset_name: str) -> Path: + [p] = (p for p in files("dp_tools") if p.name == asset_name) + return p.locate() + + +def check_ERCC_subgroup_representation(unnormalizedCountTable: Path, **_) -> FlagEntry: + """Check ERCC subgroup representation is robust. + Specifically, counts the dataset wide ERCC IDs then categorizes each subgroup + by the number of represented ERCC IDs in that subgroup. + Finally, generates a Flag result by comparison to thresholds. + + Args: + counts_table (Path): RSEM unnormalized counts table + + Returns: + FlagEntry: Result of the check. + """ + MINIMUM_GREEN = 21 + MINIMUM_YELLOW = 19 + MINIMUM_RED = 0 + MINIMUM_HALT = 0 + + # data specific preprocess + df_counts = utils_rsem_counts_table_to_dataframe(unnormalizedCountTable) + + ercc_file = utils_get_asset("cms_095046.txt") + df_ercc = pd.read_csv(ercc_file, sep="\t") + + # filter to only ercc genes + df_counts = df_counts.loc[df_counts.index.isin(df_ercc["ERCC ID"])] + + # filter to only genes with at least one count (i.e. 
ERCC genes represented in the dataset) + df_counts = df_counts.loc[df_counts.sum(axis="columns") > 0] + + # merge to ercc table data including subgroup + df_counts = df_counts.merge(df_ercc, left_index=True, right_on="ERCC ID") + + # generate subgroup counts + df_subgroup_counts = df_counts["subgroup"].value_counts().sort_index() + + green_key = f"green level subgroups: > {MINIMUM_GREEN} ERCC represented" + yellow_key = ( + f"yellow level subgroups: {MINIMUM_YELLOW}-{MINIMUM_GREEN} ERCC represented" + ) + red_key = f"red level subgroups: {MINIMUM_RED}-{MINIMUM_YELLOW} ERCC represented" + halt_key = f"halt level subgroups: < {MINIMUM_HALT} ERCC represented" + + # classify each representation count + # (note: with MINIMUM_HALT = 0 the halt category can never be populated, since represented subgroups always have counts >= 1) + representation_category: dict[str, dict[str, int]] = { + green_key: df_subgroup_counts.loc[df_subgroup_counts > MINIMUM_GREEN].to_dict(), + yellow_key: df_subgroup_counts.loc[ + df_subgroup_counts.between(MINIMUM_YELLOW, MINIMUM_GREEN) + ].to_dict(), + red_key: df_subgroup_counts.loc[ + df_subgroup_counts.between( + MINIMUM_RED, MINIMUM_YELLOW, inclusive="left" + ) + ].to_dict(), + halt_key: df_subgroup_counts.loc[df_subgroup_counts < MINIMUM_HALT].to_dict(), + } + + # check logic + if representation_category[halt_key]: + code = FlagCode.HALT + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + elif representation_category[red_key]: + code = FlagCode.RED + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + elif representation_category[yellow_key]: + code = FlagCode.YELLOW + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + else: + code = FlagCode.GREEN + message = ( + f"Dataset wide ERCC representation is robust: {representation_category}" + ) + return {"code": code, "message": message} + + +def check_sample_in_multiqc_report( + samples: list[str], + multiqc_report_path: Path, + name_reformat_func: Callable = lambda s: s, +) -> FlagEntry: + """Determines if the query samples are present in the multiqc report. + + This is achieved by checking the 'multiqc_sources.txt' table, 'Sample Name' column. + An optional name_reformat_func can be supplied to address sample name changes that occur in the multiqc report. + One example is the renaming of sample '-' characters to '_' for certain RSeQC modules. + + :param samples: Query sample names to check for presence + :type samples: list[str] + :param multiqc_report_path: MultiQC report directory + :type multiqc_report_path: Path + :param name_reformat_func: A function applied to the multiQC sample names before searching against query sample names, defaults to not renaming the multiQC sample names + :type name_reformat_func: Callable, optional + :return: Flag Entry denoting successful or failing results. Includes description of query sample names and any missing samples + :rtype: FlagEntry + """ + # Load multiQC sources table and retrieve set of samples + [sources_table] = multiqc_report_path.glob("**/multiqc_sources.txt") + multiQC_samples = list(pd.read_csv(sources_table, sep="\t")["Sample Name"]) + + # Transform multiQC samples using name_reformat_func + reformatted_multiQC_samples = [name_reformat_func(s) for s in multiQC_samples] + + # Check for any missing reformatted sample names. + # Extra multiQC samples are not errors; the full name lists are surfaced in the message details.
+ missing_samples = set(samples) - set(reformatted_multiQC_samples) + + # check logic + if len(missing_samples) == 0: + code = FlagCode.GREEN + message = f"Found all query samples after reformatting multiQC sample names. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + else: + code = FlagCode.HALT + message = f"Missing the following query samples: {missing_samples}. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + return {"code": code, "message": message} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml new file mode 100644 index 00000000..ea4f7041 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml @@ -0,0 +1,150 @@ +# TOP LEVEL +NAME: "metagenomics" +VERSION: "1" + +Staging: + General: + Required Metadata: + From ISA: + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. + Example: Microbiota + + - ISA Field Name: + - Characteristics[host organism] + - Characteristics[Host Organism] + - Characteristics[Host organism] + ISA Table Source: Sample + Runsheet Column Name: host organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. + Example: Mus musculus + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + - Parameter Value: library layout + ISA Table Source: Assay + Runsheet Column Name: paired_end + Remapping: {"PAIRED":true, "Paired":true, "SINGLE":false, "Single":false} + Processing Usage: >- + Indicates if the sequencing was paired end. This controls how a variety of tools are invoked + including in-house written scripts. + Example: 'TRUE' + + # this entry denotes the following: + # retrieve from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped to each filename + - ISA Field Name: + - Parameter Value[Merged Sequence Data File] + - Characteristics[Merged Sequence Data File] + - Raw Data File + ISA Table Source: Assay + Multiple Values Per Entry: true + Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + Runsheet Column Name: + - {'name':'read1_path', 'index':0} + - {'name':'read2_path', 'index':1, 'optional':true} + GLDS URL Mapping: True + Processing Usage: >- + Location to the raw data fastq file. May be a url or local path. + Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' 
+ + - ISA Field Name: + - Parameter Value[Merged Sequence Data File] + - Characteristics[Merged Sequence Data File] + - Raw Data File + ISA Table Source: Assay + Multiple Values Per Entry: true + Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + Runsheet Column Name: + - {'name':'raw_R1_suffix', 'index':0} + - {'name':'raw_R2_suffix', 'index':1, 'optional':true} + + Processing Usage: >- + Suffix of the raw data fastq file name. + Example: '_R1_raw.fastq.gz or _raw.fastq.gz for SE' + + - ISA Field Name: Factor Value[{factor_name}] + ISA Table Source: [Assay, Sample] + Runsheet Column Name: Factor Value[{factor_name}] + Matches Multiple Columns: true + Match Regex: "Factor Value\\[.*\\]" + Append Column Following: "Unit" + Processing Usage: >- + Factor values in a study. Used to assign experimental groups for each sample. + Note: On the runsheet, a subsequent 'Unit' Column value will be + suffix-concatenated if it exists. + Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive).
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "Metagenomic sequencing" + technology: "Whole-Genome Shotgun Sequencing" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_metagenomics_" + + # # configuration related to updating investigation file + # # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # # LEADCAP_organism should be the studied organisms scientific name with a leading cap + # Post Processing Add Study Protocol: + # GeneLab Methyl-Seq data processing protocol::{LEADCAP_organism} V1 + +data assets: + # resource categories: *neverPublished \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py new file mode 100644 index 00000000..5eaa896a --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py @@ -0,0 +1,997 @@ +from pathlib import Path +import re +from typing import Union +import yaml +import logging + +from dp_tools.core.entity_model import Dataset + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import ValidationProtocol + +from .checks import * + +CONFIG = { + "Metadata-check_metadata_attributes_exist": { + "expected_attrs": ["paired_end", "has_ERCC", "organism"] + }, + "Raw Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Trim Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Raw Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "Trim Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "STAR Alignments By Sample-check_thresholds-Mapped": { + "mqc_key": "STAR", + "stat_string": "uniquely_mapped_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 70}, + {"code": "RED", "type": "lower", "value": 50}, + ], + }, + "STAR Alignments By Sample-check_thresholds-MultiMapped": { + "mqc_key": "STAR", + "stat_string": "multimapped_toomany_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 30}, + {"code": "RED", "type": "lower", "value": 15}, + ], + }, + "STAR Alignments-check_for_outliers": { + "mqc_module": "STAR", + "mqc_plot": "general_stats", + "mqc_keys": [ + "uniquely_mapped_percent", + "avg_mapped_read_length", + "mismatch_rate", + "deletion_rate", + "deletion_length", + "insertion_rate", + "insertion_length", + "multimapped_percent", + "multimapped_toomany_percent", + "unmapped_mismatches_percent", + 
"unmapped_tooshort_percent", + "unmapped_other_percent", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-geneBody_coverage": { + "mqc_module": "RSeQC", + "mqc_plot": "Gene Body Coverage", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-infer_experiment": { + "mqc_module": "RSeQC", + "mqc_plot": "Infer experiment", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-inner_distance": { + "mqc_module": "RSeQC", + "mqc_plot": "Inner Distance", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-read_distribution": { + "mqc_module": "RSeQC", + "mqc_plot": "Read Distribution", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_strandedness_assessable_from_infer_experiment": { + "stranded_assessment_range": {"max": 100, "min": 75}, + "unstranded_assessment_range": {"min": 40, "max": 60}, + "valid_dominant_strandedness_assessments": [ + "Sense (% Tags)", + "Antisense (% Tags)", + ], + }, + "RSEM Counts-check_for_outliers": { + "mqc_module": "Rsem", + "mqc_plot": "general_stats", + "mqc_keys": [ + "Unalignable", + "Alignable", + "Filtered", + "Total", + "alignable_percent", + "Unique", + "Multi", + "Uncertain", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, +} + +# Manual kept in sync for now +COMPONENTS_LIST = [ + "Metadata", # for raw reads V&V + "Raw Reads", # for raw reads V&V + "Raw Reads By Sample", # for raw reads V&V + "Trim Reads", # for trim reads V&V + "Trimmed Reads By Sample", # for trim reads V&V + "STAR Alignments", # for star alignment V&V + "STAR Alignments By Sample", # for star alignment V&V + "RSeQC By Sample", # for RSeQC V&V + "RSeQC", # for RSeQC V&V + "RSEM Counts", # for after RSEM V&V + "Unnormalized Gene Counts", # for after RSEM V&V + "DGE Metadata", # for post DGE + "DGE Metadata ERCC", # for post DGE + "DGE Output", # for post DGE + "DGE Output ERCC", # for post DGE +] + + +def validate( + dataset: Dataset, + config_path: Path = None, + run_args: dict = None, + report_args: dict = None, + protocol_args: dict = None, + defer_run: bool = False, +) -> Union[ValidationProtocol, ValidationProtocol.Report]: + + if config_path is not None: + with open(config_path, "r") as f: + config = yaml.safe_load(f) + else: + config = CONFIG + + if run_args is None: + run_args = dict() + + if report_args is None: + report_args = dict() + + if protocol_args is None: + protocol_args = dict() + + # Modify protocol_args to convert run_components to skip_components based on COMPONENTS_LIST + if ( + "run_components" in protocol_args + and protocol_args.get("run_components") is not None + ): + protocol_args["skip_components"] = [ + c for c in COMPONENTS_LIST if c not in protocol_args["run_components"] + ] + # 
Check if any run components are not in COMPONENTS_LIST + if set(protocol_args["run_components"]) - set(COMPONENTS_LIST): + raise ValueError( + f"run_components contains components not in COMPONENTS_LIST. Unique to run_components: {set(protocol_args['run_components']) - set(COMPONENTS_LIST)}. All Components: {COMPONENTS_LIST}" + ) + del protocol_args["run_components"] + + # init validation protocol + vp = ValidationProtocol(**protocol_args) + # fmt: on + with vp.component_start( + name=dataset.name, + description="Validate processing from trim reads through differential gene expression output", + ): + + with vp.component_start( + name="Metadata", description="Metadata file validation" + ): + with vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_metadata_attributes_exist, + config=config["Metadata-check_metadata_attributes_exist"], + ) + + with vp.component_start( + name="Raw Reads", description="Raw Reads Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["raw reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "raw forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "raw reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Raw Reads-check_for_outliers"] + ) + + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "raw MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in raw reads multiQC report", + ) + + with vp.component_start( + name="Trim Reads", description="Trimmed Reads Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["trimmed reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Trim Reads-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimmed fastQC MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_R1|_R2$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimming MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check that all samples are present in the trimmed FastQC and trimming report multiQC reports", + ) + with vp.component_start( + name="STAR Alignments", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["aligned log Final"], + } + ] + ): + vp.add( + check_for_outliers, + config=config["STAR Alignments-check_for_outliers"], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "aligned MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + 
description="Check all samples are present in STAR multiQC report", + ) + + with vp.component_start( + name="RSeQC", + description="RSeQC submodule outliers checking and other submodule specific dataset wide checks", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["genebody coverage out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in geneBody Coverage", + config=config["RSeQC-check_for_outliers-geneBody_coverage"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["infer experiment out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in infer experiment", + config=config["RSeQC-check_for_outliers-infer_experiment"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["inner distance out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in inner distance", + config=config["RSeQC-check_for_outliers-inner_distance"], + skip=(not dataset.metadata["paired_end"]), + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["read distribution out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in read distribution", + config=config["RSeQC-check_for_outliers-read_distribution"], + ) + + with vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_strandedness_assessable_from_infer_experiment, + config=config[ + "RSeQC-check_strandedness_assessable_from_infer_experiment" + ], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "genebody coverage MultiQC directory" + ].path, + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "infer experiment MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_infer_expt$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "read distribution MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_read_dist$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC multiQC reports", + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "inner distance MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC inner distance multiQC report (paired end only)", + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="RSEM Counts", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["sample counts stats directory"], + } + ] + ): + vp.add( + check_for_outliers, config=config["RSEM Counts-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "RSEM counts MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSEM multiQC report", + ) + with vp.component_start( + name="Unnormalized Gene Counts", + description="Validate normalization related output", + ): + + with vp.payload( + payloads=[ + { + "unnormalizedCountTable": lambda: 
dataset.data_assets[ + "star unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample reads per gene table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables + ) + with vp.payload( + payloads=[ + { + "unnormalizedCountTable": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample gene counts table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables + ) + vp.add( + check_ERCC_subgroup_representation, + skip=(not dataset.metadata["has_ERCC"]), + ) + + with vp.component_start( + name="DGE Metadata", + description="", + ): + + with vp.component_start( + name="Sample Table", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": True}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Metadata ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + + with vp.component_start( + name="Sample Table", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "ERCC sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": False}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Output", + description="", + ): + with vp.payload( + payloads=[ + { + "rsem_table_path": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "deseq2_table_path": lambda: dataset.data_assets[ + "DESeq2 unnormalized counts table" + ].path, + } + ] + ): + vp.add( + check_rsem_counts_and_unnormalized_tables_parity, + skip=( + "rsem unnormalized counts table" not in dataset.data_assets + or "DESeq2 unnormalized counts table" not in dataset.data_assets + ), + ) + + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + 
vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set(dataset.samples), + "pca_table": lambda: dataset.data_assets[ + "DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + with vp.component_start( + name="DGE Output ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + 
vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "pca_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + for sample in dataset.samples.values(): + with vp.component_start( + name=sample.name, description="Samples level checks" + ): + with vp.component_start( + name="Raw Reads By Sample", description="Raw reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda sample=sample: sample.data_assets[ + "raw forward reads fastq GZ" + ].path + }, + { + "file": lambda sample=sample: sample.data_assets[ + "raw reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda sample=sample: sample.data_assets[ + "raw reads fastq GZ" + ].path + }, + ] + ) + ): + vp.add( + check_fastqgz_file_contents, + config=config[ + "Raw Reads By Sample-check_fastqgz_file_contents" + ], + ) + vp.add( + check_gzip_file_integrity, + ) + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "raw forward reads fastQC ZIP", + "reads_key_2": "raw reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="Trimmed Reads By Sample", description="Trimmed reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed forward reads fastq GZ" + ].path + }, + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed reads fastq GZ" + ].path + } + ] + ) + ): + vp.add(check_file_exists, description="Check reads files exist") + vp.add( + check_fastqgz_file_contents, + config=config[ + "Trim Reads By Sample-check_fastqgz_file_contents" + ], + ) + + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "trimmed forward reads fastQC ZIP", + "reads_key_2": "trimmed reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + + with vp.component_start( + name="STAR Alignments By Sample", + description="STAR Alignment outputs", + ): + + with vp.payload( + payloads=[ + { + "file": lambda sample=sample: sample.data_assets[ + "aligned ToTranscriptome Bam" + ].path, + }, + { + "file": lambda sample=sample: sample.data_assets[ + "aligned SortedByCoord Bam" + ].path, + }, + ] + ): + vp.add( + check_bam_file_integrity, + config={ + "samtools_bin": "samtools" + }, # assumes accessible on path already + ) + + with vp.payload( + payloads=[ + { + "multiqc_inputs": lambda sample=sample: [ + sample.data_assets["aligned log Final"].path + ], + }, + ] + ): + vp.add( + check_thresholds, + config=config[ + "STAR Alignments By Sample-check_thresholds-Mapped" + ], + description="Check that mapping rates are reasonable, specifically most reads map to the target genome", + ) + vp.add( + check_thresholds, + config=config[ + "STAR Alignments By Sample-check_thresholds-MultiMapped" + ], + 
description="Check that mapping rates are reasonable, specifically that a considerable amount of reads multimap to the target genome", + ) + + with vp.component_start( + name="RSeQC By Sample", + description="RNASeq QA outputs", + ): + with vp.component_start( + name="geneBody_coverage", + description="Assess integrity of transcripts and library prep signatures", + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda sample=sample: sample.data_assets[ + "genebody coverage out" + ].path + }, + ] + ): + vp.add(check_genebody_coverage_output) + with vp.component_start( + name="inner_distance", + description="Reports on distance between mate reads based on gene annotations", + skip=(not dataset.metadata["paired_end"]), + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda sample=sample: sample.data_assets[ + "inner distance out" + ].path + }, + ] + ): + vp.add(check_inner_distance_output) + # return protocol object without running or generating a report + if defer_run: + return vp + + vp.run(**run_args) + + # return report + return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts) \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py new file mode 100644 index 00000000..d3db1810 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py @@ -0,0 +1,62 @@ +""" Schemas for validation +Uses Schema to allow usage of validation functions +""" +from schema import Schema +from schema import Optional as schema_Optional +from typing import Optional +import pandera as pa + + +check_read2_path_populated_if_paired_end = pa.Check( + lambda df: ("read2_path" in df.columns and df['paired_end'].iloc[0] == True) or + ("read2_path" not in df.columns and df['paired_end'].iloc[0] == False), + title="Check 'read2_path' is only populated if paired_end is True", + description="Failures here are likely either due to manual user error or inappropriate source file (e.g. 
ISA archive)", + error="Expected 'read2_path' to be populated only if paired_end is True" + ) + +runsheet = { + "metagenomics": pa.DataFrameSchema( + columns={ + "Original Sample Name": pa.Column(str), + "read1_path": pa.Column(str), + "read2_path": pa.Column(str, required=False), # Expect if paired_end is True + }#, + # define checks at the DataFrameSchema-level + #checks=check_read2_path_populated_if_paired_end + ) +} + +import pandas as pd + +class runsheet: # Bad casing since we will use the class definition itself for all static methods + + @staticmethod + def check_single_value(column: pd.Series, error_msg: str, errors: list[str]) -> None: + if len(column.unique()) != 1: + errors.append(error_msg) + + @staticmethod + def check_read2_path_populated_if_paired_end(df: pd.DataFrame, errors: list[str]) -> None: + if (("read2_path" in df.columns and df['paired_end'][0] == True) or + ("read2_path" not in df.columns and df['paired_end'][0] == False)): + return + else: + errors.append("Expected 'read2_path' to be populated only if paired_end is True") + + @staticmethod + def validate(df_runsheet: pd.DataFrame) -> bool: + errors = [] + + # Check for single value in specified columns + + runsheet.check_single_value(df_runsheet['organism'], "Dataset level columns do NOT contain one unique value for 'organism'", errors) + runsheet.check_single_value(df_runsheet['paired_end'], "Dataset level columns do NOT contain one unique value for 'paired_end'", errors) + + # Check for 'read2_path' population if paired_end is True + #runsheet.check_read2_path_populated_if_paired_end(df_runsheet, errors) + + if errors: + raise ValueError("\n".join(errors)) + else: + return True \ No newline at end of file From 2180a500063a097eb0495d13c0dfaa1b7883d9a1 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 21 May 2024 13:14:26 -0500 Subject: [PATCH 19/33] Update README.md --- .../NF_MGRemoveHumanReads-B/workflow_code/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md index c34d76da..824d6658 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md @@ -2,7 +2,7 @@ ## General workflow info -The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (SW_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. 
The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). +The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). ## Utilizing the workflow @@ -72,11 +72,12 @@ Sample-2 While in the directory holding the NextFlow file, .config file, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: ```bash -nextflow run *path/to/Remove_Human_Reads.nf* -ansi-log false +nextflow run *path/to/Remove_Human_Reads.nf* -ansi-log false --specify_reads false ``` * `-ansi-log false` – specifies to print out each command being run to the screen * `-resume` – continues to run the workflow using cached data from the previous run +* `--specify_reads false` – processes all reads in the working directory, without requiring a sample ID list (note the double dash: workflow parameters use `--`, while single-dash options belong to NextFlow itself) See `nextflow -h` and [NextFlow's documentation](https://www.nextflow.io/docs/master/index.html) for more options and details.
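For instance, to restart an interrupted run while explicitly loading the workflow's config file, something like the following can be used (a sketch only: `-c` and `-resume` are standard NextFlow CLI options, and the file names are the ones included in this workflow_code directory):

```bash
# load the workflow's config file explicitly and resume from cached results
nextflow -c Remove_Human_reads.config run Remove_Human_Reads.nf -resume -ansi-log false

# process only the sample IDs in the list file referenced by the config
# (assumes the config points at the unique_sample_ids.txt shipped with the workflow)
nextflow -c Remove_Human_reads.config run Remove_Human_Reads.nf --specify_reads true
```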
From 579828e48179b000255a9de275381deb6e040035 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 21 May 2024 13:37:04 -0500 Subject: [PATCH 20/33] Update README.md --- .../workflow_code/README.md | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md index 824d6658..1ceb0d3f 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md @@ -2,19 +2,17 @@ ## General workflow info -The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). +The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [NextFlow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Docker](https://www.docker.com/) to run all tools in containers. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with NextFlow and Docker, but if you want to learn more about those, [this NextFlow tutorial](https://training.nextflow.io/basic_training/) within [NextFlow's documentation](https://www.nextflow.io/docs/stable/index.html) is a good place to start. ## Utilizing the workflow -1. [Install conda and NextFlow](#1-install-conda-NextFlow) +1. [Install NextFlow and Docker](#1-install-nextflow-and-docker) 2. [Download the workflow template files](#2-download-the-workflow-template-files) 3. [Modify the variables in the Remove_Human_Reads.config file](#3-modify-the-variables-in-the-config-file) 4. [Run the workflow](#4-run-the-workflow) ### 1. Install conda, mamba, and `genelab-utils` package -We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro).
- -Once Conda is installed, you can install NextFlow into your specified directory using the following code: +You can install NextFlow into your specified directory using the following code: ```bash curl -s https://get.nextflow.io | bash sudo mv nextflow /usr/local/bin ``` +Docker can be installed by following the [NextFlow setup page](https://training.nextflow.io/basic_training/). + + ### 2. Download the workflow template files -All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest SW_MGRemoveHumanReads-A version on to your system, run the following command: +All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest NF_MGRemoveHumanReads-A version onto your system, run the following command: ```bash -GL-get-workflow MG-remove-human-reads-B +GL-get-workflow NF_MGRemoveHumanReads-A ``` This downloads the workflow into a directory called `NF_MGRemoveHumanReads-*/`, with the workflow version number at the end. > Note: If you want an earlier version, it can be provided as an optional argument like so: > ```bash > -GL-get-workflow MG-remove-human-reads --wanted-version 1.0.0 > +GL-get-workflow NF_MGRemoveHumanReads-A --wanted-version 1.0.0 > ``` ### 3. Modify the variables in the Remove_Human_Reads.config file -Once you've downloaded the workflow template, you can modify the variables in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique-sample-IDs.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be setup automatically). Additionally, if necessary, you'll need to modify each variable in the [config.yaml](workflow_code/config.yaml) file to be consistent with the study you want to process and the machine you're using. +Once you've downloaded the workflow template, you can modify the variables in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique_sample_ids.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be set up automatically). Additionally, if necessary, you'll need to modify each variable in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file to be consistent with the study you want to process and the machine you're using.
From fea98b0faeb247665c96a85cf4858cef40e4c003 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 21 May 2024 13:40:21 -0500 Subject: [PATCH 21/33] renamed folder --- .../example-reads_PE/Sample-1_R1.fastq.gz | Bin .../example-reads_PE/Sample-1_R2.fastq.gz | Bin .../example-reads_PE/Sample-2_R1.fastq.gz | Bin .../example-reads_PE/Sample-2_R2.fastq.gz | Bin .../example-reads_PE/Sample-3_R1.fastq.gz | Bin .../example-reads_PE/Sample-3_R2.fastq.gz | Bin .../example-reads_PE/Sample-4_R1.fastq.gz | Bin .../example-reads_PE/Sample-4_R2.fastq.gz | Bin .../example-reads_SE/Sample-1_raw.fastq.gz | Bin .../example-reads_SE/Sample-2_raw.fastq.gz | Bin .../example-reads_SE/Sample-3_raw.fastq.gz | Bin .../example-reads_SE/Sample-4_raw.fastq.gz | Bin .../workflow_code/README.md | 4 ++-- .../workflow_code/Remove_Human_Reads.nf | 0 .../workflow_code/Remove_Human_reads.config | 0 .../workflow_code/unique_sample_ids.txt | 0 16 files changed, 2 insertions(+), 2 deletions(-) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-1_R1.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-1_R2.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-2_R1.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-2_R2.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-3_R1.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-3_R2.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-4_R1.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_PE/Sample-4_R2.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_SE/Sample-1_raw.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_SE/Sample-2_raw.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_SE/Sample-3_raw.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/example-reads_SE/Sample-4_raw.fastq.gz (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/workflow_code/README.md (97%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/workflow_code/Remove_Human_Reads.nf (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => 
NF_MGRemoveHumanReads-A}/workflow_code/Remove_Human_reads.config (100%) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/{NF_MGRemoveHumanReads-B => NF_MGRemoveHumanReads-A}/workflow_code/unique_sample_ids.txt (100%) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-1_R1.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R1.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-1_R1.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-1_R2.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-1_R2.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-1_R2.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-2_R1.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R1.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-2_R1.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-2_R2.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-2_R2.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-2_R2.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-3_R1.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R1.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-3_R1.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-3_R2.fastq.gz similarity index 100% rename from 
Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-3_R2.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-3_R2.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R1.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-4_R1.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R1.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-4_R1.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R2.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-4_R2.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_PE/Sample-4_R2.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_PE/Sample-4_R2.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-1_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-1_raw.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-1_raw.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-1_raw.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-2_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-2_raw.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-2_raw.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-2_raw.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-3_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-3_raw.fastq.gz similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-3_raw.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-3_raw.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-4_raw.fastq.gz b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-4_raw.fastq.gz similarity index 100% rename from 
Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/example-reads_SE/Sample-4_raw.fastq.gz rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/example-reads_SE/Sample-4_raw.fastq.gz diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md similarity index 97% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md index 1ceb0d3f..a14d2193 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md @@ -1,4 +1,4 @@ -# NF_MGRemoveHumanReads-B Workflow Information and Usage Instructions +# NF_MGRemoveHumanReads-A Workflow Information and Usage Instructions ## General workflow info @@ -11,7 +11,7 @@ The current pipeline for how GeneLab identifies and removes human DNA in Illumin 3. [Modify the variables in the Remove_Human_Reads.config file](#3-modify-the-variables-in-the-config-file) 4. [Run the workflow](#4-run-the-workflow) -### 1. Install conda, mamba, and `genelab-utils` package +### 1. Install NextFlow and Docker You can install NextFlow into your specified directory using the following code: ```bash diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_Reads.nf b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_Reads.nf rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/Remove_Human_reads.config rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/unique_sample_ids.txt b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/unique_sample_ids.txt similarity index 100% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-B/workflow_code/unique_sample_ids.txt rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/unique_sample_ids.txt From b3b07c4949e36739fa62641003eb2c2bf8b05a4b Mon Sep 17 00:00:00 2001 From: kieranmbrown 
<60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 21 May 2024 13:46:30 -0500 Subject: [PATCH 22/33] Delete txt --- .../NF_MGEstHostReads-B/unique_sample_ids.txt | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt deleted file mode 100644 index 5537ff9f..00000000 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/unique_sample_ids.txt +++ /dev/null @@ -1,3 +0,0 @@ -Sample-1 -Sample-2 -Sample-3 From 75cac8be8ba774c072976b4e99395fe023a0e161 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 21 May 2024 13:47:21 -0500 Subject: [PATCH 23/33] Delete NF_MGEstHostReads-B directory --- .../NF_MGEstHostReads-B/README.md | 70 - .../example-reads_PE/Sample-1_R1.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-1_R2.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-2_R1.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-2_R2.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-3_R1.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-3_R2.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-4_R1.fastq.gz | Bin 361 -> 0 bytes .../example-reads_PE/Sample-4_R2.fastq.gz | Bin 361 -> 0 bytes .../example-reads_SE/Sample-1_raw.fastq.gz | Bin 361 -> 0 bytes .../example-reads_SE/Sample-2_raw.fastq.gz | Bin 361 -> 0 bytes .../example-reads_SE/Sample-3_raw.fastq.gz | Bin 361 -> 0 bytes .../example-reads_SE/Sample-4_raw.fastq.gz | Bin 361 -> 0 bytes .../reference-database-info.md | 53 - .../workflow_code/Estimate_Host_Reads.nf | 124 -- .../config/Estimate_Host_Reads.config | 24 - .../workflow_code/config/checks.py | 1537 ----------------- .../workflow_code/config/config.yaml | 150 -- .../workflow_code/config/protocol.py | 997 ----------- .../workflow_code/config/schemas.py | 62 - 20 files changed, 3017 deletions(-) delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R2.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R1.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-2_R2.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R1.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-3_R2.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R1.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-4_R2.fastq.gz delete mode 100644 
Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-1_raw.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-2_raw.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-3_raw.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_SE/Sample-4_raw.fastq.gz delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py delete mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md deleted file mode 100644 index bd765ab7..00000000 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Estimate Host Reads Workflow Information and Usage Instructions - -## General Workflow Information -The workflow for estimating host DNA in Illumina metagenomics sequencing data, as specified in the "Estimate Host Reads" workflow, is implemented using Nextflow. This workflow is intended to be run on any Unix-based system using Docker containers to ensure consistency and reproducibility of the computational environment. - -## Utilizing the Workflow - -1. **Install Docker** -2. **Download the workflow template files** -3. **Modify the variables in the Nextflow config file** -4. **Run the workflow** - -### 1. Install Docker -We recommend installing Docker to handle all dependencies within containers. This simplifies the setup on any system and avoids compatibility issues: -```bash -# Install Docker following the official guidelines: -https://docs.docker.com/get-docker/ -``` - -### 2. Download the Workflow Template Files -Clone the repository or download the workflow files from the designated repository or link. Ensure you have all required files, including the Nextflow script (.nf) and the associated configuration files. - -### 3. Modify the Variables in the Nextflow Config File -Adjust the variables in the `nextflow.config` file to match your specific needs. This includes paths to input data, Docker containers, and output directories. - -Once you've downloaded the workflow template, you can modify the variables in the [Estimate_Host_Reads.config](workflow_code/Estimate_Host_Reads.config) file as needed. 
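-
-The parameters you are most likely to change point at your reads and the kraken2 database. As a minimal sketch, the relevant entries in the config look like this (the values shown are placeholders to adjust for your own system):
-
-```
-params.specify_reads   = true
-params.sample_id_list  = "/path/to/unique_sample_ids.txt"
-params.reads_dir       = "$projectDir/example-reads_PE/"
-params.PE_reads_suffix = "_R{1,2}.fastq.gz"
-params.host_db_path    = "$projectDir/kraken2-host-db"
-```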
-In particular, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique_sample_ids.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory where the kraken2 reference database should be stored (it will be set up automatically). Additionally, if necessary, you'll need to modify each variable in the [config.yaml](workflow_code/config.yaml) file to be consistent with the study you want to process and the machine you're using.
-
-> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure).
-
-**Example for how to create a single-column list of unique sample identifiers from your raw data file names**
-
-For example, if you only want to process a subset of the read files within the reads directory and have paired-end read data for 2 samples located in `../Raw_Sequence_Data/` relative to your workflow directory, that would look like this:
-
-```bash
-ls ../Raw_Sequence_Data/
-```
-
-```
-Sample-1_R1.fastq.gz
-Sample-1_R2.fastq.gz
-Sample-2_R1.fastq.gz
-Sample-2_R2.fastq.gz
-```
-
-You would set up your `unique_sample_ids.txt` file as follows (a one-line command for generating it is sketched at the end of this README):
-
-```bash
-cat unique_sample_ids.txt
-```
-
-```
-Sample-1
-Sample-2
-```
-
-### 4. Run the Workflow
-Navigate to the directory containing the Nextflow script and config file. Here is an example command to run the workflow:
-
-```bash
-nextflow run Estimate_Host_Reads.nf -profile docker
-```
-
-- `-profile docker` specifies that Docker containers should be used to run the tools.
-
-See `nextflow run -help` and [Nextflow's documentation](https://www.nextflow.io/docs/latest/index.html) for more options and detailed information.
-
-## Reference Database Information
-The database used for host estimation is maintained and updated periodically. Links to download the latest version of the database can be found in the workflow documentation.
-
----
-
-This workflow is designed for flexibility and can be adapted for various datasets and research needs. For any additional information or support, refer to the official Nextflow documentation or contact the support team.
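-
-**Generating the sample ID list from the raw read file names** (a minimal sketch, assuming the `_R1`/`_R2.fastq.gz` suffix convention used in the section 3 example; adjust the pattern if your suffixes differ):
-
-```bash
-ls ../Raw_Sequence_Data/ | sed 's/_R[12]\.fastq\.gz$//' | sort -u > unique_sample_ids.txt
-```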
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/example-reads_PE/Sample-1_R1.fastq.gz
deleted file mode 100644
index eb20c0c9ca408806a7bc7e0d50f860a385a80b3d..0000000000000000000000000000000000000000
GIT binary patch (361-byte gzip blob omitted)
[Eleven equivalent binary deletions omitted: example-reads_PE/Sample-1_R2 through Sample-4_R2 and example-reads_SE/Sample-1_raw through Sample-4_raw, all identical 361-byte gzip blobs at index eb20c0c9.]
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md
deleted file mode 100644
index 2cff1e92..00000000
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/reference-database-info.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Reference database info
-The database used will depend on the host. The ones that have been created thus far are detailed and available below.
-
-
-## Mouse ([GRCm39 | GCF_000001635.27](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.27)) database build
-This database was built with kraken2 v2.1.1 on 26-Jan-2022.
-
-**Download NCBI taxonomy info needed (takes ~10 minutes):**
-
-```bash
-kraken2-build --download-taxonomy --db kraken2-mouse-db/
-```
-
-**Downloading mouse reference genome:**
-
-```bash
-curl -LO https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz
-
-gunzip GCF_000001635.27_GRCm39_genomic.fna.gz
-```
-
-**Adding mouse fasta to database:**
-
-```bash
-kraken2-build --add-to-library GCF_000001635.27_GRCm39_genomic.fna --no-masking --db kraken2-mouse-db/
-```
-
-**Build the database (takes ~20 minutes as run here):**
-
-```bash
-kraken2-build --build --db kraken2-mouse-db/ --threads 30 --no-masking
-```
-
-**Remove intermediate files:**
-
-```bash
-kraken2-build --clean --db kraken2-mouse-db/
-```
-
-### Download mouse kraken2 db
-
-The reference database is ~2.6GB compressed and ~3.8GB uncompressed. It can be downloaded and unpacked with the following:
-
-```bash
-curl -L -o kraken2-mouse-db.tar.gz https://figshare.com/ndownloader/files/33900572
-
-tar -xzvf kraken2-mouse-db.tar.gz
-```
-
----
-
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf
deleted file mode 100644
index 4606b3eb..00000000
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/Estimate_Host_Reads.nf
+++ /dev/null
@@ -1,124 +0,0 @@
-// Initial logging of the workflow's parameters for tracking and debug purposes
-log.info """\
-         ESTIMATE HOST READS
-         ===================================
-         Download DB: ${params.DL_kraken}
-         Single end reads: ${params.single_end}
-         projectDir: ${projectDir}"""
-         .stripIndent()
-
-
-// Process for paired-end reads using Kraken2
-process PE_kraken2 {
-    container params.kraken2container
-    tag "$sample_id"
-    publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}"
-
-    input:
-    path database
-    tuple val(sample_id), path(reads_ch)
-
-    output:
-    path "${sample_id}-kraken2-output.txt"
-    path "${sample_id}-kraken2-report.tsv"
-
-    script:
-    """
-    kraken2 --db $database --gzip-compressed \
-        --threads 2 --use-names --paired \
-        --output ${sample_id}-kraken2-output.txt \
-        --report ${sample_id}-kraken2-report.tsv \
-        ${reads_ch[0]} ${reads_ch[1]}
-    """
-}
-
-// Process for single-end reads using Kraken2
-process SE_kraken2 {
-
-    container params.kraken2container
-    tag "$sample_id"
-    publishDir "$params.kraken_output_dir", pattern: "*.{txt,tsv}"
-
-    input:
-    path database
-    tuple val(sample_id), path(reads_ch)
-
-    // Only the kraken2 output and report are produced; this estimate-only
-    // workflow does not write filtered reads back out.
-    output:
-    path "${sample_id}-kraken2-output.txt"
-    path "${sample_id}-kraken2-report.tsv"
-
-    script:
-    """
-    kraken2 --db $database --gzip-compressed --threads 2 --use-names \
-        --output ${sample_id}-kraken2-output.txt \
-        --report ${sample_id}-kraken2-report.tsv \
-        ${reads_ch[0]}
-    """
-}
-
-
-workflow {
-
-    // Log the database path being used
-    log.info "\nAccessing previous host reads database"
-    database_ch = Channel.value(params.host_db_path)
-    database_ch.view{"database path: ${it}"}
-
-    // Conditional execution for single-end or paired-end data
-    if(params.single_end == true) {
-        log.info "\nReading Single-end data from ${params.reads_dir}\n"
-
-        // Specified reads handling (mimics Channel.fromFilePairs() output, but for SE data)
-        if (params.specify_reads) {
-            reads_ch = Channel
-                .fromPath("${params.sample_id_list}")
-                .splitText()
-                .map { it.trim() }
-                .map { sample_id ->
-                    def files = file("${params.reads_dir}${sample_id}${params.SE_reads_suffix}")
-                    return [sample_id, files]
-                }
-        }
-        else {
-            reads_ch = Channel
-                .fromPath("${params.reads_dir}/*${params.SE_reads_suffix}", checkIfExists: true)
-                .map { readfile ->
-                    def sampleId = readfile.name.replaceAll("${params.SE_reads_suffix}\$", "")
-                    return tuple(sampleId, readfile)
-                }
-        }
-        reads_ch.view{"reads: ${it}"}
-        output_ch = SE_kraken2(database_ch, reads_ch)
-
-    }
-    else {
-        log.info "\nReading Paired-end data from ${params.reads_dir}\n"
-        // Load specific reads if specified
-        if (params.specify_reads) {
-            reads_ch = Channel
-                .fromPath("${params.sample_id_list}")
-                .splitText()
-                .map { it.trim() }
-                .map { sample_id ->
-                    def files = file("${params.reads_dir}${sample_id}${params.PE_reads_suffix}").toList().sort()
-                    return [sample_id, files]
-                }
-        }
-        else {
-            reads_ch = Channel.fromFilePairs(params.reads_dir + "*" + params.PE_reads_suffix, checkIfExists: true)
-        }
-        reads_ch.view{"reads: ${it}"}
-        output_ch = PE_kraken2(database_ch, reads_ch)
-    }
-    // Calculate and log the final percentage of unclassified reads
-    // (the leading field of each kraken2 report is the percent unclassified)
-    final_percent = output_ch[1]
-        .collect{ (it.text[0..5]).toFloat() }
-        .map{ (it.sum() / it.size()).trunc(2) }    // mean across samples
-        .view{"\nRESULT: ${it}% of input reads were unclassified (reports in ${params.kraken_output_dir})"}
-
-}
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config
deleted file mode 100644
index c5038298..00000000
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/Estimate_Host_Reads.config
+++ /dev/null
@@ -1,24 +0,0 @@
-params.single_end = false
-
-params.specify_reads = true
-
-params.sample_id_list = "/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt"
-
-params.reads_dir = "$projectDir/example-reads_PE/"
-
-params.PE_reads_suffix = "_R{1,2}.fastq.gz"
-
-params.SE_reads_suffix = "_raw.fastq.gz"
-
-params.host_db_name = 'kraken2-host-db'
-params.host_db_path = "$projectDir/${params.host_db_name}"
-
-params.num_threads = 2
-
-params.kraken_output_dir = "$projectDir/kraken2-outputs"
-
-docker {enabled = true}
-params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0'
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py
deleted file mode 100644
index 885d2160..00000000
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/checks.py
+++ /dev/null
@@ -1,1537 +0,0 @@
-from collections import defaultdict
-import copy
-import enum
-import gzip
-import itertools
-import logging
-import math
-from pathlib import Path
-from statistics import mean
-import string
-import subprocess
-from typing import Callable, Dict, Union
-from importlib.metadata import files
-
-import pandas as pd
-
-from dp_tools.core.entity_model import Dataset, Sample, multiqc_run_to_dataframes
-
-log = logging.getLogger(__name__)
-
-from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers
-
-
-def r_style_make_names(s: str) -> str:
-    """Recreates R's make.names function for individual strings.
- This function is often used to create syntactically valid names in R which are then saved in R outputs. - Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names - - Args: - s (str): A string to convert - - Returns: - str: A string converted in the same way as R's make.names function - """ - EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ" # Note: there are two "μμ" like characters one is greek letter mu, the other is the micro sign - VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS - REPLACEMENT_CHAR = "." - new_string_chars = list() - for char in s: - if char in VALID_CHARACTERS: - new_string_chars.append(char) - else: - new_string_chars.append(REPLACEMENT_CHAR) - return "".join(new_string_chars) - - -# adapted from reference: https://stackoverflow.com/questions/56048627/round-floats-in-a-nested-dictionary-recursively -# used to round values for easier to read messages -def formatfloat(x): - return "%.3g" % float(x) - - -def pformat(original_dictionary, function): - dictionary = copy.deepcopy( - original_dictionary - ) # we don't want to override original values - if isinstance(dictionary, dict): - new_dict = dict() - for k, v in dictionary.items(): - new_dict[k] = function(v) if isinstance(v, float) else pformat(v, function) - return new_dict - return dictionary - - -def convert_nan_to_zero(input: Dict[str, Union[float, int]]) -> Dict: - """Convert any Nan into zero""" - output = dict() - for key, value in input.items(): - output[key] = value if not math.isnan(value) else 0 - return output - - -## Functions that use the following syntax to merge values from general stats: -# "stat1 + stat2" should search and sum the stats -# TODO: refine dict typehint -def stat_string_to_value(stat_string: str, mqcData: dict) -> float: - """ "stat1 + stat2" should search and sum the stats""" - sum = float(0) - direct_keys = stat_string.split(" + ") - for direct_key in direct_keys: - print(direct_key) - sum += mqcData[direct_key] - return float(sum) - - -## Dataframe and Series specific helper functions -def nonNull(df: pd.DataFrame) -> bool: - # negation since it checks if any are null - return ~df.isnull().any(axis=None) - - -def nonNegative(df: pd.DataFrame) -> bool: - """This ignores null values, use nonNull to validate that condition""" - return ((df >= 0) | (df.isnull())).all(axis=None) - - -def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool: - """This ignores null values, use nonNull to validate that condition""" - return ((df.isin(allowed_values)) | (df.isnull())).all(axis=None) - - -def check_forward_and_reverse_reads_counts_match( - sample: Sample, reads_key_1: str, reads_key_2: str -) -> FlagEntry: - # data specific preprocess - count_fwd_reads = float( - sample.compile_multiqc_data([reads_key_1])["general_stats"]["FastQC"][ - "total_sequences" - ] - ) - count_rev_reads = float( - sample.compile_multiqc_data([reads_key_2])["general_stats"]["FastQC"][ - "total_sequences" - ] - ) - - # check logic - if count_fwd_reads == count_rev_reads: - code = FlagCode.GREEN - message = ( - f"Forward and reverse read counts match at " - f"{int(count_rev_reads)} sequences " - ) - else: - code = FlagCode.HALT - message = ( - f"Forward and reverse read counts do not " - f"match: forward_Count:{int(count_fwd_reads)}, " - f"reverse_Count:{int(count_rev_reads)}" - ) - - return {"code": code, "message": message} - - -def check_file_exists(file: Path) -> FlagEntry: - # check logic - 
if file.is_file(): - code = FlagCode.GREEN - message = f"File exists: {file.name} " - else: - code = FlagCode.HALT - message = f"Missing file: {file.name} expected at {str(file)} " - - return {"code": code, "message": message} - - -def check_fastqgz_file_contents(file: Path, count_lines_to_check: int) -> FlagEntry: - """Check fastqgz by: - 1. Decompressing as a stream of lines. - 2. Affirming expected headers (every 4th line) look correct. - - :param file: Input fastqGZ file path - :type file: Path - :param count_lines_to_check: Maximum number of lines to check. Setting this to a negative value will remove the limit - :type count_lines_to_check: int - :return: A required fields-only flag entry dictionary - :rtype: FlagEntry - """ - - lines_with_issues: list[int] = list() - - # check logic - # truncated files raise EOFError - # catch this as HALT3 - try: - with gzip.open(file, "rb") as f: - for i, byte_line in enumerate(f): - # checks if lines counted equals the limit input - if i + 1 == count_lines_to_check: - log.debug( - f"Reached {count_lines_to_check} lines, ending line check" - ) - break - - line = byte_line.decode() - # every fourth line should be an identifier - expected_identifier_line = i % 4 == 0 - # check if line is actually an identifier line - if expected_identifier_line and line[0] != "@": - lines_with_issues.append(i + 1) - # update every 2,000,000 reads - if i % 2_000_000 == 0: - log.debug(f"Checked {i} lines for {file}") - pass - - if not len(lines_with_issues) == 0: - code = FlagCode.HALT - message = ( - f"Following decompressed fastqGZ lines have issues: {lines_with_issues}" - ) - else: - code = FlagCode.GREEN - message = f"First {count_lines_to_check} lines checked found no issues. This means headers lines were identifiable and no decompression errors occured." - except (EOFError, gzip.BadGzipFile): - code = FlagCode.HALT - message = ( - f"Error during decompression, likely a compression or truncation issue." 
- ) - - return {"code": code, "message": message} - -def check_gzip_file_integrity(file: Path, gzip_bin: Path = Path("gzip")) -> FlagEntry: - """ Check gzip file integrity using 'gzip -t' as per https://www.gnu.org/software/gzip/manual/gzip.html """ - output = subprocess.run( - [str(gzip_bin), "-t", str(file)], capture_output=True - ) - stdout_string = output.stdout.decode() - if stdout_string == "": - code = FlagCode.GREEN - message = f"Gzip integrity test raised no issues" - else: - code = FlagCode.HALT - message = ( - f"Gzip integrity test failed on this file with output: {stdout_string}" - ) - return {"code": code, "message": message} - -def check_bam_file_integrity( - file: Path, samtools_bin: Path = Path("samtools") -) -> FlagEntry: - """Uses http://www.htslib.org/doc/samtools-quickcheck.html""" - # data specific preprocess - - # check logic - output = subprocess.run( - [str(samtools_bin), "quickcheck", "-v", str(file)], capture_output=True - ) - stdout_string = output.stdout.decode() - if stdout_string == "": - code = FlagCode.GREEN - message = f"Samtools quickcheck raised no issues" - else: - code = FlagCode.HALT - message = ( - f"Samtools quickcheck failed on this file with output: {stdout_string}" - ) - return {"code": code, "message": message} - - -def check_thresholds( - multiqc_inputs: list[Path], mqc_key: str, stat_string: str, thresholds: list[dict] -) -> FlagEntry: - # data specific preprocess - data = multiqc_run_to_dataframes(multiqc_inputs) - value = stat_string_to_value(stat_string, data["general_stats"][mqc_key]) - - # check logic - # Assuming GREEN unless reassigned - code = FlagCode.GREEN - for threshold in thresholds: - match threshold["type"]: - case "lower": - if value < threshold["value"]: - code = ( - FlagCode[threshold["code"]] - if code < FlagCode[threshold["code"]] - else code - ) - - if code == FlagCode.GREEN: - message = f"Value: ({value}) did not breech any configured thresholds" - else: - message = f"Value: ({value}) breeched configured thresholds" - return {"code": code, "message": message} - - -def check_metadata_attributes_exist( - dataset: Dataset, expected_attrs: list[str] -) -> FlagEntry: - missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata)) - - # check if any missing_metadata_fields are present - # check logic - if not missing_metadata_fields: - code = FlagCode.GREEN - message = f"All expected metadata keys found: Expected {expected_attrs}, Found {set(dataset.metadata)}" - else: - code = FlagCode.HALT - message = f"Missing dataset metadata (source from Runsheet): {missing_metadata_fields}" - return {"code": code, "message": message} - - -def check_for_outliers( - dataset: Dataset, - data_asset_keys: list[str], - mqc_module: str, - mqc_plot: str, - mqc_keys: list[str], - thresholds: list[dict], -) -> FlagEntryWithOutliers: - # assume code is GREEN until outliers detected - code = FlagCode.GREEN - # dataframe extraction - compiled_mqc_data = dataset.compile_multiqc_data(data_asset_keys=data_asset_keys) - - if mqc_plot == "general_stats": - df = compiled_mqc_data["general_stats"][mqc_module] - else: - df = compiled_mqc_data["plots"][mqc_module][mqc_plot] - - def default_to_regular(d): - if isinstance(d, defaultdict): - d = {k: default_to_regular(v) for k, v in d.items()} - return d - - # track for outliers - outliers: dict[str, dict[str, dict[str, str]]] = defaultdict( - lambda: defaultdict(dict) - ) - - # override if mqc_keys is a special value - if mqc_keys == ["_ALL"]: - mqc_keys = df.columns - - for mqc_key in mqc_keys: - 
for threshold in thresholds: - if threshold["middle_fcn"] == "mean": - middle = df[mqc_key].mean() - elif threshold["middle_fcn"] == "median": - middle = df[mqc_key].median() - else: - raise ValueError( - f"Cannot compute middle from supplied middle_fcn name: {threshold['middle_fcn']}. Must supply either 'median' or 'mean'" - ) - - # bail if standard deviation == 0 - # e.g. if all values are identical (and thus has no outliers) - if df[mqc_key].std() == 0: - continue - - # compute difference - df_diffs = df[mqc_key] - middle - - # compute as number of standard deviations - df_diffs_in_std = df_diffs / df[mqc_key].std() - - # add to outlier tracker if over the threshold - for key, value in df_diffs_in_std.iteritems(): - # if an outlier - if abs(value) > threshold["stdev_threshold"]: - # track it - outliers[key][mqc_module][mqc_key] = value - # elevate code if current code is lower severity - if code < FlagCode[threshold["code"]]: - code = FlagCode[threshold["code"]] - - # convert defaultdict to regular for all reporting - outliers = default_to_regular(outliers) - # check logic - if code == FlagCode.GREEN: - message = f"No outliers found for {mqc_keys} in {mqc_plot} part of {mqc_module} multiQC module" - else: - message = ( - f"Outliers found in {mqc_module} multiQC module as follows: {outliers}" - ) - return {"code": code, "message": message, "outliers": outliers} - - -def _check_expected_files_exist( - input_dir: Path, expected_extensions: list[str], parent_dir_is_filename: bool = True -): - if parent_dir_is_filename: - fname = input_dir.name - expected_files = [input_dir / f"{fname}{ext}" for ext in expected_extensions] - missing_files = list() - for expected_file in expected_files: - if not expected_file.is_file(): - missing_files.append(str(expected_file)) - - expected_file_str = [str(f) for f in expected_files] - return missing_files, expected_file_str - - -def check_genebody_coverage_output(input_dir: Path): - EXPECTED_EXTENSIONS = [ - ".geneBodyCoverage.r", - ".geneBodyCoverage.txt", - ".geneBodyCoverage.curves.pdf", - ] - - missing_files, expected_file_str = _check_expected_files_exist( - input_dir, expected_extensions=EXPECTED_EXTENSIONS - ) - - if not missing_files: - code = FlagCode.GREEN - message = f"All output from geneBody coverage found: {expected_file_str}" - else: - code = FlagCode.HALT - message = f"Missing output from geneBody coverage: {missing_files}. Expected: {expected_file_str}" - return {"code": code, "message": message} - - -def check_inner_distance_output(input_dir: Path): - EXPECTED_EXTENSIONS = [ - ".inner_distance_plot.r", - ".inner_distance_freq.txt", - ".inner_distance.txt", - ".inner_distance_plot.pdf", - ] - - missing_files, expected_file_str = _check_expected_files_exist( - input_dir, expected_extensions=EXPECTED_EXTENSIONS - ) - - if not missing_files: - code = FlagCode.GREEN - message = f"All output from inner distance found: {expected_file_str}" - else: - code = FlagCode.HALT - message = f"Missing output from inner distance: {missing_files}. 
Expected: {expected_file_str}" - return {"code": code, "message": message} - - -def check_strandedness_assessable_from_infer_experiment( - dataset: Dataset, - stranded_assessment_range: dict[str, float], - unstranded_assessment_range: dict[str, float], - valid_dominant_strandedness_assessments: list[str], -) -> FlagEntry: - # data specific preprocess - def get_median_strandedness( - dataset: Dataset, - ) -> dict[str, float]: - - df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ - "Infer experiment" - ].fillna( - 0 - ) # Nan is a zero for this MultiQC table - - median_strandedness = df.median().to_dict() - - return median_strandedness - - median_strandedness = get_median_strandedness(dataset) - - # check if dominant assessment is valid - strand_assessment: str = max( - median_strandedness, key=lambda k: median_strandedness[k] - ) - - # flag based on thresholds - assessment_value: float = median_strandedness[strand_assessment] - - is_stranded: bool = ( - stranded_assessment_range["max"] - > assessment_value - > stranded_assessment_range["min"] - ) - is_unstranded: bool = ( - unstranded_assessment_range["max"] - > assessment_value - > unstranded_assessment_range["min"] - ) - - def determine_samples_outside_range( - dataset: Dataset, min: float, max: float - ) -> list[str]: - df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ - "Infer experiment" - ].fillna( - 0 - ) # Nan is a zero for this MultiQC table - - return df.index[df[strand_assessment].between(min, max) == False].to_list() - - # Catalog and flag any samples outside of range - # flags based on samples that are out of the assessment range - samples_outside_range: list[str] - if is_stranded: - samples_outside_range = determine_samples_outside_range( - dataset, - stranded_assessment_range["min"], - stranded_assessment_range["max"], - ) - elif is_unstranded: - samples_outside_range = determine_samples_outside_range( - dataset, - unstranded_assessment_range["min"], - unstranded_assessment_range["max"], - ) - else: # this means that the strandedness is ambiguous - samples_outside_range = list() - - # check logic - if strand_assessment not in valid_dominant_strandedness_assessments: - code = FlagCode.HALT - message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is invalid for processing. 
Valid assessments: {valid_dominant_strandedness_assessments}" - elif not samples_outside_range and any([is_stranded, is_unstranded]): - code = FlagCode.GREEN - message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with no individual samples outside the assessment range" - elif samples_outside_range and any([is_stranded, is_unstranded]): - code = FlagCode.RED - message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with samples outside the assessment range: {samples_outside_range}" - else: - code = FlagCode.HALT - message = ( - f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is ambiguous due to being inside range " - f"({stranded_assessment_range['min']}-{unstranded_assessment_range['max']})" - ) - - return {"code": code, "message": message} - - -def check_rsem_counts_and_unnormalized_tables_parity( - rsem_table_path: Path, deseq2_table_path: Path -) -> FlagEntry: - # data specific preprocess - df_rsem = pd.read_csv(rsem_table_path) - df_deseq2 = pd.read_csv(deseq2_table_path) - - # return halt flag if column labels not conserved - if not set(df_deseq2.columns) == set(df_rsem.columns): - unique_to_deseq2 = set(df_deseq2.columns) - set(df_rsem.columns) - unique_to_rsem = set(df_rsem.columns) - set(df_deseq2.columns) - return { - "code": FlagCode.HALT, - "message": f"Columns do not match: unique to rsem: {unique_to_rsem}. unique to deseq2: {unique_to_deseq2}.", - } - - # rearrange columns to the same order - df_deseq2 = df_deseq2[df_rsem.columns] - - # check logic - if df_deseq2.equals(df_rsem): - code = FlagCode.GREEN - message = f"Tables of unnormalized counts match." - else: - code = FlagCode.HALT - message = ( - f"Tables of unnormalized counts have same columns but values do not match." - ) - return {"code": code, "message": message} - - -def check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables( - unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] -) -> FlagEntry: - STAR_COUNT_MODES = ["unstranded", "sense", "antisense"] - # data specific preprocess - df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) - - # based on which column matches the first entry - # all columns must match with the same strand column - strand_assessment: str = None # type: ignore - samples_with_issues: dict[str, list[str]] = { - "Not in aggregate table": list(), - "Sample counts mismatch": list(), - } - for sample, path in samplewise_tables.items(): - # check if samples exist as a column - if sample not in df_agg: - samples_with_issues["Not in aggregate table"].append(sample) - break - - # load - df_samp = pd.read_csv( - path, sep="\t", names=STAR_COUNT_MODES, index_col=0 - ).filter( - regex="^(?!N_.*).*", axis="rows" - ) # filter out N_* entries - - # check if the values match for any of the count modes - # unstranded, sense, antisense - # for remaining samples, only check the match for the first count mode - # TODO: Fix rare false postive related to zero counts, in those cases the strand_assessment can be prematurely determined which causes other samples to be compared with an inappropriate assessment - for count_mode in STAR_COUNT_MODES: - # make sure to sort indicies - if df_agg[sample].sort_index().equals(df_samp[count_mode].sort_index()): - # assign strand assessment if first sample - if strand_assessment is None: - strand_assessment = count_mode - - if strand_assessment == count_mode: - # no issues found (i.e. 
counts match with a consistent count mode column), break out - break - else: # no break - samples_with_issues["Sample counts mismatch"].append(sample) - - # check logic - if not any([issue_type for issue_type in samples_with_issues.values()]): - code = FlagCode.GREEN - message = ( - f"All samples accounted for and with matching counts " - f"between samplewise and aggregate table using strand assessment: '{strand_assessment}'" - ) - else: - code = FlagCode.HALT - message = f"Identified issues: {samples_with_issues}" - return {"code": code, "message": message} - - -def check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables( - unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] -) -> FlagEntry: - # data specific preprocess - df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) - - # based on which column matches the first entry - # TODO: LOW PRIORITY, fix this typehint - samples_with_issues: dict[str, Union[list[str], list[tuple[str, list[str]]]]] = { - "Not in aggregate table": list(), # type: ignore - "Sample counts mismatch": list(), # type: ignore - } - for sample, path in samplewise_tables.items(): - # check if samples exist as a column - if sample not in df_agg: - samples_with_issues["Not in aggregate table"].append(sample) - break - - # load - df_samp = pd.read_csv(path, sep="\t", index_col=0) - - # check if values match - if geneID_with_mismatched_counts := ( - list(df_agg.loc[df_agg[sample] != df_samp["expected_count"]].index) - ): - samples_with_issues["Sample counts mismatch"].append( - (sample, geneID_with_mismatched_counts) - ) - - # check logic - if not any([issue_type for issue_type in samples_with_issues.values()]): - code = FlagCode.GREEN - message = f"All samples accounted for and with matching counts between samplewise and aggregate table" - else: - code = FlagCode.HALT - message = f"Identified issues: {samples_with_issues}" - return {"code": code, "message": message} - - -def check_sample_table_against_runsheet( - runsheet: Path, sampleTable: Path, all_samples_required: bool -) -> FlagEntry: - """Check the sample table includes all samples as denoted in the runsheet. - - Args: - runsheet (Path): csv file used for processing, the index denotes all samples - sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) - all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay. 
- - Returns: - FlagEntry: A check result - """ - # data specific preprocess - df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index() - df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() - - extra_samples: dict[str, set[str]] = { - "unique_to_runsheet": set(df_rs.index) - set(df_sample.index), - "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index), - } - - # check logic - if any( - [ - (extra_samples["unique_to_runsheet"] and all_samples_required), - (extra_samples["unique_to_sampleTable"]), - ] - ): - code = FlagCode.HALT - message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}" - else: - code = FlagCode.GREEN - message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})" - return {"code": code, "message": message} - - -class GroupFormatting(enum.Enum): - r_make_names = enum.auto() - ampersand_join = enum.auto() - - -def utils_runsheet_to_expected_groups( - runsheet: Path, - formatting: GroupFormatting = GroupFormatting.ampersand_join, - limit_to_samples: list = None, - map_to_lists: bool = False, -) -> Union[dict[str, str], dict[str, list[str]]]: - df_rs = ( - pd.read_csv(runsheet, index_col="Sample Name", dtype=str) - .filter(regex="^Factor Value\[.*\]") - .sort_index() - ) # using only Factor Value columns - - if limit_to_samples: - df_rs = df_rs.filter(items=limit_to_samples, axis="rows") - - match formatting: - case GroupFormatting.r_make_names: - expected_conditions_based_on_runsheet = ( - df_rs.apply(lambda x: "...".join(x), axis="columns") - .apply(r_style_make_names) # join factors with '...' - .to_dict() - ) # reformat entire group in the R style - case GroupFormatting.ampersand_join: - expected_conditions_based_on_runsheet = df_rs.apply( - lambda x: f"({' & '.join(x)})", axis="columns" - ).to_dict() - case _: - raise ValueError( - f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}" - ) - - # convert from {sample: group} dict - # to {group: [samples]} dict - if map_to_lists: - unique_groups = set(expected_conditions_based_on_runsheet.values()) - reformatted_dict: dict[str, list[str]] = dict() - for query_group in unique_groups: - reformatted_dict[query_group] = [ - sample - for sample, group in expected_conditions_based_on_runsheet.items() - if group == query_group - ] - expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict - - return expected_conditions_based_on_runsheet - - -def check_sample_table_for_correct_group_assignments( - runsheet: Path, sampleTable: Path -) -> FlagEntry: - """Check the sample table is assigned to the correct experimental group. - An experimental group is defined by the Factor Value columns found in the runsheet. 
- - Args: - runsheet (Path): csv file used for processing, includes metadata used for experimental group designation - sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) - - Returns: - FlagEntry: A check result - """ - df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() - # data specific preprocess - df_rs = ( - pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric - .filter(regex="^Factor Value\[.*\]") - .loc[df_sample.index] # ensure only sampleTable groups are checked - .sort_index() - ) # using only Factor Value columns - - # TODO: refactor with utils_runsheet_to_expected_groups - expected_conditions_based_on_runsheet = df_rs.apply( - lambda x: "...".join(x), axis="columns" - ).apply( # join factors with '...' - r_style_make_names - ) # reformat entire group in the R style - - mismatched_rows = expected_conditions_based_on_runsheet != df_sample["condition"] - - # check logic - if not any(mismatched_rows): - code = FlagCode.GREEN - message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}" - else: - code = FlagCode.HALT - mismatch_description = ( - df_sample[mismatched_rows]["condition"] - + " <--SAMPLETABLE : RUNSHEET--> " - + expected_conditions_based_on_runsheet[mismatched_rows] - ).to_dict() - message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}" - return {"code": code, "message": message} - - -def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry: - # data specific preprocess - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - df_contrasts = pd.read_csv(contrasts_table, index_col=0) - - # check logic - differences = set(expected_comparisons).symmetric_difference( - set(df_contrasts.columns) - ) - if not differences: - code = FlagCode.GREEN - message = f"Contrasts header includes expected comparisons as determined runsheet Factor Value Columns: {set(expected_comparisons)}" - else: - code = FlagCode.HALT - message = f"Contrasts header does not match expected comparisons as determined runsheet Factor Value Columns: {differences}" - return {"code": code, "message": message} - - -def check_contrasts_table_rows(contrasts_table: Path, **_) -> FlagEntry: - # data specific preprocess - df_contrasts = pd.read_csv(contrasts_table, index_col=0) - - def _get_groups_from_comparisions(s: str) -> set[str]: - """Converts '(G1)v(G2)' - into G1...G2 where G1 and G2 are renamed as per the r make names function - - Args: - s (str): Input that fits this format: '(G1)v(G2)' - - Returns: - str: Reformatted string - """ - g1, g2 = s.split(")v(") - # remove parens and reformat with r make names style - g1 = r_style_make_names(g1[1:].replace(" & ", "...")) - g2 = r_style_make_names(g2[:-1].replace(" & ", "...")) - return {g1, g2} - - bad_columns: dict[str, dict[str, set]] = dict() - for (col_name, col_series) in df_contrasts.iteritems(): - expected_values = _get_groups_from_comparisions(col_name) - if not expected_values == set(col_series): - bad_columns[col_name] = { - "expected": expected_values, - "actual": set(col_series), - } - - # check logic - if not bad_columns: - code = FlagCode.GREEN - message = f"Contrasts column and rows match 
 expected formatting" - else: - code = FlagCode.HALT - message = f"Contrasts columns {bad_columns} have unexpected values" - return {"code": code, "message": message} - - -def check_dge_table_annotation_columns_exist( - dge_table: Path, organism: str, **_ -) -> FlagEntry: - REQUIRED_ANNOTATION_KEYS = { - "SYMBOL", - "GENENAME", - "REFSEQ", - "ENTREZID", - "STRING_id", - "GOSLIM_IDS", - } - MASTER_ANNOTATION_KEY = {"_DEFAULT": "ENSEMBL", "Arabidopsis thaliana": "TAIR"} - - df_dge = pd.read_csv(dge_table) - - required_columns = REQUIRED_ANNOTATION_KEYS.union( - {MASTER_ANNOTATION_KEY.get(organism, MASTER_ANNOTATION_KEY["_DEFAULT"])} - ) - - missing_columns = required_columns - set(df_dge.columns) - # check logic - if not missing_columns: - code = FlagCode.GREEN - message = f"Found all required annotation columns: {required_columns}" - else: - code = FlagCode.HALT - message = ( - f"Missing the following required annotation columns: {missing_columns}" - ) - return {"code": code, "message": message} - - -def check_dge_table_sample_columns_exist( - dge_table: Path, samples: set[str], **_ -) -> FlagEntry: - # data specific preprocess - df_dge = pd.read_csv(dge_table) - - missing_sample_columns = samples - set(df_dge.columns) - - # check logic - if not missing_sample_columns: - code = FlagCode.GREEN - message = f"All samplewise columns present" - else: - code = FlagCode.HALT - message = f"Missing these sample count columns: {missing_sample_columns}" - return {"code": code, "message": message} - - -def check_dge_table_sample_columns_constraints( - dge_table: Path, samples: set[str], **_ -) -> FlagEntry: - MINIMUM_COUNT = 0 - # data specific preprocess - df_dge = pd.read_csv(dge_table)[list(samples)] # pandas does not accept set indexers - - column_meets_constraints = df_dge.apply( - lambda col: all(col >= MINIMUM_COUNT), axis="rows" - ) - - # check logic - constraint_description = f"All counts are greater than or equal to {MINIMUM_COUNT}" - if all(column_meets_constraints): - code = FlagCode.GREEN - message = ( - f"All values in columns: {samples} met constraint: {constraint_description}" - ) - else: - code = FlagCode.HALT - message = ( - f"These columns {list(column_meets_constraints.index[~column_meets_constraints])} " - f"fail the constraint: {constraint_description}." - ) - return {"code": code, "message": message} - - -def check_dge_table_group_columns_exist( - dge_table: Path, runsheet: Path, **_ -) -> FlagEntry: - # data specific preprocess - GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] - expected_groups = utils_runsheet_to_expected_groups(runsheet) - expected_columns = { - "".join(comb) - for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) - } - df_dge_columns = set(pd.read_csv(dge_table).columns) - missing_cols = expected_columns - df_dge_columns - - # check logic - if not missing_cols: - code = FlagCode.GREEN - message = f"All group summary statistic columns (Prefixes: {GROUP_PREFIXES}) present.
 {sorted(list(expected_columns))}" - else: - code = FlagCode.HALT - message = f"Missing these group summary statistic columns (Prefixes: {GROUP_PREFIXES}): {sorted(list(missing_cols))}" - return {"code": code, "message": message} - - -def check_dge_table_group_columns_constraints( - dge_table: Path, runsheet: Path, samples: set[str], **_ -) -> FlagEntry: - FLOAT_TOLERANCE = ( - 0.001 # Percent allowed difference due to float precision differences - ) - # data specific preprocess - GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] - expected_groups = utils_runsheet_to_expected_groups(runsheet) - query_columns = { - "".join(comb) - for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) - } - - expected_group_lists = utils_runsheet_to_expected_groups( - runsheet, map_to_lists=True, limit_to_samples=samples - ) - df_dge = pd.read_csv(dge_table) - - # issue trackers - issues: dict[str, list[str]] = { - f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [], - f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [], - } - - group: str - sample_set: list[str] - for group, sample_set in expected_group_lists.items(): - abs_percent_differences = abs( - (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns")) - / df_dge[sample_set].mean(axis="columns") - * 100 - ) - if any(abs_percent_differences > FLOAT_TOLERANCE): - issues[ - f"mean computation deviates by more than {FLOAT_TOLERANCE} percent" - ].append(group) - - abs_percent_differences = abs( - (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns")) - / df_dge[sample_set].mean(axis="columns") - * 100 - ) - if any(abs_percent_differences > FLOAT_TOLERANCE): - issues[ - f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent" - ].append(group) - - # check logic - constraint_description = f"Group means and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accommodate minor float-related differences)" - if not any([issue_type for issue_type in issues.values()]): - code = FlagCode.GREEN - message = f"All values in columns: {query_columns} met constraint: {constraint_description}" - else: - code = FlagCode.HALT - message = ( - f"Issues found {issues} that " - f"fail the constraint: {constraint_description}." - ) - return {"code": code, "message": message} - - -def check_dge_table_comparison_statistical_columns_exist( - dge_table: Path, runsheet: Path, **_ -) -> FlagEntry: - # data specific preprocess - COMPARISON_PREFIXES = ["Log2fc_", "Stat_", "P.value_", "Adj.p.value_"] - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - expected_columns = { - "".join(comb) - for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons) - } - df_dge_columns = set(pd.read_csv(dge_table).columns) - missing_cols = expected_columns - df_dge_columns - - # check logic - if not missing_cols: - code = FlagCode.GREEN - message = f"All comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present.
 {sorted(list(expected_columns))}" - else: - code = FlagCode.HALT - message = f"Missing these comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}" - return {"code": code, "message": message} - - -def utils_common_constraints_on_dataframe( - df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...] -) -> dict: - - issues: dict[str, list[str]] = { - "Failed non null constraint": list(), - "Failed non negative constraint": list(), - "Failed allowed values constraint": list(), - } - - for (col_set, col_constraints) in constraints: - # this will avoid overriding the original constraints dictionary - # which is likely used in the check message - col_constraints = col_constraints.copy() - - # pop the requested constraints once, before iterating columns; popping - # inside the column loop would exhaust the keys after the first column - check_nonNull = col_constraints.pop("nonNull", False) - check_nonNegative = col_constraints.pop("nonNegative", False) - allowedValues = col_constraints.pop("allowedValues", False) - - # raise exception if there are unhandled constraint keys - if col_constraints: - raise ValueError(f"Unhandled constraint types: {col_constraints}") - - # limit to only columns of interest - query_df = df[list(col_set)] # pandas does not accept set indexers - for (colname, colseries) in query_df.items(): # items() replaces the removed iteritems() - # check non null constraint - if check_nonNull and not nonNull(colseries): - issues["Failed non null constraint"].append(colname) - # check non negative constraint - if check_nonNegative and not nonNegative(colseries): - issues["Failed non negative constraint"].append(colname) - # check allowed values constraint - if allowedValues and not onlyAllowedValues(colseries, allowedValues): - issues["Failed allowed values constraint"].append(colname) - - return issues - - -def check_dge_table_group_statistical_columns_constraints( - dge_table: Path, runsheet: Path, **_ -) -> FlagEntry: - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - - resolved_constraints = ( - ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}), - ({f"Stat_{comp}" for comp in expected_comparisons}, {"nonNull": True}), - # can be removed from analysis before p-value and adj-p-value assessed - # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na - ( - {f"P.value_{comp}" for comp in expected_comparisons}, - {"nonNegative": True, "nonNull": False}, - ), - ( - {f"Adj.p.value_{comp}" for comp in expected_comparisons}, - {"nonNegative": True, "nonNull": False}, - ), - ) - - df_dge = pd.read_csv(dge_table) - - # issue trackers - # here: {prefix+constraint: [failed_columns]} - issues: dict[str, list[str]] = dict() - - issues = utils_common_constraints_on_dataframe(df_dge, resolved_constraints) - - # check logic - if not any([issue_type for issue_type in issues.values()]): - code = FlagCode.GREEN - message = f"All values in columns met constraint: {resolved_constraints}" - else: - code = FlagCode.HALT - message = ( - f"Issues found {issues} that " f"fail the constraint: {resolved_constraints}."
- ) - return {"code": code, "message": message} - - -def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry: - # data specific preprocess - fixed_stats_columns = { - "All.mean": {"nonNull": True, "nonNegative": True}, - "All.stdev": {"nonNull": True, "nonNegative": True}, - "LRT.p.value": {"nonNull": False, "nonNegative": True}, - } - expected_columns = set(fixed_stats_columns) - df_dge_columns = set(pd.read_csv(dge_table).columns) - missing_cols = expected_columns - df_dge_columns - - # check logic - if not missing_cols: - code = FlagCode.GREEN - message = f"All dataset summary stat columns present. {sorted(list(expected_columns))}" - else: - code = FlagCode.HALT - message = ( - f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}" - ) - return {"code": code, "message": message} - - -def check_dge_table_fixed_statistical_columns_constraints( - dge_table: Path, **_ -) -> FlagEntry: - # data specific preprocess - fixed_stats_columns = ( - ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}), - ({"LRT.p.value"}, {"nonNull": False, "nonNegative": True}), - ) - - df_dge = pd.read_csv(dge_table) - - # issue trackers - # here: {prefix+constraint: [failed_columns]} - issues: dict[str, list[str]] = dict() - - issues = utils_common_constraints_on_dataframe(df_dge, fixed_stats_columns) - - # check logic - if not any([issue_type for issue_type in issues.values()]): - code = FlagCode.GREEN - message = f"All values in columns met constraint: {fixed_stats_columns}" - else: - code = FlagCode.HALT - message = ( - f"Issues found {issues} that " f"fail the constraint: {fixed_stats_columns}." - ) - return {"code": code, "message": message} - - -def check_dge_table_log2fc_within_reason( - dge_table: Path, runsheet: Path, **_ -) -> FlagEntry: - LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 10 # Percent - LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT = 50 # Percent - - # TODO: discuss, this might even be fine to lower quite a bit - # e.g. THRESHOLD_PERCENT_MEANS_DIFFERENCE = 1 # percent - THRESHOLD_PERCENT_MEANS_DIFFERENCE = 50 # percent - - # data specific preprocess - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - df_dge = pd.read_csv(dge_table) - - # Track error messages - err_msg_yellow = "" - all_suspect_signs: dict[int, dict[str, float]] = dict() - for comparision in expected_comparisons: - query_column = f"Log2fc_{comparision}" - group1_mean_col = ( - "Group.Mean_" + comparision.split(")v(")[0] + ")" - ) # Uses parens and adds them back to prevent slicing on 'v' within factor names - group2_mean_col = "Group.Mean_" + "(" + comparision.split(")v(")[1] - computed_log2fc = (df_dge[group1_mean_col] / df_dge[group2_mean_col]).apply( - math.log, args=[2] - ) - abs_percent_difference = abs( - ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100 - ) - percent_within_tolerance = ( - mean( - abs_percent_difference - < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD - ) - * 100 - ) - # flag if not enough within tolerance - if percent_within_tolerance < LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT: - err_msg_yellow += ( - f"For comparison: '{comparision}' only {percent_within_tolerance:.2f} % of genes have absolute percent differences " - f"(between log2fc direct computation and DESeq2's approach) " - f"less than {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD} % which does not meet the
minimum percentage " - f"({LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT} %) of genes required. " - f"This may indicate misassigned or misaligned columns. " - ) - - #### sign based checks - - # filter to genes whose group means differ by more than the threshold - abs_percent_differences = ( - abs( - (df_dge[group1_mean_col] - df_dge[group2_mean_col]) - / df_dge[group2_mean_col] - ) - * 100 - ) - df_dge_filtered = df_dge.loc[ - abs_percent_differences > THRESHOLD_PERCENT_MEANS_DIFFERENCE - ].copy() # copy to avoid SettingWithCopyWarning when adding helper columns below - - df_dge_filtered["positive_sign_expected"] = ( - df_dge[group1_mean_col] - df_dge[group2_mean_col] > 0 - ) - - df_dge_filtered["matches_expected_sign"] = ( - (df_dge[query_column] > 0) & df_dge_filtered["positive_sign_expected"] - ) | ((df_dge[query_column] < 0) & ~df_dge_filtered["positive_sign_expected"]) - - all_suspect_signs = all_suspect_signs | df_dge_filtered.loc[ - df_dge_filtered["matches_expected_sign"] == False - ][[group1_mean_col, group2_mean_col, query_column]].to_dict("index") - - if all_suspect_signs: - code = FlagCode.RED - message = f"At least one log2fc sign is suspect, the following log2fc compared to actual group means: {all_suspect_signs}" - elif err_msg_yellow: - code = FlagCode.YELLOW - message = f"Not all log2fc within reason: {err_msg_yellow}" - else: - code = FlagCode.GREEN - message = ( - f"All log2fc within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% " - f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than " - f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. Additionally, for comparisons with mean differences " - f"greater than {THRESHOLD_PERCENT_MEANS_DIFFERENCE}% all have reasonable log2fc signs" - ) - - return {"code": code, "message": message} - - -def check_viz_table_columns_exist(dge_table: Path, runsheet: Path, **_) -> FlagEntry: - # data specific preprocess - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - viz_pairwise_columns_prefixes = ( - ( - {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, - {"nonNull": False}, - ), - ( - {f"Sig.1_{comp}" for comp in expected_comparisons}, - {"allowedValues": [False, True], "nonNull": False}, - ), - ( - {f"Sig.05_{comp}" for comp in expected_comparisons}, - {"allowedValues": [False, True], "nonNull": False}, - ), - ( - {f"Log2_P.value_{comp}" for comp in expected_comparisons}, - {"nonNegative": False, "nonNull": False}, - ), - ( - {f"Updown_{comp}" for comp in expected_comparisons}, - {"allowedValues": [1, 0, -1], "nonNull": True}, - ), - ) - - expected_columns = set( - itertools.chain(*[c1 for c1, _ in viz_pairwise_columns_prefixes]) - ) - df_dge_columns = set(pd.read_csv(dge_table).columns) - missing_cols = expected_columns - df_dge_columns - - # check logic - if not missing_cols: - code = FlagCode.GREEN - message = f"All viz specific comparison columns present.
{sorted(list(expected_columns))}" - else: - code = FlagCode.HALT - message = f"Missing these viz specific comparison columns: {sorted(list(missing_cols))}" - return {"code": code, "message": message} - - -def check_viz_table_columns_constraints( - dge_table: Path, runsheet: Path, **_ -) -> FlagEntry: - # data specific preprocess - expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) - expected_comparisons = [ - "v".join(paired_groups) - for paired_groups in itertools.permutations(expected_groups, 2) - ] - viz_pairwise_columns_constraints = ( - ( - {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, - {"nonNull": False}, - ), - ( - {f"Sig.1_{comp}" for comp in expected_comparisons}, - {"allowedValues": [False, True], "nonNull": False}, - ), - ( - {f"Sig.05_{comp}" for comp in expected_comparisons}, - {"allowedValues": [False, True], "nonNull": False}, - ), - ( - {f"Log2_P.value_{comp}" for comp in expected_comparisons}, - {"nonNegative": False, "nonNull": False}, - ), - ( - {f"Updown_{comp}" for comp in expected_comparisons}, - {"allowedValues": [1, 0, -1], "nonNull": True}, - ), - ) - - df_viz = pd.read_csv(dge_table) - - # issue trackers - # here: {prefix+constraint: [failed_columns]} - issues: dict[str, list[str]] = dict() - - issues = utils_common_constraints_on_dataframe( - df_viz, viz_pairwise_columns_constraints - ) - - # check logic - if not any([issue_type for issue_type in issues.values()]): - code = FlagCode.GREEN - message = ( - f"All values in columns met constraint: {viz_pairwise_columns_constraints}" - ) - else: - code = FlagCode.HALT - message = ( - f"Issues found {issues} that" - f"fail the contraint: {viz_pairwise_columns_constraints}." - ) - return {"code": code, "message": message} - - -def check_viz_pca_table_index_and_columns_exist( - pca_table: Path, samples: set[str] -) -> FlagEntry: - EXPECTED_VIS_PCA_COLUMNS = {"PC1", "PC2", "PC3"} - err_msg = "" - # data specific preprocess - df = pd.read_csv(pca_table, index_col=0) - - # check all samples included - if missing_samples := samples - set(df.index): - err_msg += f"Missing samples in index: {missing_samples}" - - # check all expected columns exist - if missing_cols := EXPECTED_VIS_PCA_COLUMNS - set(df.columns): - err_msg += f"Missing expected columns: {missing_cols}" - - if not err_msg: - code = FlagCode.GREEN - message = f"PCA Table has all the samples in the index and these columns exist: {EXPECTED_VIS_PCA_COLUMNS}" - else: - code = FlagCode.HALT - message = err_msg - - return {"code": code, "message": message} - - -def utils_formatting_list(l: list[str], spaces: int = 2) -> str: - """Reformats list to print friendly multi line string. 
- - Example: - Reformatting a list of samples:: - - l = ['groundControl_1','groundControl_2','spaceFlight_1','spaceFlight-2'] - print(f"Samples: \n{utils_formatting_list(l)}") - - Args: - l (list): A list of strings to format - spaces (int): Number of leading spaces per line - - Returns: - str: Print friendly multiline string - """ - leading_spaces = " " * spaces - return "\n".join([f"{leading_spaces}- {item}" for item in l]) - - -def utils_rsem_counts_table_to_dataframe( - counts_table: Path, describe: bool = True -) -> pd.DataFrame: - df = pd.read_csv(counts_table, index_col=0).rename_axis("geneID") - if describe: - print(f"Loaded rsem counts table:") - print(f" Samples: \n{utils_formatting_list(list(df.columns), spaces = 4)}") - print(f" Number of Genes: {len(df)}") - return df - - -def utils_get_asset(asset_name: str) -> Path: - [p] = (p for p in files("dp_tools") if p.name == asset_name) - return p.locate() - - -def check_ERCC_subgroup_representation(unnormalizedCountTable: Path, **_) -> FlagEntry: - """Check ERCC subgroup representation is robust. - Specifically, counts the dataset wide ERCC IDs then categorizes each subgroup - by the number of represented ERCC IDs in that subgroup. - Finally, generates a Flag result by comparison to thresholds. - - Args: - counts_table (Path): RSEM unnormalized counts table - - Returns: - FlagEntry: Result of the check. - """ - MINIMUM_GREEN = 21 - MINIMUM_YELLOW = 19 - MINIMUM_RED = 0 - MINIMUM_HALT = 0 - - # data specific preprocess - df_counts = utils_rsem_counts_table_to_dataframe(unnormalizedCountTable) - - ercc_file = utils_get_asset("cms_095046.txt") - df_ercc = pd.read_csv(ercc_file, sep="\t") - - # filter to only ercc genes - df_counts = df_counts.loc[df_counts.index.isin(df_ercc["ERCC ID"])] - - # filter to only genes with at least one count (i.e. 
ERCC genes represented in the dataset) - df_counts = df_counts.loc[df_counts.sum(axis="columns") > 0] - - # merge to ercc table data including subgroup - df_counts = df_counts.merge(df_ercc, left_index=True, right_on="ERCC ID") - - # generate subgroup counts - df_subgroup_counts = df_counts["subgroup"].value_counts().sort_index() - - green_key = f"green level subgroups: > {MINIMUM_GREEN} ERCC represented" - yellow_key = ( - f"yellow level subgroups: {MINIMUM_YELLOW}-{MINIMUM_GREEN} ERCC represented" - ) - red_key = f"red level subgroups: {MINIMUM_RED}-{MINIMUM_YELLOW} ERCC represented" - halt_key = f"halt level subgroups: < {MINIMUM_HALT} ERCC represented" - - # classify each representation count - representation_category: dict[str, dict[str, int]] = { - green_key: df_subgroup_counts.loc[df_subgroup_counts > MINIMUM_GREEN].to_dict(), - yellow_key: df_subgroup_counts.loc[ - df_subgroup_counts.between(MINIMUM_YELLOW, MINIMUM_GREEN) - ].to_dict(), - red_key: df_subgroup_counts.loc[ - df_subgroup_counts.between( - MINIMUM_RED, MINIMUM_YELLOW, inclusive="left" - ) - ].to_dict(), - halt_key: df_subgroup_counts.loc[df_subgroup_counts < MINIMUM_HALT].to_dict(), - } - - # check logic - if representation_category[halt_key]: - code = FlagCode.HALT - message = ( - f"Dataset wide ERCC representation is not robust: {representation_category}" - ) - elif representation_category[red_key]: - code = FlagCode.RED - message = ( - f"Dataset wide ERCC representation is not robust: {representation_category}" - ) - elif representation_category[yellow_key]: - code = FlagCode.YELLOW - message = ( - f"Dataset wide ERCC representation is not robust: {representation_category}" - ) - else: - code = FlagCode.GREEN - message = ( - f"Dataset wide ERCC representation is robust: {representation_category}" - ) - return {"code": code, "message": message} - - -def check_sample_in_multiqc_report( - samples: list[str], - multiqc_report_path: Path, - name_reformat_func: Callable = lambda s: s, -) -> FlagEntry: - """Determines if the query samples are present in the multiqc report. - - This is achieved by checking the 'multiqc_sources.txt' table, 'Sample Name' column. - An optional name_reformat_func can be supplied to address sample name changes that occur in the multiqc report. - An example being the renaming of sample '-' characters to '_' for certain RSeQC modules. - - :param samples: Query sample names to check for presence - :type samples: list[str] - :param multiqc_report_path: MultiQC report directory - :type multiqc_report_path: Path - :param name_reformat_func: A function applied to the multiQC sample names before searching against query sample names, defaults to not renaming the multiQC sample names - :type name_reformat_func: Callable, optional - :return: Flag Entry denoting successful or failing results. Includes description of query sample names and any missing samples - :rtype: FlagEntry - """ - # Load multiQC sources table and retrieve set of samples - [sources_table] = multiqc_report_path.glob("**/multiqc_sources.txt") - multiQC_samples = list(pd.read_csv(sources_table, sep="\t")["Sample Name"]) - - # Transform multiQC samples using name_reformat_func - reformatted_multiQC_samples = [name_reformat_func(s) for s in multiQC_samples] - - # Check for any missing reformatted sample names. - # Also track extra samples, these are not errors but should be reported as well.
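As an aside, here is a minimal sketch of how this check might be invoked by a caller; the sample names and report directory are hypothetical, and the `re.sub` pattern simply mirrors the raw-read suffix-stripping lambdas used in the protocol file later in this patch:

```python
import re
from pathlib import Path

# Hypothetical inputs, for illustration only
samples = ["Sample-1", "Sample-2"]
report_dir = Path("raw_multiqc_report")

# Strip the _raw/_R1_raw/_R2_raw suffixes MultiQC keeps from raw read file
# names so report entries can be matched back to runsheet sample names
result = check_sample_in_multiqc_report(
    samples,
    report_dir,
    name_reformat_func=lambda s: re.sub("_raw|_R1_raw|_R2_raw$", "", s),
)
print(result["code"], result["message"])
```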
- missing_samples = set(samples) - set(reformatted_multiQC_samples) - - # check logic - if len(missing_samples) == 0: - code = FlagCode.GREEN - message = f"Found all query samples after reformatting multiQC sample names. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" - else: - code = FlagCode.HALT - message = f"Missing the following query samples: {missing_samples}. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" - return {"code": code, "message": message} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml deleted file mode 100644 index ea4f7041..00000000 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/config.yaml +++ /dev/null @@ -1,150 +0,0 @@ -# TOP LEVEL -NAME: "metagenomics" -VERSION: "1" - -Staging: - General: - Required Metadata: - From ISA: - - - ISA Field Name: - - Characteristics[Organism] - - Characteristics[organism] - ISA Table Source: Sample - Runsheet Column Name: organism - Processing Usage: >- - Mapping to the appropriate alignment reference and annotation databases. - Example: Microbiota - - - ISA Field Name: - - Characteristics[host organism] - - Characteristics[Host Organism] - - Characteristics[Host organism] - ISA Table Source: Sample - Runsheet Column Name: host organism - Processing Usage: >- - Mapping to the appropriate alignment reference and annotation databases. - Example: Mus musculus - - - ISA Field Name: Sample Name - ISA Table Source: Assay - Runsheet Column Name: sample_name - Runsheet Index: true - Processing Usage: >- - Sample name is used as a unique sample identifier during processing - Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 - - - ISA Field Name: - - Parameter Value[library layout] - - Parameter Value[Library Layout] - - Parameter Value: library layout - ISA Table Source: Assay - Runsheet Column Name: paired_end - Remapping: {"PAIRED":true, "Paired":true, "SINGLE":false, "Single":false} - Processing Usage: >- - Indicates if the sequencing was paired end. This controls how a variety of tools are invoked - including in-house written scripts. - Example: 'TRUE' - - # this entry denotes the following: - # retrieve from that ISA field name - # multiple values (separated by ",") - # index those to certain runsheet columns - # if the index doesn't exist, optional prevents raising an exception - # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls - # an exception will be raised if one and only one url is not mapped to each filename - - ISA Field Name: - - Parameter Value[Merged Sequence Data File] - - Characteristics[Merged Sequence Data File] - - Raw Data File - ISA Table Source: Assay - Multiple Values Per Entry: true - Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma - Runsheet Column Name: - - {'name':'read1_path', 'index':0} - - {'name':'read2_path', 'index':1, 'optional':true} - GLDS URL Mapping: True - Processing Usage: >- - Location to the raw data fastq file. May be a url or local path. 
- Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' - - - ISA Field Name: - - Parameter Value[Merged Sequence Data File] - - Characteristics[Merged Sequence Data File] - - Raw Data File - ISA Table Source: Assay - Multiple Values Per Entry: true - Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma - Runsheet Column Name: - - {'name':'raw_R1_suffix', 'index':0} - - {'name':'raw_R2_suffix', 'index':1, 'optional':true} - - Processing Usage: >- - Raw data fastq file. - Example: '_R1_raw.fastq.gz or _raw.fastq.gz for SE' - - - ISA Field Name: Factor Value[{factor_name}] - ISA Table Source: [Assay, Sample] - Runsheet Column Name: Factor Value[{factor_name}] - Matches Multiple Columns: true - Match Regex: "Factor Value\\[.*\\]" - Append Column Following: "Unit" - Processing Usage: >- - Factor values in a study. Used to assign experimental groups for each sample. - Note: On the runsheet, a subsequent 'Unit' Column value will be - suffix-concatenated if it exists. - Example: Basal Control - - - ISA Field Name: Unit - ISA Table Source: [Assay, Sample] - Runsheet Column Name: null - Matches Multiple Columns: true - Autoload: false # handled by factor value loading above - Processing Usage: >- - Unit to be suffix-concatenated onto prior Factor value columns. - Example: day - - From User: - # Removed since unused by Processing via the runsheet - # - Runsheet Column Name: GLDS - # Processing Usage: >- - # The GLDS accession number - # Example: GLDS-205 - - - Runsheet Column Name: read1_path - # used to generate candidate file names for searching GLDS repository filelisting - Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] - Processing Usage: >- - The location of either the forward reads (paired end) or only reads file (single end) - raw fastq file. Can be either a url or local path. - Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI - may be used to retrieve urls given the array data filename (sourced from ISA archive). - Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 - - - - Runsheet Column Name: read2_path - Data Asset Keys: ["raw reverse reads fastq GZ"] - Processing Usage: >- - The location of either the reverse reads (paired end) - raw fastq file. Can be either a url or local path. - For single end studies, this should be an empty string. - Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI - may be used to retrieve urls given the array data filename (sourced from ISA archive). 
- Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 - -ISA Meta: - Valid Study Assay Technology And Measurement Types: - - measurement: "Metagenomic sequencing" - technology: "Whole-Genome Shotgun Sequencing" - - # this is prepended to all file names in the curation assay table - Global file prefix: "{datasystem}_metagenomics_" - - # # configuration related to updating investigation file - # # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file - # # LEADCAP_organism should be the studied organisms scientific name with a leading cap - # Post Processing Add Study Protocol: - # GeneLab Methyl-Seq data processing protocol::{LEADCAP_organism} V1 - -data assets: - # resource categories: *neverPublished \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py deleted file mode 100644 index 5eaa896a..00000000 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/protocol.py +++ /dev/null @@ -1,997 +0,0 @@ -from pathlib import Path -import re -from typing import Union -import yaml -import logging - -from dp_tools.core.entity_model import Dataset - -log = logging.getLogger(__name__) - -from dp_tools.core.check_model import ValidationProtocol - -from .checks import * - -CONFIG = { - "Metadata-check_metadata_attributes_exist": { - "expected_attrs": ["paired_end", "has_ERCC", "organism"] - }, - "Raw Reads-check_for_outliers": { - "mqc_module": "FastQC", - "mqc_plot": "general_stats", - "mqc_keys": [ - "percent_gc", - "avg_sequence_length", - "total_sequences", - "percent_duplicates", - ], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "Trim Reads-check_for_outliers": { - "mqc_module": "FastQC", - "mqc_plot": "general_stats", - "mqc_keys": [ - "percent_gc", - "avg_sequence_length", - "total_sequences", - "percent_duplicates", - ], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "Raw Reads By Sample-check_fastqgz_file_contents": { - "count_lines_to_check": 200000000 - }, - "Trim Reads By Sample-check_fastqgz_file_contents": { - "count_lines_to_check": 200000000 - }, - "STAR Alignments By Sample-check_thresholds-Mapped": { - "mqc_key": "STAR", - "stat_string": "uniquely_mapped_percent + multimapped_percent", - "thresholds": [ - {"code": "YELLOW", "type": "lower", "value": 70}, - {"code": "RED", "type": "lower", "value": 50}, - ], - }, - "STAR Alignments By Sample-check_thresholds-MultiMapped": { - "mqc_key": "STAR", - "stat_string": "multimapped_toomany_percent + multimapped_percent", - "thresholds": [ - {"code": "YELLOW", "type": "lower", "value": 30}, - {"code": "RED", "type": "lower", "value": 15}, - ], - }, - "STAR Alignments-check_for_outliers": { - "mqc_module": "STAR", - "mqc_plot": "general_stats", - "mqc_keys": [ - "uniquely_mapped_percent", - "avg_mapped_read_length", - "mismatch_rate", - "deletion_rate", - "deletion_length", - "insertion_rate", - "insertion_length", - "multimapped_percent", - "multimapped_toomany_percent", - "unmapped_mismatches_percent", - 
"unmapped_tooshort_percent", - "unmapped_other_percent", - ], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "RSeQC-check_for_outliers-geneBody_coverage": { - "mqc_module": "RSeQC", - "mqc_plot": "Gene Body Coverage", - "mqc_keys": ["_ALL"], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "RSeQC-check_for_outliers-infer_experiment": { - "mqc_module": "RSeQC", - "mqc_plot": "Infer experiment", - "mqc_keys": ["_ALL"], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "RSeQC-check_for_outliers-inner_distance": { - "mqc_module": "RSeQC", - "mqc_plot": "Inner Distance", - "mqc_keys": ["_ALL"], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "RSeQC-check_for_outliers-read_distribution": { - "mqc_module": "RSeQC", - "mqc_plot": "Read Distribution", - "mqc_keys": ["_ALL"], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, - "RSeQC-check_strandedness_assessable_from_infer_experiment": { - "stranded_assessment_range": {"max": 100, "min": 75}, - "unstranded_assessment_range": {"min": 40, "max": 60}, - "valid_dominant_strandedness_assessments": [ - "Sense (% Tags)", - "Antisense (% Tags)", - ], - }, - "RSEM Counts-check_for_outliers": { - "mqc_module": "Rsem", - "mqc_plot": "general_stats", - "mqc_keys": [ - "Unalignable", - "Alignable", - "Filtered", - "Total", - "alignable_percent", - "Unique", - "Multi", - "Uncertain", - ], - "thresholds": [ - {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, - {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, - ], - }, -} - -# Manual kept in sync for now -COMPONENTS_LIST = [ - "Metadata", # for raw reads V&V - "Raw Reads", # for raw reads V&V - "Raw Reads By Sample", # for raw reads V&V - "Trim Reads", # for trim reads V&V - "Trimmed Reads By Sample", # for trim reads V&V - "STAR Alignments", # for star alignment V&V - "STAR Alignments By Sample", # for star alignment V&V - "RSeQC By Sample", # for RSeQC V&V - "RSeQC", # for RSeQC V&V - "RSEM Counts", # for after RSEM V&V - "Unnormalized Gene Counts", # for after RSEM V&V - "DGE Metadata", # for post DGE - "DGE Metadata ERCC", # for post DGE - "DGE Output", # for post DGE - "DGE Output ERCC", # for post DGE -] - - -def validate( - dataset: Dataset, - config_path: Path = None, - run_args: dict = None, - report_args: dict = None, - protocol_args: dict = None, - defer_run: bool = False, -) -> Union[ValidationProtocol, ValidationProtocol.Report]: - - if config_path is not None: - with open(config_path, "r") as f: - config = yaml.safe_load(f) - else: - config = CONFIG - - if run_args is None: - run_args = dict() - - if report_args is None: - report_args = dict() - - if protocol_args is None: - protocol_args = dict() - - # Modify protocol_args to convert run_components to skip_components based on COMPONENTS_LIST - if ( - "run_components" in protocol_args - and protocol_args.get("run_components") is not None - ): - protocol_args["skip_components"] = [ - c for c in COMPONENTS_LIST if c not in protocol_args["run_components"] - ] - # 
Check if any run components are not in COMPONENTS_LIST - if set(protocol_args["run_components"]) - set(COMPONENTS_LIST): - raise ValueError( - f"run_components contains components not in COMPONENTS_LIST. Unique to run_components: {set(protocol_args['run_components']) - set(COMPONENTS_LIST)}. All Components: {COMPONENTS_LIST}" - ) - del protocol_args["run_components"] - - # init validation protocol - vp = ValidationProtocol(**protocol_args) - # fmt: on - with vp.component_start( - name=dataset.name, - description="Validate processing from trim reads through differential gene expression output", - ): - - with vp.component_start( - name="Metadata", description="Metadata file validation" - ): - with vp.payload(payloads=[{"dataset": dataset}]): - vp.add( - check_metadata_attributes_exist, - config=config["Metadata-check_metadata_attributes_exist"], - ) - - with vp.component_start( - name="Raw Reads", description="Raw Reads Outliers Detection" - ): - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["raw reads fastQC ZIP"], - } - ] - if not dataset.metadata["paired_end"] - else [ - { - "dataset": dataset, - "data_asset_keys": [ - "raw forward reads fastQC ZIP", - ], - }, - { - "dataset": dataset, - "data_asset_keys": [ - "raw reverse reads fastQC ZIP", - ], - }, - ] - ): - vp.add( - check_for_outliers, config=config["Raw Reads-check_for_outliers"] - ) - - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "raw MultiQC directory" - ].path, - "name_reformat_func": lambda: lambda s: re.sub( - "_raw|_R1_raw|_R2_raw$", "", s - ), - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - description="Check all samples are present in raw reads multiQC report", - ) - - with vp.component_start( - name="Trim Reads", description="Trimmed Reads Outliers Detection" - ): - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["trimmed reads fastQC ZIP"], - } - ] - if not dataset.metadata["paired_end"] - else [ - { - "dataset": dataset, - "data_asset_keys": [ - "trimmed forward reads fastQC ZIP", - ], - }, - { - "dataset": dataset, - "data_asset_keys": [ - "trimmed reverse reads fastQC ZIP", - ], - }, - ] - ): - vp.add( - check_for_outliers, config=config["Trim Reads-check_for_outliers"] - ) - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "trimmed fastQC MultiQC directory" - ].path, - "name_reformat_func": lambda: lambda s: re.sub( - "_R1|_R2$", "", s - ), - }, - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "trimming MultiQC directory" - ].path, - "name_reformat_func": lambda: lambda s: re.sub( - "_raw|_R1_raw|_R2_raw$", "", s - ), - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - description="Check that all samples are present in the trimmed FastQC and trimming report multiQC reports", - ) - with vp.component_start( - name="STAR Alignments", - description="Dataset wide checks including outliers detection", - ): - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["aligned log Final"], - } - ] - ): - vp.add( - check_for_outliers, - config=config["STAR Alignments-check_for_outliers"], - ) - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "aligned MultiQC directory" - ].path, - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - 
description="Check all samples are present in STAR multiQC report", - ) - - with vp.component_start( - name="RSeQC", - description="RSeQC submodule outliers checking and other submodule specific dataset wide checks", - ): - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["genebody coverage out"], - } - ] - ): - vp.add( - check_for_outliers, - description="Check for outliers in geneBody Coverage", - config=config["RSeQC-check_for_outliers-geneBody_coverage"], - ) - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["infer experiment out"], - } - ] - ): - vp.add( - check_for_outliers, - description="Check for outliers in infer experiment", - config=config["RSeQC-check_for_outliers-infer_experiment"], - ) - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["inner distance out"], - } - ] - ): - vp.add( - check_for_outliers, - description="Check for outliers in inner distance", - config=config["RSeQC-check_for_outliers-inner_distance"], - skip=(not dataset.metadata["paired_end"]), - ) - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["read distribution out"], - } - ] - ): - vp.add( - check_for_outliers, - description="Check for outliers in read distribution", - config=config["RSeQC-check_for_outliers-read_distribution"], - ) - - with vp.payload(payloads=[{"dataset": dataset}]): - vp.add( - check_strandedness_assessable_from_infer_experiment, - config=config[ - "RSeQC-check_strandedness_assessable_from_infer_experiment" - ], - ) - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "genebody coverage MultiQC directory" - ].path, - }, - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "infer experiment MultiQC directory" - ].path, - "name_reformat_func": lambda: lambda s: re.sub( - "_infer_expt$", "", s - ), - }, - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "read distribution MultiQC directory" - ].path, - "name_reformat_func": lambda: lambda s: re.sub( - "_read_dist$", "", s - ), - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - description="Check all samples are present in RSeQC multiQC reports", - ) - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "inner distance MultiQC directory" - ].path, - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - description="Check all samples are present in RSeQC inner distance multiQC report (paired end only)", - skip=(not dataset.metadata["paired_end"]), - ) - with vp.component_start( - name="RSEM Counts", - description="Dataset wide checks including outliers detection", - ): - with vp.payload( - payloads=[ - { - "dataset": dataset, - "data_asset_keys": ["sample counts stats directory"], - } - ] - ): - vp.add( - check_for_outliers, config=config["RSEM Counts-check_for_outliers"] - ) - with vp.payload( - payloads=[ - { - "samples": list(dataset.samples), - "multiqc_report_path": lambda: dataset.data_assets[ - "RSEM counts MultiQC directory" - ].path, - }, - ] - ): - vp.add( - check_sample_in_multiqc_report, - description="Check all samples are present in RSEM multiQC report", - ) - with vp.component_start( - name="Unnormalized Gene Counts", - description="Validate normalization related output", - ): - - with vp.payload( - payloads=[ - { - "unnormalizedCountTable": lambda: 
dataset.data_assets[ - "star unnormalized counts table" - ].path, - "samplewise_tables": lambda: { - s.name: s.data_assets["sample reads per gene table"].path - for s in dataset.samples.values() - }, - }, - ] - ): - vp.add( - check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables - ) - with vp.payload( - payloads=[ - { - "unnormalizedCountTable": lambda: dataset.data_assets[ - "rsem unnormalized counts table" - ].path, - "samplewise_tables": lambda: { - s.name: s.data_assets["sample gene counts table"].path - for s in dataset.samples.values() - }, - }, - ] - ): - vp.add( - check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables - ) - vp.add( - check_ERCC_subgroup_representation, - skip=(not dataset.metadata["has_ERCC"]), - ) - - with vp.component_start( - name="DGE Metadata", - description="", - ): - - with vp.component_start( - name="Sample Table", - description="", - ): - with vp.payload( - payloads=[ - { - "runsheet": lambda: dataset.data_assets["runsheet"].path, - "sampleTable": lambda: dataset.data_assets[ - "sample table" - ].path, - } - ] - ): - vp.add( - check_sample_table_against_runsheet, - config={"all_samples_required": True}, - ) - vp.add(check_sample_table_for_correct_group_assignments) - - with vp.component_start( - name="Contrasts Tables", - description="", - ): - with vp.payload( - payloads=[ - { - "runsheet": lambda: dataset.data_assets["runsheet"].path, - "contrasts_table": lambda: dataset.data_assets[ - "DESeq2 contrasts table" - ].path, - } - ] - ): - vp.add(check_contrasts_table_headers) - vp.add(check_contrasts_table_rows) - - with vp.component_start( - name="DGE Metadata ERCC", - description="", - skip=(not dataset.metadata["has_ERCC"]), - ): - - with vp.component_start( - name="Sample Table", - description="", - ): - with vp.payload( - payloads=[ - { - "runsheet": lambda: dataset.data_assets["runsheet"].path, - "sampleTable": lambda: dataset.data_assets[ - "ERCC sample table" - ].path, - } - ] - ): - vp.add( - check_sample_table_against_runsheet, - config={"all_samples_required": False}, - ) - vp.add(check_sample_table_for_correct_group_assignments) - - with vp.component_start( - name="Contrasts Tables", - description="", - ): - with vp.payload( - payloads=[ - { - "runsheet": lambda: dataset.data_assets["runsheet"].path, - "contrasts_table": lambda: dataset.data_assets[ - "ERCC normalized DESeq2 contrasts table" - ].path, - } - ] - ): - vp.add(check_contrasts_table_headers) - vp.add(check_contrasts_table_rows) - - with vp.component_start( - name="DGE Output", - description="", - ): - with vp.payload( - payloads=[ - { - "rsem_table_path": lambda: dataset.data_assets[ - "rsem unnormalized counts table" - ].path, - "deseq2_table_path": lambda: dataset.data_assets[ - "DESeq2 unnormalized counts table" - ].path, - } - ] - ): - vp.add( - check_rsem_counts_and_unnormalized_tables_parity, - skip=( - "rsem unnormalized counts table" not in dataset.data_assets - or "DESeq2 unnormalized counts table" not in dataset.data_assets - ), - ) - - with vp.payload( - payloads=[ - { - "organism": lambda: dataset.metadata["organism"], - "samples": lambda: set(dataset.samples), - "dge_table": lambda: dataset.data_assets[ - "DESeq2 annotated DGE table" - ].path, - "runsheet": lambda: dataset.data_assets["runsheet"].path, - } - ] - ): - vp.add(check_dge_table_annotation_columns_exist) - vp.add(check_dge_table_sample_columns_exist) - vp.add(check_dge_table_sample_columns_constraints) - vp.add(check_dge_table_group_columns_exist) - 
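A brief note on the pattern above: payload values are wrapped in zero-argument lambdas so data-asset paths are resolved lazily, when a check actually runs, rather than when the protocol object is assembled. A minimal sketch of that deferred-resolution idea follows; the resolver function and asset names are hypothetical, not the actual ValidationProtocol internals:

```python
from typing import Any

def resolve_payload(payload: dict[str, Any]) -> dict[str, Any]:
    # Invoke any zero-argument callables at check time; pass plain values through
    return {key: (value() if callable(value) else value) for key, value in payload.items()}

# Hypothetical payload: the asset lookup only happens when the lambda is
# called, so a missing asset raises when the check runs, not at protocol
# construction time
assets = {"runsheet": "/tmp/runsheet.csv"}
payload = {"runsheet": lambda: assets["runsheet"], "all_samples_required": True}
print(resolve_payload(payload))
```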
vp.add(check_dge_table_group_columns_constraints) - vp.add(check_dge_table_comparison_statistical_columns_exist) - vp.add(check_dge_table_group_statistical_columns_constraints) - vp.add(check_dge_table_fixed_statistical_columns_exist) - vp.add(check_dge_table_fixed_statistical_columns_constraints) - vp.add(check_dge_table_log2fc_within_reason) - - with vp.component_start( - name="Viz Tables", - description="Extended from the dge tables", - ): - with vp.payload( - payloads=[ - { - "organism": lambda: dataset.metadata["organism"], - "samples": lambda: set(dataset.samples), - "dge_table": lambda: dataset.data_assets[ - "DESeq2 annotated DGE extended for viz table" - ].path, - "runsheet": lambda: dataset.data_assets["runsheet"].path, - } - ] - ): - vp.add(check_dge_table_annotation_columns_exist) - vp.add(check_dge_table_sample_columns_exist) - vp.add(check_dge_table_sample_columns_constraints) - vp.add(check_dge_table_group_columns_exist) - vp.add(check_dge_table_group_columns_constraints) - vp.add(check_dge_table_comparison_statistical_columns_exist) - vp.add(check_dge_table_group_statistical_columns_constraints) - vp.add(check_dge_table_fixed_statistical_columns_exist) - vp.add(check_dge_table_fixed_statistical_columns_constraints) - vp.add(check_dge_table_log2fc_within_reason) - vp.add(check_viz_table_columns_exist) - vp.add(check_viz_table_columns_constraints) - - with vp.payload( - payloads=[ - { - "samples": lambda: set(dataset.samples), - "pca_table": lambda: dataset.data_assets[ - "DESeq2 viz PCA table" - ].path, - } - ] - ): - vp.add(check_viz_pca_table_index_and_columns_exist) - - with vp.component_start( - name="DGE Output ERCC", - description="", - skip=(not dataset.metadata["has_ERCC"]), - ): - with vp.payload( - payloads=[ - { - "organism": lambda: dataset.metadata["organism"], - "samples": lambda: set( - pd.read_csv( - dataset.data_assets["ERCC sample table"].path, - index_col=0, - ).index - ), - "dge_table": lambda: dataset.data_assets[ - "ERCC normalized DESeq2 annotated DGE table" - ].path, - "runsheet": lambda: dataset.data_assets["runsheet"].path, - } - ] - ): - vp.add(check_dge_table_annotation_columns_exist) - vp.add(check_dge_table_sample_columns_exist) - vp.add(check_dge_table_sample_columns_constraints) - vp.add(check_dge_table_group_columns_exist) - vp.add(check_dge_table_group_columns_constraints) - vp.add(check_dge_table_comparison_statistical_columns_exist) - vp.add(check_dge_table_group_statistical_columns_constraints) - vp.add(check_dge_table_fixed_statistical_columns_exist) - vp.add(check_dge_table_fixed_statistical_columns_constraints) - vp.add(check_dge_table_log2fc_within_reason) - - with vp.component_start( - name="Viz Tables", - description="Extended from the dge tables", - ): - with vp.payload( - payloads=[ - { - "organism": lambda: dataset.metadata["organism"], - "samples": lambda: set( - pd.read_csv( - dataset.data_assets["ERCC sample table"].path, - index_col=0, - ).index - ), - "dge_table": lambda: dataset.data_assets[ - "ERCC normalized DESeq2 annotated DGE extended for viz table" - ].path, - "runsheet": lambda: dataset.data_assets["runsheet"].path, - } - ] - ): - vp.add(check_dge_table_annotation_columns_exist) - vp.add(check_dge_table_sample_columns_exist) - vp.add(check_dge_table_sample_columns_constraints) - vp.add(check_dge_table_group_columns_exist) - vp.add(check_dge_table_group_columns_constraints) - vp.add(check_dge_table_comparison_statistical_columns_exist) - vp.add(check_dge_table_group_statistical_columns_constraints) - 
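For reference, the comparison names that the DGE and viz checks above look for are built by "v"-joining ordered pairs of group names, so both directions of each contrast are validated. A toy run with made-up group names:

```python
import itertools

expected_groups = ["Ground.Control", "Space.Flight"]  # hypothetical group names
expected_comparisons = [
    "v".join(pair) for pair in itertools.permutations(expected_groups, 2)
]
# Both orderings appear because the log2fc direction depends on group order
print(expected_comparisons)
# ['Ground.ControlvSpace.Flight', 'Space.FlightvGround.Control']
```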
vp.add(check_dge_table_fixed_statistical_columns_exist) - vp.add(check_dge_table_fixed_statistical_columns_constraints) - vp.add(check_dge_table_log2fc_within_reason) - vp.add(check_viz_table_columns_exist) - vp.add(check_viz_table_columns_constraints) - - with vp.payload( - payloads=[ - { - "samples": lambda: set( - pd.read_csv( - dataset.data_assets["ERCC sample table"].path, - index_col=0, - ).index - ), - "pca_table": lambda: dataset.data_assets[ - "ERCC normalized DESeq2 viz PCA table" - ].path, - } - ] - ): - vp.add(check_viz_pca_table_index_and_columns_exist) - - for sample in dataset.samples.values(): - with vp.component_start( - name=sample.name, description="Samples level checks" - ): - with vp.component_start( - name="Raw Reads By Sample", description="Raw reads" - ): - with vp.payload( - payloads=( - [ - { - "file": lambda sample=sample: sample.data_assets[ - "raw forward reads fastq GZ" - ].path - }, - { - "file": lambda sample=sample: sample.data_assets[ - "raw reverse reads fastq GZ" - ].path - }, - ] - if dataset.metadata["paired_end"] - else [ - { - "file": lambda sample=sample: sample.data_assets[ - "raw reads fastq GZ" - ].path - }, - ] - ) - ): - vp.add( - check_fastqgz_file_contents, - config=config[ - "Raw Reads By Sample-check_fastqgz_file_contents" - ], - ) - vp.add( - check_gzip_file_integrity, - ) - with vp.payload( - payloads=[ - { - "sample": sample, - "reads_key_1": "raw forward reads fastQC ZIP", - "reads_key_2": "raw reverse reads fastQC ZIP", - }, - ], - ): - vp.add( - check_forward_and_reverse_reads_counts_match, - skip=(not dataset.metadata["paired_end"]), - ) - with vp.component_start( - name="Trimmed Reads By Sample", description="Trimmed reads" - ): - with vp.payload( - payloads=( - [ - { - "file": lambda sample=sample: sample.data_assets[ - "trimmed forward reads fastq GZ" - ].path - }, - { - "file": lambda sample=sample: sample.data_assets[ - "trimmed reverse reads fastq GZ" - ].path - }, - ] - if dataset.metadata["paired_end"] - else [ - { - "file": lambda sample=sample: sample.data_assets[ - "trimmed reads fastq GZ" - ].path - } - ] - ) - ): - vp.add(check_file_exists, description="Check reads files exist") - vp.add( - check_fastqgz_file_contents, - config=config[ - "Trim Reads By Sample-check_fastqgz_file_contents" - ], - ) - - with vp.payload( - payloads=[ - { - "sample": sample, - "reads_key_1": "trimmed forward reads fastQC ZIP", - "reads_key_2": "trimmed reverse reads fastQC ZIP", - }, - ], - ): - vp.add( - check_forward_and_reverse_reads_counts_match, - skip=(not dataset.metadata["paired_end"]), - ) - - with vp.component_start( - name="STAR Alignments By Sample", - description="STAR Alignment outputs", - ): - - with vp.payload( - payloads=[ - { - "file": lambda sample=sample: sample.data_assets[ - "aligned ToTranscriptome Bam" - ].path, - }, - { - "file": lambda sample=sample: sample.data_assets[ - "aligned SortedByCoord Bam" - ].path, - }, - ] - ): - vp.add( - check_bam_file_integrity, - config={ - "samtools_bin": "samtools" - }, # assumes accessible on path already - ) - - with vp.payload( - payloads=[ - { - "multiqc_inputs": lambda sample=sample: [ - sample.data_assets["aligned log Final"].path - ], - }, - ] - ): - vp.add( - check_thresholds, - config=config[ - "STAR Alignments By Sample-check_thresholds-Mapped" - ], - description="Check that mapping rates are reasonable, specifically most reads map to the target genome", - ) - vp.add( - check_thresholds, - config=config[ - "STAR Alignments By Sample-check_thresholds-MultiMapped" - ], - 
description="Check that mapping rates are reasonable, specifically that a considerable amount of reads multimap to the target genome", - ) - - with vp.component_start( - name="RSeQC By Sample", - description="RNASeq QA outputs", - ): - with vp.component_start( - name="geneBody_coverage", - description="Assess integrity of transcripts and library prep signatures", - ): - with vp.payload( - payloads=[ - { - "input_dir": lambda sample=sample: sample.data_assets[ - "genebody coverage out" - ].path - }, - ] - ): - vp.add(check_genebody_coverage_output) - with vp.component_start( - name="inner_distance", - description="Reports on distance between mate reads based on gene annotations", - skip=(not dataset.metadata["paired_end"]), - ): - with vp.payload( - payloads=[ - { - "input_dir": lambda sample=sample: sample.data_assets[ - "inner distance out" - ].path - }, - ] - ): - vp.add(check_inner_distance_output) - # return protocol object without running or generating a report - if defer_run: - return vp - - vp.run(**run_args) - - # return report - return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts) \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py deleted file mode 100644 index d3db1810..00000000 --- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads-B/workflow_code/config/schemas.py +++ /dev/null @@ -1,62 +0,0 @@ -""" Schemas for validation -Uses Schema to allow usage of validation functions -""" -from schema import Schema -from schema import Optional as schema_Optional -from typing import Optional -import pandera as pa - - -check_read2_path_populated_if_paired_end = pa.Check( - lambda df: ("read2_path" in df.columns and df['paired_end'].iloc[0] == True) or - ("read2_path" not in df.columns and df['paired_end'].iloc[0] == False), - title="Check 'read2_path' is only populated if paired_end is True", - description="Failures here are likely either due to manual user error or inappropriate source file (e.g. 
ISA archive)", - error="Expected 'read2_path' to be populated only if paired_end is True" - ) - -runsheet = { - "metagenomics": pa.DataFrameSchema( - columns={ - "Original Sample Name": pa.Column(str), - "read1_path": pa.Column(str), - "read2_path": pa.Column(str, required=False), # Expect if paired_end is True - }#, - # define checks at the DataFrameSchema-level - #checks=check_read2_path_populated_if_paired_end - ) -} - -import pandas as pd - -class runsheet: # Bad casing since we will use the class definition itself for all static methods - - @staticmethod - def check_single_value(column: pd.Series, error_msg: str, errors: list[str]) -> None: - if len(column.unique()) != 1: - errors.append(error_msg) - - @staticmethod - def check_read2_path_populated_if_paired_end(df: pd.DataFrame, errors: list[str]) -> None: - if (("read2_path" in df.columns and df['paired_end'][0] == True) or - ("read2_path" not in df.columns and df['paired_end'][0] == False)): - return - else: - errors.append("Expected 'read2_path' to be populated only if paired_end is True") - - @staticmethod - def validate(df_runsheet: pd.DataFrame) -> bool: - errors = [] - - # Check for single value in specified columns - - runsheet.check_single_value(df_runsheet['organism'], "Dataset level columns do NOT contain one unique value for 'organism'", errors) - runsheet.check_single_value(df_runsheet['paired_end'], "Dataset level columns do NOT contain one unique value for 'paired_end'", errors) - - # Check for 'read2_path' population if paired_end is True - #runsheet.check_read2_path_populated_if_paired_end(df_runsheet, errors) - - if errors: - raise ValueError("\n".join(errors)) - else: - return True \ No newline at end of file From 4a8b16c991da02bdb4fe55e53b09117dfde45b4d Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:20:44 -0700 Subject: [PATCH 24/33] Update README.md --- .../workflow_code/README.md | 88 ++++++++++++++++--- 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md index a14d2193..38aa25d9 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md @@ -2,29 +2,81 @@ ## General workflow info -The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [NextFlow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Docker](https://www.docker.com/) run all tools in containers. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with NextFlow and Docker, but if you want to learn more about those, [this NextFlow tutorial](https://training.nextflow.io/basic_training/) within [NextFlow's documentation](https://www.nextflow.io/docs/stable/index.html) is a good place to start for that. 
+The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [NextFlow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with NextFlow and Singularity, but if you want to learn more about those, [this NextFlow tutorial](https://training.nextflow.io/basic_training/) within [NextFlow's documentation](https://www.nextflow.io/docs/stable/index.html) is a good place to start.

## Utilizing the workflow

-1. [Install NextFlow and Docker](#1-install-NextFlow-Docker)
-2. [Download the workflow template files](#2-download-the-workflow-template-files)
-3. [Modify the variables in the Remove_Human_Reads.config file](#3-modify-the-variables-in-the-config-file)
-4. [Run the workflow](#4-run-the-workflow)
+1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package)
+2. [Install NextFlow and Singularity](#2-install-NextFlow-Singularity)
+ 2a. [Install Nextflow](#2a-install-nextflow)
+ 2b. [Install Singularity](#2b-install-singularity)
+3. [Download the workflow template files](#3-download-the-workflow-template-files)
+4. [Modify the variables in the Remove_Human_Reads.config file](#4-modify-the-variables-in-the-config-file)
+5. [Run the workflow](#5-run-the-workflow)

-### 1. Install NextFlow and Docker
-You can install NextFlow into your specified directory using the following code:
+
+
+ +--- + +### 1. Install conda, mamba, and `genelab-utils` package +We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). + +Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: ```bash -curl -s https://get.nextflow.io | bash +conda install -n base -c conda-forge mamba +``` + +> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. -sudo mv nextflow /usr/local/bin +Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: + +```bash +mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' ``` -Docker can be installed according to the [NextFlow setup page](https://training.nextflow.io/basic_training/) +The environment then needs to be activated: + +```bash +conda activate genelab-utils +``` + +
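+One quick way to check that the environment is active (a sanity check, assuming the `genelab-utils` package installed its command-line tools as expected) is to ask one of its utilities for its usage text:
+
+```bash
+# should print usage info if genelab-utils is installed and the environment is active
+GL-get-workflow --help
+```
+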
+ +--- + +### 2. Install Nextflow and Singularity + +#### 2a. Install Nextflow +Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). +> Note: If you want to install Anaconda, we recommend installing a Miniconda, Python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +> +> Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands: +> +> ```bash +> conda install -c bioconda nextflow +> nextflow self-update +> ``` + +
+ +#### 2b. Install Singularity + +Singularity is a container platform that allows usage of containerized software. This enables the GeneLab RCP workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. -### 2. Download the workflow template files +We recommend installing Singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). + +> Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity). + +
+ +--- + + +### 3. Download the workflow template files All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest NF_MGRemoveHumanReads-A version on to your system, run the following command: ```bash @@ -38,7 +90,11 @@ This downloaded the workflow into a directory called `NF_MGRemoveHumanReads-*/`, > GL-get-workflow NF_MGRemoveHumanReads-A --wanted-version 1.0.0 > ``` -### 3. Modify the variables in the Remove_Human_Reads.config file +
+ +--- + +### 4. Modify the variables in the Remove_Human_Reads.config file Once you've downloaded the workflow template, you can modify the variables in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique-sample-IDs.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be setup automatically). Additionally, if necessary, you'll need to modify each variable in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file to be consistent with the study you want to process and the machine you're using. > Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure). @@ -69,9 +125,13 @@ Sample-1 Sample-2 ``` -### 4. Run the workflow +
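+Before launching the workflow, a small shell loop along these lines (a sketch assuming the paired-end example layout above, with files ending in `_R1.fastq.gz`/`_R2.fastq.gz`; adjust the suffixes and paths to your own data) can confirm that every listed sample ID has matching read files:
+
+```bash
+# report any sample IDs that are missing an expected read file
+while read -r sample_id; do
+    for suffix in _R1.fastq.gz _R2.fastq.gz; do
+        [ -f "../Raw_Sequence_Data/${sample_id}${suffix}" ] || echo "Missing: ${sample_id}${suffix}"
+    done
+done < unique-sample-IDs.txt
+```
+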
+ +--- + +### 5. Run the workflow -While in the directory holding the NextFlow file, .config file, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: +While in the directory holding the NextFlow file, .config file, and other workflow files that you downloaded in [step 3](#3-download-the-workflow-template-files), here is one example command of how to run the workflow: ```bash nextflow run *path/to/Remove_Human_Reads.nf* -ansi-log false -specify_reads false From 7dfc889c783d183d5692705e25dc423564318814 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:22:11 -0700 Subject: [PATCH 25/33] Update Remove_Human_reads.config --- .../workflow_code/Remove_Human_reads.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config index 5781bc20..511228dd 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config @@ -29,5 +29,7 @@ params.num_threads = 2 params.kraken_output_dir = "$projectDir/kraken2-outputs" //location to output files, relative to wd or full path params.human_db_name = 'kraken2-human-db' // params.human_db_path = "$projectDir/${params.human_db_name}" -docker {enabled = true} +singularity { + enabled = true + autoConvert = true} params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' From c800d50b577872305638a54467725b8dbf2196fc Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:47:08 -0700 Subject: [PATCH 26/33] Update Remove_Human_reads.config --- .../workflow_code/Remove_Human_reads.config | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config index 511228dd..3537336a 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config @@ -31,5 +31,10 @@ params.human_db_name = 'kraken2-human-db' // params.human_db_path = "$projectDir/${params.human_db_name}" singularity { enabled = true - autoConvert = true} + autoConvert = true + autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false} params.kraken2container = 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' From 74a627e8269590e5274e82ff30e2893644e90583 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:50:52 -0700 Subject: [PATCH 27/33] fix link Co-authored-by: Barbara Novak <19824106+bnovak32@users.noreply.github.com> --- 
.../NF_MGRemoveHumanReads-A/workflow_code/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md index 38aa25d9..00388ff5 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md @@ -20,7 +20,7 @@ The current pipeline for how GeneLab identifies and removes human DNA in Illumin --- ### 1. Install conda, mamba, and `genelab-utils` package -We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the Happy Belly Bioinformatics conda tutorial](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: From 706d04519dff016d389593b827864682c83b7ae7 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Sat, 31 Aug 2024 15:07:23 -0700 Subject: [PATCH 28/33] Update and rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md corrected readme location and uploaded changes --- .../NF_MGRemoveHumanReads-A/README.md | 165 ++++++++++++++++++ .../workflow_code/README.md | 154 ---------------- 2 files changed, 165 insertions(+), 154 deletions(-) create mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md delete mode 100644 Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md new file mode 100644 index 00000000..4a3f489b --- /dev/null +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md @@ -0,0 +1,165 @@ +# NF_MGRemoveHumanReads-A Workflow Information and Usage Instructions + + +## General workflow info +The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [NextFlow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. 
While knowledge of creating or modifying Nextflow workflows is not required to run the workflow as-is, the [Nextflow documentation](https://www.nextflow.io/docs/stable/index.html) is a useful resource for users who wish to modify and/or extend the workflow. + +## Utilizing the workflow + +1.. [Install NextFlow and Singularity](#1-install-NextFlow-Singularity) + 1a. [Install Nextflow](#1a-install-nextflow) + 1b. [Install Singularity](#1b-install-singularity) +2. [Download the workflow template files](#2-download-the-workflow-template-files) +3. [Modify the variables in the nextflow.config file](#3-modify-the-variables-in-the-config-file) +4. [Run the workflow](#4-run-the-workflow) + + +
+
+---
+
+### 1.Install Nextflow and Singularity
+
+#### 1a. Install Nextflow
+
+To install NextFlow, follow the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html).
+>
+> Download and install NextFlow directly:
+>
+> ```bash
+> curl -s https://get.nextflow.io | bash
+> sudo mv nextflow /usr/local/bin
+> ```
+
+Or, if Conda is installed on your system (and you prefer to use it), install and activate the “genelab-utils” Conda package, which includes NextFlow, by running the following commands:
+>
+> ```bash
+> conda install -n base -c conda-forge mamba
+> ```
+>
+> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if desired.
+>
+> Once mamba is installed, you can install the genelab-utils conda package in a new environment
+> with the following command:
+>
+> ```bash
+> mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02'
+>
+> conda activate genelab-utils
+> ```
+>
+
+
+
+
+
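+Whichever route you use, you can verify the installation by printing the version (this should report the Nextflow version and build if the install succeeded):
+
+```bash
+# confirm nextflow is on the PATH
+nextflow -version
+```
+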
+
+#### 1b. Install Singularity
+
+Singularity is a container platform that allows the use of containerized software. This enables the workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system.
+
+We recommend installing Singularity on a system-wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html).
+
+> Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity) if you are using Conda.
+
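+Once installed, you can confirm Singularity is available, and optionally pre-fetch the Kraken2 container this workflow runs its tools in (the image URI below is the one set as `params.kraken2container` in the config file):
+
+```bash
+# confirm singularity is on the PATH
+singularity --version
+
+# optional: pull the workflow's kraken2 container image ahead of time
+singularity pull docker://quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0
+```
+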
+
+---
+
+
+### 2. Download the workflow template files
+All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. You can get a copy of them by either downloading the files for this workflow from GitHub or by [cloning](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) the repository.
+
+
+If you are using Conda with “genelab-utils”, you can copy the workflow files to your system using the “GL-get-workflow” command:
+> ```bash
+> GL-get-workflow NF_MGRemoveHumanReads-A
+> ```
+>
+> This downloads the workflow into a directory called `NF_MGRemoveHumanReads-*/`, with the workflow version number at the end.
+>
+> Note: If you want an earlier version, it can be provided as an optional argument like so:
+> ```bash
+> GL-get-workflow NF_MGRemoveHumanReads-A --wanted-version 1.0.0
+> ```
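+If you are not using Conda, a sketch of the GitHub route mentioned above (cloning the full repository and moving into this workflow's code directory) looks like this:
+
+```bash
+git clone https://github.com/nasa/GeneLab_Data_Processing.git
+cd GeneLab_Data_Processing/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/
+```
+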
+
+---
+
+### 3. Modify the variables in the nextflow.config file
+Once you've downloaded the workflow template, you can modify the variables in the [nextflow.config](workflow_code/nextflow.config) file as needed. You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be set up automatically). Additionally, if necessary, you'll need to modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the machine you're using.
+Confirm the following variables are appropriate for your data:
+- DL_kraken
+- single_end
+- specify_reads
+- sample_id_list
+- reads_dir
+- PE_reads_suffix or SE_reads_suffix
+- PE_reads_out_suffix or SE_reads_out_suffix
+- kraken_output_dir
+- human_db_path
+
+
+If you only want to process certain read files, you will need to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique_sample_ids.txt)).
+> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure).
+
+**Example for how to create a single-column list of unique sample identifiers from your raw data file names**
+
+For example, if you only want to process a subset of the read files within the reads directory and have paired-end read data for 2 samples located in `../Raw_Sequence_Data/` relative to your workflow directory, that would look like this:
+
+```bash
+ls ../Raw_Sequence_Data/
+```
+
+```
+Sample-1_R1.fastq.gz
+Sample-1_R2.fastq.gz
+Sample-2_R1.fastq.gz
+Sample-2_R2.fastq.gz
+```
+
+You would set up your `unique_sample_ids.txt` file as follows:
+
+```bash
+cat unique_sample_ids.txt
+```
+
+```
+Sample-1
+Sample-2
+```
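+For file names following suffix patterns like those above, a one-liner along these lines (a sketch assuming the default paired-end `_R{1,2}.fastq.gz` suffix; adjust the pattern to match your own `PE_reads_suffix` or `SE_reads_suffix`) can generate the list automatically:
+
+```bash
+# strip the read suffixes and de-duplicate to get one ID per sample
+ls ../Raw_Sequence_Data/ | sed 's/_R[12]\.fastq\.gz$//' | sort -u > unique_sample_ids.txt
+```
+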
+
+---
+
+### 4. Run the workflow
+
+While in the directory holding the NextFlow file, nextflow.config file, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow:
+
+```bash
+nextflow run *path/to/Remove_Human_Reads.nf* -resume -ansi-log false --DL_kraken false
+```
+
+
+* `-resume` – continues to run the workflow using cached data from the previous run
+* `-ansi-log false` – specifies to print out each command being run to the screen instead of dynamically updating the log
+* `--specify_reads false` – processes all reads in the working directory, without requiring a sample ID list
+* `--single_end true` – indicates reads are single-end
+* `--DL_kraken true` – runs a process before the rest of the workflow to download and install the necessary database.
+>
+> Note - Nextflow options use a single dash prefix, e.g. -resume, whereas pipeline parameters use double dash notation, e.g. --specify_reads. All of the pipeline parameters can and should be set from the nextflow.config file to avoid typos or errors.
+>
+
+See `nextflow -h` and [NextFlow's documentation](https://www.nextflow.io/docs/master/index.html) for more options and details.
+
+
+---
+
+## Reference database info
+The database we use was built with kraken2 v2.1.1 and can be downloaded to run with this workflow (it's ~4.3 GB uncompressed). The steps for building it are described on the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/SW_MGRemoveHumanReads-A/reference-database-info.md).
+
+---
+
diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md
deleted file mode 100644
index 00388ff5..00000000
--- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/README.md
+++ /dev/null
@@ -1,154 +0,0 @@
-# NF_MGRemoveHumanReads-A Workflow Information and Usage Instructions
-
-
-## General workflow info
-The current pipeline for how GeneLab identifies and removes human DNA in Illumina metagenomics sequencing data (MGRemoveHumanReads), [GL-DPPD-7105-A.md](../../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-A.md), is implemented as a [NextFlow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MGRemoveHumanReads-A) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with NextFlow and Singularity, but if you want to learn more about those, [this NextFlow tutorial](https://training.nextflow.io/basic_training/) within [NextFlow's documentation](https://www.nextflow.io/docs/stable/index.html) is a good place to start.
-
-## Utilizing the workflow
-
-1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package)
-2. [Install NextFlow and Singularity](#2-install-NextFlow-Singularity)
- 2a. [Install Nextflow](#2a-install-nextflow)
- 2b. [Install Singularity](#2b-install-singularity)
-3. [Download the workflow template files](#3-download-the-workflow-template-files)
-4.
[Modify the variables in the Remove_Human_Reads.config file](#4-modify-the-variables-in-the-config-file) -5. [Run the workflow](#5-run-the-workflow) - - -
- ---- - -### 1. Install conda, mamba, and `genelab-utils` package -We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the Happy Belly Bioinformatics conda tutorial](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). - -Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: - -```bash -conda install -n base -c conda-forge mamba -``` - -> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. - -Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: - -```bash -mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' -``` - -The environment then needs to be activated: - -```bash -conda activate genelab-utils -``` - -
- ---- - -### 2. Install Nextflow and Singularity - -#### 2a. Install Nextflow - -Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). - -> Note: If you want to install Anaconda, we recommend installing a Miniconda, Python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). -> -> Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands: -> -> ```bash -> conda install -c bioconda nextflow -> nextflow self-update -> ``` - -
- -#### 2b. Install Singularity - -Singularity is a container platform that allows usage of containerized software. This enables the GeneLab RCP workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. - -We recommend installing Singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). - -> Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity). - -
- ---- - - -### 3. Download the workflow template files -All workflow files for removing human reads from metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest NF_MGRemoveHumanReads-A version on to your system, run the following command: - -```bash -GL-get-workflow NF_MGRemoveHumanReads-A -``` - -This downloaded the workflow into a directory called `NF_MGRemoveHumanReads-*/`, with the workflow version number at the end. - -> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so: -> ```bash -> GL-get-workflow NF_MGRemoveHumanReads-A --wanted-version 1.0.0 -> ``` - -
- ---- - -### 4. Modify the variables in the Remove_Human_Reads.config file -Once you've downloaded the workflow template, you can modify the variables in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below - if you are running the example dataset, this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique-sample-IDs.txt)). You will also need to indicate the path to your input data (raw reads) and the root directory for where the kraken2 reference database should be stored (it will be setup automatically). Additionally, if necessary, you'll need to modify each variable in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file to be consistent with the study you want to process and the machine you're using. - -> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure). - -**Example for how to create a single-column list of unique sample identifiers from your raw data file names** - -For example, if you only want to process a subset of the read files within the reads directory and have paired-end read data for 2 samples located in `../Raw_Sequence_Data/` relative to your workflow directory, that would look like this: - -```bash -ls ../Raw_Sequence_Data/ -``` - -``` -Sample-1_R1.fastq.gz -Sample-1_R2.fastq.gz -Sample-2_R1.fastq.gz -Sample-2_R2.fastq.gz -``` - -You would set up your `unique-sample-IDs.txt` file as follows: - -```bash -cat unique-sample-IDs.txt -``` - -``` -Sample-1 -Sample-2 -``` - -
- ---- - -### 5. Run the workflow - -While in the directory holding the NextFlow file, .config file, and other workflow files that you downloaded in [step 3](#3-download-the-workflow-template-files), here is one example command of how to run the workflow: - -```bash -nextflow run *path/to/Remove_Human_Reads.nf* -ansi-log false -specify_reads false -``` - -* `-ansi-log false` – specifies to print out each command being run to the screen -* `-resume` – continues to run the workflow using cached data from the previous run -* `-specify_reads false` - processes all reads in the working directory, without requiring a sample ID list - - -See `nextflow -h` and [NextFlow's documentation](https://www.nextflow.io/docs/master/index.html) for more options and details. - -A quick example can be run with the files included in the [workflow_code](workflow_code) directory after specifying a location for the reference database in the [Remove_Human_Reads.config](workflow_code/Remove_Human_Reads.config) file. - ---- - -## Reference database info -The database we use was built with kraken2 v2.1.1 as detailed below, and can be downloaded to run with this workflow (it's ~4.3 GB uncompressed). - ---- From 39769ef9744f7f7fdce876bd132d4f5552d7cc38 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Sat, 31 Aug 2024 15:24:12 -0700 Subject: [PATCH 29/33] fixed section headers --- .../NF_MGRemoveHumanReads-A/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md index 4a3f489b..fb83cb8f 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md @@ -6,19 +6,19 @@ The current pipeline for how GeneLab identifies and removes human DNA in Illumin ## Utilizing the workflow -1.. [Install NextFlow and Singularity](#1-install-NextFlow-Singularity) +1. [Install NextFlow and Singularity](#1-install-nextflow-and-singularity) 1a. [Install Nextflow](#1a-install-nextflow) 1b. [Install Singularity](#1b-install-singularity) 2. [Download the workflow template files](#2-download-the-workflow-template-files) -3. [Modify the variables in the nextflow.config file](#3-modify-the-variables-in-the-config-file) -4. [Run the workflow](#4-run-the-workflow) +3. [Modify the variables in the nextflow.config file](#3-modify-the-variables-in-the-nextflowconfig-file) +4. [Run the workflow](#4-run-the-workflow)
---

-### 1.Install Nextflow and Singularity +### 1. Install Nextflow and Singularity

#### 1a. Install Nextflow

From 00674b65cfe6d1c88f0089d0595ebff31c713a4b Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:27:27 -0700 Subject: [PATCH 30/33] Update and rename nextflow.config --- .../{Remove_Human_reads.config => nextflow.config} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/{Remove_Human_reads.config => nextflow.config} (85%) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config similarity index 85% rename from Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config rename to Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config index 3537336a..cde9e0a5 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_reads.config +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config @@ -6,7 +6,7 @@ params.single_end = false // single-end reads (false if paired-end) params.specify_reads = true //if true, only process reads specified by the sample_id_list -params.sample_id_list = "/workspace/GeneLab_Data_Processing/rmv/unique_sample_ids.txt" //list of sample IDs to proccess if specify_reads is true +params.sample_id_list = "/path/to/unique_sample_ids.txt" //list of sample IDs to process if specify_reads is true params.reads_dir = "$projectDir/example-reads_PE/" //directory to find sample reads @@ -28,7 +28,7 @@ params.num_threads = 2 params.kraken_output_dir = "$projectDir/kraken2-outputs" //location to output files, relative to wd or full path params.human_db_name = 'kraken2-human-db' // -params.human_db_path = "$projectDir/${params.human_db_name}" +params.human_db_path = "/path/to/kraken2/database/${params.human_db_name}" From 87c03d83bbac8e44e9d39afbd47f3e193e85b29b Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Wed, 26 Feb 2025 09:55:35 -0800 Subject: [PATCH 31/33] Update README.md making some fixes as per Barbara's comments --- .../Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md index fb83cb8f..c3983204 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/README.md @@ -137,15 +137,14 @@ ### 4.
Run the workflow

-While in the directory holding the NextFlow file, nextflow.config file, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow:
+While in the workflow_code directory containing the NextFlow file and nextflow.config file you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow:

```bash
-nextflow run *path/to/Remove_Human_Reads.nf* -resume -ansi-log false --DL_kraken false
+nextflow run *path/to/Remove_Human_Reads.nf* -resume --DL_kraken false
```

* `-resume` – continues to run the workflow using cached data from the previous run
-* `-ansi-log false` – specifies to print out each command being run to the screen instead of dynamically updating the log
* `--specify_reads false` – processes all reads in the working directory, without requiring a sample ID list
* `--single_end true` – indicates reads are single-end
* `--DL_kraken true` – runs a process before the rest of the workflow to download and install the necessary database (set it to `false`, as in the example above, if the database is already available at `human_db_path`).

From d4f25b33960f2d0648632b88edbedb74d3cc43a9 Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:00:17 -0800 Subject: [PATCH 32/33] Update Remove_Human_Reads.nf corrected variable name --- .../NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf index b3f0b381..4f428661 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/Remove_Human_Reads.nf @@ -6,7 +6,7 @@ log.info """\
     Download DB: ${params.DL_kraken}
     Single end reads: ${params.single_end}
     Use SampleID file: ${params.specify_reads}
-    Outputs: ${params.human_db_path}
+    Outputs: ${params.kraken_output_dir}
     """
     .stripIndent()

From 2a3cbc57040c1bf83730b9750fbe8619afeb909d Mon Sep 17 00:00:00 2001 From: kieranmbrown <60366396+kieranmbrown@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:57:09 -0800 Subject: [PATCH 33/33] Update nextflow.config small changes for clarity --- .../workflow_code/nextflow.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config index cde9e0a5..5b6a6cf4 100644 --- a/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config +++ b/Metagenomics/Remove_human_reads_from_raw_data/Workflow_Documentation/NF_MGRemoveHumanReads-A/workflow_code/nextflow.config @@ -8,17 +8,17 @@ params.specify_reads = true //if true, only process reads specified by the sample_id_list params.sample_id_list = "/path/to/unique_sample_ids.txt" //list of sample IDs to process if specify_reads is true -params.reads_dir =
"$projectDir/example-reads_PE/" //directory to find sample reads, format "$projectDir/path/to/reads/" params.PE_reads_suffix = "_R{1,2}.fastq.gz" //raw read suffixes (region following the unique part of the sample names) //e.g. for "Sample-1_R1/2_raw.fastq.gz" would be "_R1_raw.fastq.gz" -params.PE_reads_out_suffix = "_R#_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files +params.PE_reads_out_suffix = "_R#_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files -params.SE_reads_suffix = "_raw.fastq.gz" //if single-end, set this. raw read suffixes which follow the unique part of sample name +params.SE_reads_suffix = "_raw.fastq.gz" // Raw read suffixes which follow the unique part of sample name. Only referenced if 'single_end' is set to true -params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files +params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" //suffix to use for final (human reads removed) output files. Only referenced if 'single_end' is set to true @@ -26,7 +26,7 @@ params.SE_reads_out_suffix = "_raw_hrRemoved.fastq" //suffix to use for final ( //Only change if desired: params.num_threads = 2 -params.kraken_output_dir = "$projectDir/kraken2-outputs" //location to output files, relative to wd or full path +params.kraken_output_dir = "$projectDir/kraken2-outputs" //location to output files, relative to wd or full pathy. Note: NextFlow references the working directory as $projectDir params.human_db_name = 'kraken2-human-db' // params.human_db_path = "/path/to/kraken2/database/${params.human_db_name}" singularity {