From 16fa31ee61c8a2085ba56023e8b513005ee33941 Mon Sep 17 00:00:00 2001 From: Huiting Xu Date: Thu, 14 Mar 2024 11:45:16 +0100 Subject: [PATCH 1/7] add_PE_function --- Snakefile | 21 +++++++++++-- rules/align.smk | 79 +++++++++++++++++++++++++++++++---------------- rules/summary.smk | 8 ++--- rules/trim.smk | 61 ++++++++++++++++++++++++------------ 4 files changed, 118 insertions(+), 51 deletions(-) diff --git a/Snakefile b/Snakefile index 871161f..923f090 100644 --- a/Snakefile +++ b/Snakefile @@ -9,20 +9,37 @@ from snakemake.utils import min_version min_version("5.2.0") configfile: "config.defaults.yml" -sample_files = snakemake.utils.listfiles(config["fastq_file_pattern"]) +sample_files = snakemake.utils.listfiles(config["fastq_file_pattern"]+"/{sample}.fastq.gz") samples = dict((y[0], x) for x, y in sample_files) assert len(samples) > 0, "ERROR: No fastq files were found using pattern '{}' (set in configfile)".format(config["fastq_file_pattern"]) +SAMPLES_ALL = glob_wildcards(config['fastq_file_pattern']+"/{sample}.fastq.gz").sample +SAMPLES_PAIRED = glob_wildcards(config['fastq_file_pattern']+"/{sample}_R1_001.fastq.gz").sample log_dir = config["results_dir"] + "/logs" def get_fastq(wildcards): return samples[wildcards.sample] +if_SE=all("R2" not in name for name in samples.keys()) + +def get_matched_fastq(wildcards): + paired_files = [samples[i] for i in samples.keys() if ("R1" in i or "R2" in i) and wildcards.sample in i] + print(wildcards) + print(paired_files) + if len(paired_files) == 2 : + return paired_files + else: + raise ValueError(f"Error in matched pairs {wildcards.sample}") + +def get_paired_fastq(wildcards): + return get_matched_fastq(wildcards) + + rule all: input: config["results_dir"] + "/multiqc.html", - config["results_dir"] + "/combined_gene_counts.tsv" + #config["results_dir"] + "/combined_gene_counts.tsv" #expand("working/trimmed/{sample}.fastq.gz", sample=samples.keys()) include: "rules/fastqc.smk" diff --git a/rules/align.smk b/rules/align.smk index af8f184..d01f260 100644 --- a/rules/align.smk +++ b/rules/align.smk @@ -24,29 +24,56 @@ rule star_genome_index: "--runThreadN {threads} " ">{log:q} 2>&1" - -rule star_align: - input: - fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz", - genome_index=star_genome_dir - output: - bam=config["working_dir"] + "/star/{sample}/Aligned.sortedByCoord.out.bam", - log=config["working_dir"] + "/star/{sample}/Log.final.out" - params: - prefix=config["working_dir"] + "/star/{sample}/" - log: - log_dir + "/star/{sample}.log" - threads: 32 - conda: - "../envs/star.yml" - shell: - "STAR " - "{config[params][star][extra]} " - "--runThreadN {threads} " - "--genomeDir {input.genome_index:q} " - "--readFilesIn {input.fastq:q} " - "--readFilesCommand {config[params][star][zcat_command]} " - "--outSAMtype BAM SortedByCoordinate " - "--outFileNamePrefix {params.prefix:q} " - "--outStd Log " - ">{log:q} 2>&1" +if if_SE: + rule star_align_SE: + input: + fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz", + genome_index=star_genome_dir + output: + bam=config["working_dir"] + "/star/{sample}/Aligned.sortedByCoord.out.bam", + log=config["working_dir"] + "/star/{sample}/Log.final.out" + params: + prefix=config["working_dir"] + "/star/{sample}/" + log: + log_dir + "/star/{sample}.log" + threads: 32 + conda: + "../envs/star.yml" + shell: + "STAR " + "{config[params][star][extra]} " + "--runThreadN {threads} " + "--genomeDir {input.genome_index:q} " + "--readFilesIn {input.fastq:q} " + "--readFilesCommand {config[params][star][zcat_command]} " + "--outSAMtype BAM SortedByCoordinate " + "--outFileNamePrefix {params.prefix:q} " + "--outStd Log " + ">{log:q} 2>&1" +else: + rule star_align_PE: + input: + fastq1=config["working_dir"] + "/trimmed/{sample}_R1.fastq.gz", + fastq2=config["working_dir"] + "/trimmed/{sample}_R2.fastq.gz", + genome_index=star_genome_dir + output: + bam=config["working_dir"] + "/star/{sample}/Aligned.sortedByCoord.out.bam", + log=config["working_dir"] + "/star/{sample}/Log.final.out" + params: + prefix=config["working_dir"] + "/star/{sample}/" + log: + log_dir + "/star/{sample}.log" + threads: 32 + conda: + "../envs/star.yml" + shell: + "STAR " + "{config[params][star][extra]} " + "--runThreadN {threads} " + "--genomeDir {input.genome_index:q} " + "--readFilesIn {input.fastq1:q},{input.fastq2:q} " + "--readFilesCommand {config[params][star][zcat_command]} " + "--outSAMtype BAM SortedByCoordinate " + "--outFileNamePrefix {params.prefix:q} " + "--outStd Log " + ">{log:q} 2>&1" diff --git a/rules/summary.smk b/rules/summary.smk index 0e88c09..a0a0d44 100644 --- a/rules/summary.smk +++ b/rules/summary.smk @@ -15,10 +15,10 @@ from os import path rule multiqc: input: - expand(log_dir + "/trimmomatic/{sample}.log", sample=samples.keys()), - expand(config["working_dir"] + "/star/{sample}/Log.final.out", sample=samples.keys()), - expand(config["working_dir"] + "/featurecounts/{sample}_counts.tsv.summary", sample=samples.keys()), - expand(config["working_dir"] + "/fastqc/{sample}_fastqc.zip", sample=samples.keys()) + expand(log_dir + "/trimmomatic/{sample}.log", sample = SAMPLES_ALL if if_SE else SAMPLES_PAIRED), + expand(config["working_dir"] + "/star/{sample}/Log.final.out", sample = SAMPLES_ALL if if_SE else SAMPLES_PAIRED), + expand(config["working_dir"] + "/featurecounts/{sample}_counts.tsv.summary", sample = SAMPLES_ALL if if_SE else SAMPLES_PAIRED), + expand(config["working_dir"] + "/fastqc/{sample}_fastqc.zip", sample=SAMPLES_ALL) output: config["results_dir"] + "/multiqc.html" params: diff --git a/rules/trim.smk b/rules/trim.smk index 359d91d..3e75261 100644 --- a/rules/trim.smk +++ b/rules/trim.smk @@ -1,19 +1,42 @@ -rule trimmomatic: - input: - get_fastq - output: - fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz" - params: - trimmer=" ".join(config["params"]["trimmomatic"]["trimmers"]), - extra=config["params"]["trimmomatic"]["extra"] - log: - log_dir + "/trimmomatic/{sample}.log" - threads: - 32 - conda: - "../envs/trimmomatic.yml" - shell: - "trimmomatic SE {params.extra} " - "{input:q} {output:q} " - "{params.trimmer} " - ">{log} 2>&1" +if if_SE: + rule trimmomatic_SE: + input: + get_fastq + output: + fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz" + params: + trimmer=" ".join(config["params"]["trimmomatic"]["trimmers"]), + extra=config["params"]["trimmomatic"]["extra"] + log: + log_dir + "/trimmomatic/{sample}.log" + threads: + 32 + conda: + "../envs/trimmomatic.yml" + shell: + "trimmomatic SE {params.extra} " + "{input:q} {output:q} " + "{params.trimmer} " + ">{log} 2>&1" + +else: + rule trimmomatic_PE: + input: + get_paired_fastq + output: + fastq1=config["working_dir"] + "/trimmed/{sample}_R1.fastq.gz", + fastq2=config["working_dir"] + "/trimmed/{sample}_R2.fastq.gz" + params: + trimmer=" ".join(config["params"]["trimmomatic"]["trimmers"]), + extra=config["params"]["trimmomatic"]["extra"] + log: + log_dir + "/trimmomatic/{sample}.log" + threads: + 32 + conda: + "../envs/trimmomatic.yml" + shell: + "trimmomatic PE {params.extra} " + "{input:q} {output:q} " + "{params.trimmer} " + ">{log} 2>&1" From a533806a5e74de2eecdfa5b7826daaa8b73d1328 Mon Sep 17 00:00:00 2001 From: Ahuiting <104276232+Ahuiting@users.noreply.github.com> Date: Thu, 14 Mar 2024 12:44:08 +0100 Subject: [PATCH 2/7] Update config.defaults.yml --- config.defaults.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.defaults.yml b/config.defaults.yml index eab4715..4275f1b 100644 --- a/config.defaults.yml +++ b/config.defaults.yml @@ -1,7 +1,7 @@ # Pattern to input fastq files, use {sample} to indicate portion of filename # that corresponds to the sample name (e.g. data/{sample}.fq.gz) # The pipeline will automatically detect matching files and process each of them -fastq_file_pattern: "data/{sample}.fastq.gz" +fastq_file_pattern: "data" # Output directories # Directory to put intermediate working files (trimmed reads, bam files, etc.) From fced0941977b73505fc8426ce2b635785c312a4d Mon Sep 17 00:00:00 2001 From: Ahuiting Date: Thu, 14 Mar 2024 16:50:23 +0100 Subject: [PATCH 3/7] debug_rules --- rules/align.smk | 6 +++--- rules/trim.smk | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rules/align.smk b/rules/align.smk index d01f260..ce01fc5 100644 --- a/rules/align.smk +++ b/rules/align.smk @@ -53,8 +53,8 @@ if if_SE: else: rule star_align_PE: input: - fastq1=config["working_dir"] + "/trimmed/{sample}_R1.fastq.gz", - fastq2=config["working_dir"] + "/trimmed/{sample}_R2.fastq.gz", + fastq1=config["working_dir"] + "/trimmed/{sample}_R1_paired.fastq.gz", + fastq2=config["working_dir"] + "/trimmed/{sample}_R2_paired.fastq.gz", genome_index=star_genome_dir output: bam=config["working_dir"] + "/star/{sample}/Aligned.sortedByCoord.out.bam", @@ -71,7 +71,7 @@ else: "{config[params][star][extra]} " "--runThreadN {threads} " "--genomeDir {input.genome_index:q} " - "--readFilesIn {input.fastq1:q},{input.fastq2:q} " + "--readFilesIn {input.fastq1:q} {input.fastq2:q} " "--readFilesCommand {config[params][star][zcat_command]} " "--outSAMtype BAM SortedByCoordinate " "--outFileNamePrefix {params.prefix:q} " diff --git a/rules/trim.smk b/rules/trim.smk index 3e75261..0536968 100644 --- a/rules/trim.smk +++ b/rules/trim.smk @@ -3,7 +3,7 @@ if if_SE: input: get_fastq output: - fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz" + fastq=config["working_dir"] + "/trimmed/{sample}.fastq.gz", params: trimmer=" ".join(config["params"]["trimmomatic"]["trimmers"]), extra=config["params"]["trimmomatic"]["extra"] @@ -24,8 +24,10 @@ else: input: get_paired_fastq output: - fastq1=config["working_dir"] + "/trimmed/{sample}_R1.fastq.gz", - fastq2=config["working_dir"] + "/trimmed/{sample}_R2.fastq.gz" + paired1=config["working_dir"] + "/trimmed/{sample}_R1_paired.fastq.gz", + unpaired1=config["working_dir"] + "/trimmed/{sample}_R1_unpaired.fastq.gz", + paired2=config["working_dir"] + "/trimmed/{sample}_R2_paired.fastq.gz", + unpaired2=config["working_dir"] + "/trimmed/{sample}_R2_unpaired.fastq.gz" params: trimmer=" ".join(config["params"]["trimmomatic"]["trimmers"]), extra=config["params"]["trimmomatic"]["extra"] From af55a1b52ee66a33a4097081a195739da5d4f991 Mon Sep 17 00:00:00 2001 From: Ahuiting <104276232+Ahuiting@users.noreply.github.com> Date: Thu, 14 Mar 2024 16:55:55 +0100 Subject: [PATCH 4/7] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 923f090..1854556 100644 --- a/Snakefile +++ b/Snakefile @@ -39,7 +39,7 @@ def get_paired_fastq(wildcards): rule all: input: config["results_dir"] + "/multiqc.html", - #config["results_dir"] + "/combined_gene_counts.tsv" + config["results_dir"] + "/combined_gene_counts.tsv" #expand("working/trimmed/{sample}.fastq.gz", sample=samples.keys()) include: "rules/fastqc.smk" From 5eb97d68d1ebb2ad2bd7cd93ec29a87b98ead823 Mon Sep 17 00:00:00 2001 From: Ahuiting Date: Thu, 14 Mar 2024 17:13:25 +0100 Subject: [PATCH 5/7] add_PE_function --- rules/count.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/count.smk b/rules/count.smk index 6b7f55e..7b9b9bf 100644 --- a/rules/count.smk +++ b/rules/count.smk @@ -27,7 +27,7 @@ rule count: rule combined_counts: input: expand(config["working_dir"] + "/featurecounts/{sample}_counts.tsv", - sample=samples.keys()) + sample = SAMPLES_ALL if if_SE else SAMPLES_PAIRED) output: config["results_dir"] + "/combined_gene_counts.tsv" run: From 1a73427b302019a62d8bd2c8833665a1d99a470c Mon Sep 17 00:00:00 2001 From: Ahuiting <104276232+Ahuiting@users.noreply.github.com> Date: Thu, 14 Mar 2024 18:04:25 +0100 Subject: [PATCH 6/7] Update Snakefile --- Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Snakefile b/Snakefile index 1854556..cccf1e0 100644 --- a/Snakefile +++ b/Snakefile @@ -24,8 +24,6 @@ if_SE=all("R2" not in name for name in samples.keys()) def get_matched_fastq(wildcards): paired_files = [samples[i] for i in samples.keys() if ("R1" in i or "R2" in i) and wildcards.sample in i] - print(wildcards) - print(paired_files) if len(paired_files) == 2 : return paired_files else: From 626eed01758540700bac980d42aebc207b20367a Mon Sep 17 00:00:00 2001 From: Ahuiting <104276232+Ahuiting@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:35:41 +0100 Subject: [PATCH 7/7] Update Snakefile sort R1 R2 list --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index cccf1e0..83c7ef6 100644 --- a/Snakefile +++ b/Snakefile @@ -20,12 +20,12 @@ log_dir = config["results_dir"] + "/logs" def get_fastq(wildcards): return samples[wildcards.sample] -if_SE=all("R2" not in name for name in samples.keys()) +if_SE = all("R2" not in name for name in samples.keys()) def get_matched_fastq(wildcards): paired_files = [samples[i] for i in samples.keys() if ("R1" in i or "R2" in i) and wildcards.sample in i] if len(paired_files) == 2 : - return paired_files + return sorted(paired_files) else: raise ValueError(f"Error in matched pairs {wildcards.sample}")