Skip to content

Commit 249cb04

Browse files
author
weichan
committed
Added cmod functionality
1 parent 0103dde commit 249cb04

File tree

7 files changed

+250
-43
lines changed

7 files changed

+250
-43
lines changed

config/config_CD19_BBz.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Long-read data
2-
experiment: "BBz"
2+
experiment: "test_cmod"
33
samples:
44
MK028_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK028_BBz_pooled.bam"
55
MK014_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK014_BBz_pooled.bam"
@@ -61,3 +61,5 @@ downstream: "rules/downstream.smk"
6161
#epigenetics: "rules/epigenetics.smk"
6262
#variants rule collection WIP
6363
#variants: "rules/variants.smk"
64+
cmod: "rules/cmod.smk"
65+
threads: 25

workflow/Snakefile

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,19 @@ SAMPLES = expand(config["samples"])
1212
outdir = os.path.join(config["processing_dir"],str(config["experiment"]))
1313
fragmentsize=config["fragment_size"]
1414

15+
# needs to be improved somehow. Maybe even by removing the global import and replacing every single fucntion with a local import line
16+
#local functions - path to helper fucntions needs to be added to the sys path, otherwise import won't find the file
17+
rootpath = os.path.join("workflow/scripts")
18+
sys.path.append(rootpath)
19+
cwd=os.getcwd()
20+
1521
import VIS_helper_functions as vhf #custom functions to make snakemake pipeline leaner
1622

1723
#inmport rules
1824
include: config["detection"]
1925
include: config["functional_genomics"]
2026
include: config["quality_control"]
2127

22-
2328
#analysis-specific rules
2429
#include: config["downstream"]
2530

@@ -28,29 +33,35 @@ include: config["quality_control"]
2833
#include: config["variants"]
2934
#include: config["development"]
3035

31-
32-
#target rule
33-
rule all:
34-
input:
35-
#detection
36-
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
37-
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
38-
f"{outdir}/final/localization/Insertion_length.png",
39-
40-
#functional genomics
41-
expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES),
42-
expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES),
43-
expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES),
44-
expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES),
45-
#quality control
46-
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
47-
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
48-
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
49-
f"{outdir}/final/qc/multiqc_report.html",
50-
# process
51-
f"{outdir}/config_settings.yml",
52-
53-
# downstream
54-
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
55-
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),
56-
36+
# Optional c-modification rule: Only works if input BAMs have MM tags
37+
if "cmod" in config:
38+
include: config["cmod"]
39+
rule all:
40+
input:
41+
expand(f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam", sample=SAMPLES),
42+
expand(f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES),
43+
#expand(f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES)
44+
else:
45+
rule all:
46+
input:
47+
#detection
48+
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
49+
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
50+
f"{outdir}/final/localization/Insertion_length.png",
51+
#functional genomics
52+
expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES),
53+
expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES),
54+
expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES),
55+
expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES),
56+
#quality control
57+
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
58+
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
59+
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
60+
f"{outdir}/final/qc/multiqc_report.html",
61+
# process
62+
f"{outdir}/config_settings.yml",
63+
# downstream
64+
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
65+
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),
66+
# for msa
67+
#expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES)

workflow/envs/VIS_modkit_env.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: VIS_modkit_env
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
dependencies:
7+
- ont-modkit

workflow/rules/cmod.smk

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
######
2+
######
3+
###### C-modifications of reads with isnertions
4+
######
5+
######
6+
7+
# Notes: Methylartist works in principle, but there is one very interesting problem: SInce the Insertion are usually cut out from the optimal alignment, their modifications cannot be seen anymore.
8+
# This means that it is not possible to have an idea about what is going on at the exact insertiopn if we use the common tools! -> this makes manual labor necessary
9+
10+
rule readnames_final:
11+
input:
12+
final_insertions=f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed"
13+
log:
14+
log=f"{outdir}/intermediate/log/cmod/readnames_final/{{sample}}.log"
15+
output:
16+
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
17+
conda:
18+
"../envs/VIS_dummy_env.yml"
19+
shell:
20+
"""
21+
(
22+
cut {input} -f 4 > {output}
23+
) > {log.log} 2>&1
24+
"""
25+
26+
27+
rule only_keep_valid_insertions:
28+
input:
29+
isobam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
30+
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
31+
log:
32+
log=f"{outdir}/intermediate/log/cmod/filter/{{sample}}.log"
33+
output:
34+
bam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
35+
conda:
36+
"../envs/VIS_samtools_env.yml"
37+
shell:
38+
"""
39+
(
40+
samtools view -h -b -N {input.readnames} {input.isobam} | samtools sort > {output.bam}
41+
samtools index {output.bam}
42+
) > {log.log} 2>&1
43+
"""
44+
45+
46+
rule modkit:
47+
input:
48+
isobam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
49+
log:
50+
log=f"{outdir}/intermediate/log/cmod/modkit/{{sample}}.log"
51+
output:
52+
tsv=f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv"
53+
conda:
54+
"../envs/VIS_modkit_env.yml"
55+
shell:
56+
"""
57+
(
58+
modkit extract full -t 20 {input.isobam} {output.tsv}
59+
) > {log.log} 2>&1
60+
"""
61+
''''
62+
rule call_modkit:
63+
input:
64+
isobam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
65+
log:
66+
log=f"{outdir}/intermediate/log/cmod/Calls_modkit/{{sample}}.log"
67+
output:
68+
tsv=f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv"
69+
conda:
70+
"../envs/VIS_modkit_env.yml"
71+
shell:
72+
"""
73+
(
74+
modkit call-mods {input.isobam} - | modkit extract - {output.tsv}
75+
) > {log.log} 2>&1
76+
"""
77+
78+
'''
79+
'''
80+
rule specific_methylartist:
81+
input:
82+
bam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
83+
ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" #must be indexed!
84+
log:
85+
log=f"{outdir}/intermediate/log/cmod/specific_methylartist/{{sample}}.log"
86+
output:
87+
plot=f"{outdir}/intermediate/cmod/Region3_Reads_{{sample}}.png"
88+
conda:
89+
"../envs/VIS_methylartist_env.yml"
90+
shell:
91+
"""
92+
(
93+
methylartist locus -b {input.bam} -i chr17:31124037-31180287 -n C -r {input.ref} -l 31154037-31159287 -o {output.plot}
94+
) > {log.log} 2>&1
95+
"""
96+
'''
97+
98+
rule inserted_seq:
99+
input:
100+
insertion=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa",
101+
coordinates=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn"
102+
log:
103+
log=f"{outdir}/intermediate/log/cmod/inserted_seq/{{sample}}.log"
104+
output:
105+
fasta=f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa"
106+
run:
107+
try:
108+
vhf.get_inserted_fasta_seq(input.insertion, input.coordinates, output[0], log.log)
109+
except Exception as e:
110+
with open(log.log, "a") as log_file:
111+
log_file.write(f"Error: {str(e)}\n")

workflow/rules/detection.smk

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@ rule build_insertion_reference:
2929
log:
3030
log=f"{outdir}/intermediate/log/detection/build_insertion_reference/out.log"
3131
output:
32-
temp(f"{outdir}/intermediate/mapping/insertion_ref_genome.fa")
32+
f"{outdir}/intermediate/mapping/insertion_ref_genome.fa"
3333
conda:
3434
"../envs/VIS_dummy_env.yml"
3535
shell:
3636
"""
3737
(
38-
cat {input.ref} {input.insertion} > {output}
38+
cat {input.ref} {input.insertion} | awk 'NF' > {output}
3939
) > {log.log} 2>&1
4040
"""
4141

@@ -60,19 +60,61 @@ rule minimap_index:
6060

6161
rule make_fasta_without_tags: #fasta of raw data no trimming whatsoever
6262
input:
63-
fq=lambda wildcards: config["samples"][wildcards.sample]
63+
bam=lambda wildcards: config["samples"][wildcards.sample]
6464
log:
6565
log=f"{outdir}/intermediate/log/detection/make_fasta_without_tags/{{sample}}.log"
6666
output:
6767
fasta=f"{outdir}/intermediate/fasta/Full_{{sample}}.fa"
6868
conda:
6969
"../envs/VIS_samtools_env.yml"
7070
shell:
71+
"""
72+
(
73+
samtools fasta {input.bam} -o {output.fasta} > {output.fasta}
74+
) > {log.log} 2>&1
75+
"""
76+
77+
######
78+
######
79+
###### Only use reads with insertions for speed
80+
######
81+
######
82+
83+
rule insertion_reads:
84+
input:
85+
bam=lambda wildcards: config["samples"][wildcards.sample],
86+
readnames=f"{outdir}/intermediate/blastn/Readnames_{fragmentsize}_InsertionMatches_{{sample}}.txt"
87+
log:
88+
log=f"{outdir}/intermediate/log/detection/insertion_reads_cmod/{{sample}}.log"
89+
output:
90+
isobam=f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam"
91+
conda:
92+
"../envs/VIS_samtools_env.yml"
93+
shell:
94+
"""
95+
(
96+
samtools view -b -N {input.readnames} {input.bam} | samtools sort > {output.isobam}
97+
samtools index {output.isobam}
98+
) > {log.log} 2>&1
99+
"""
100+
101+
rule fasta_insertion_reads:
102+
input:
103+
f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam"
104+
log:
105+
log=f"{outdir}/intermediate/log/detection/fasta_insertion_reads_cmod/{{sample}}.log"
106+
output:
107+
f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa"
108+
conda:
109+
"../envs/VIS_samtools_env.yml"
110+
shell:
71111
"""
72112
(
73113
samtools fasta {input} -o {output} > {output}
74114
) > {log.log} 2>&1
75115
"""
116+
117+
76118
######
77119
######
78120
###### "Clean" BAM: Cut-out fasta to BAM via Mapping to reference
@@ -90,19 +132,21 @@ rule Non_insertion_mapping: #mapping against the unaltered referenc egenome
90132
log=f"{outdir}/intermediate/log/detection/Non_insertion_mapping/{{sample}}.log"
91133
resources:
92134
mem_mb=5000
135+
threads: config["threads"]
93136
conda:
94137
"../envs/VIS_minimap_env.yml"
95138
shell: #N=0 instead of default N=1
96139
"""
97140
(
98-
minimap2 -y -ax map-ont --score-N 0 {input.genome} {input.fasta} | samtools sort | samtools view -F 2304 -o {output}
141+
minimap2 -t {threads} -y -ax map-ont --score-N 0 {input.genome} {input.fasta} | samtools sort | samtools view -F 2304 -o {output}
99142
samtools index {output}
100143
) > {log.log} 2>&1
101144
"""
102145

103146
rule insertion_mapping: #conserves tags!
104147
input:
105-
bam=lambda wildcards: config["samples"][wildcards.sample],
148+
#bam=lambda wildcards: config["samples"][wildcards.sample], #full data
149+
bam=f"{outdir}/intermediate/mapping/Isolated_Reads_{{sample}}.bam", #only reads with insertions
106150
minimapref=f"{outdir}/intermediate/mapping/ref_genome_index.mmi",
107151
ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa"
108152
output:
@@ -111,12 +155,13 @@ rule insertion_mapping: #conserves tags!
111155
log=f"{outdir}/intermediate/log/detection/insertion_mapping/{{sample}}.log"
112156
resources:
113157
mem_mb=5000
158+
threads: config["threads"]
114159
conda:
115160
"../envs/VIS_minimap_env.yml"
116161
shell:
117162
"""
118163
(
119-
samtools bam2fq -T '*' {input.bam}| minimap2 -y -ax map-ont {input.minimapref} - | samtools sort | samtools view -F 2304 -o {output}
164+
samtools bam2fq -T '*' {input.bam}| minimap2 -t {threads} -y -ax map-ont {input.minimapref} - | samtools sort | samtools view -F 2304 -o {output}
120165
samtools index {output}
121166
) > {log.log} 2>&1
122167
"""
@@ -196,7 +241,8 @@ rule get_coordinates_for_fasta: #filters and combines matches
196241
rule split_fasta:
197242
input:
198243
breakpoints=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn",
199-
fasta=f"{outdir}/intermediate/fasta/Full_{{sample}}.fa"
244+
#fasta=f"{outdir}/intermediate/fasta/Full_{{sample}}.fa"
245+
fasta=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa"
200246
params:
201247
mode=config["splitmode"] #if each split fasta substring should be used individually, use "Separated" Join, New mode: Buffer
202248
log:
@@ -287,6 +333,8 @@ rule find_insertion_BLASTn:
287333
log=f"{outdir}/intermediate/log/detection/find_insertion_BLASTn/{{sample}}.log"
288334
output:
289335
temp(f"{outdir}/intermediate/blastn/{fragmentsize}_InsertionMatches_{{sample}}.blastn")
336+
threads:
337+
config["threads"]
290338
conda:
291339
"../envs/VIS_blastn_env.yml"
292340
shell:
@@ -295,6 +343,7 @@ rule find_insertion_BLASTn:
295343
mkdir {params.tempdir}
296344
297345
blastn \
346+
-num_threads {threads} \
298347
-query {input.fasta} \
299348
-db {input.insertion} \
300349
-out {params.tempdir}/temp_output.blastn \
@@ -315,6 +364,8 @@ rule find_insertion_BLASTn_in_Ref:
315364
fasta=f"{outdir}/intermediate/fasta/fragments/{fragmentsize}_Insertion_fragments.fa"
316365
params:
317366
refdb=config.get("blastn_db", "") # Optional blastn database path
367+
threads:
368+
config["threads"]
318369
log:
319370
log=f"{outdir}/intermediate/log/detection/find_insertion_BLASTn_in_Ref/{{sample}}.log"
320371
output:
@@ -330,6 +381,7 @@ rule find_insertion_BLASTn_in_Ref:
330381
else
331382
# If blastn_db is provided, run the blastn command
332383
blastn \
384+
-num_threads {threads} \
333385
-query {input.fasta} \
334386
-db {params.refdb} \
335387
-out {output} \
@@ -380,13 +432,12 @@ rule extract_by_length:
380432
) > {log.log} 2>&1
381433
"""
382434

435+
383436
######
384437
######
385438
###### Visualisation of insertions
386439
######
387440

388-
389-
390441
rule basic_insertion_plots:
391442
input:
392443
expand(f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES)

0 commit comments

Comments
 (0)