Skip to content

Commit c69ab30

Browse files
authored
Merge pull request #3 from aweich/cmod
Cmod
2 parents 0103dde + 6243f93 commit c69ab30

11 files changed

+762
-465
lines changed

config/config_CD19_BBz.yml

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Long-read data
2-
experiment: "BBz"
2+
experiment: "annotation_test_cmod"
33
samples:
44
MK028_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK028_BBz_pooled.bam"
55
MK014_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK014_BBz_pooled.bam"
@@ -27,26 +27,28 @@ ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBa
2727
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
2828
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
2929
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
30-
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
30+
annotate_ucsc_tf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
3131
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
32-
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
32+
annotate_gencode: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
3333
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
3434
#ucsc exons
35-
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
36-
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
35+
annotate_ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
36+
annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
3737
#promoters
38-
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
38+
annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
39+
#enhancer
40+
annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
3941
#sedb
4042
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
4143
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
4244
#tss
4345
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
4446
#miRNA
45-
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
47+
annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
4648
#cancer genes
47-
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
49+
annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
4850
#hiC
49-
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
51+
annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
5052
#VIS detection
5153
detection: "rules/detection.smk"
5254
#qc rule collection
@@ -61,3 +63,6 @@ downstream: "rules/downstream.smk"
6163
#epigenetics: "rules/epigenetics.smk"
6264
#variants rule collection WIP
6365
#variants: "rules/variants.smk"
66+
base_modifications: "rules/base_modifications.smk"
67+
plot_functional_genomics: "rules/plot_functional_genomics.smk"
68+
threads: 10

config/config_Clone1.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Long-read data
2+
experiment: "Clone1_cmod_28z"
3+
samples:
4+
Clone1: "/home/weichan/permanent/Projects/VIS/Data/CAR_clonality/combined_first_clone_sup.bam"
5+
processing_dir: "/home/weichan/temporary/Projects/VIS_out"
6+
#source_dir: "Src/"
7+
insertion_fasta: "/home/weichan/permanent/Projects/VIS/Data/pSLCAR-CD19-28z/pSLCAR-CD19-28z.fa"
8+
blastn_db: "/home/weichan/blastNdb/blastNdb" #human nucleotides to see which parts of the human genome match the CAR construct
9+
# protein BLASTp database
10+
#proteindb: "/home/weichan/swissprot/swissprot"
11+
splitmode: "Buffer" #"Buffer", "Join", "Separated"
12+
fragment_size: 100 #input for custom function
13+
bridging_size: 300 #amount of bases that are bridged if fragments do not match between other fragments
14+
MinLength: 1
15+
MAPQ: 20 #mapping quality filter after pseudo-read generation
16+
MinInsertionLength: 500 #212 eef1a length,cd247 1600nt. 1182 in car123 maybe 1200 with buffer?
17+
ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBasecalling/hg38.fa"
18+
#ref_genome_ctrl: "/home/weichan/permanent/Src/T2T_ref/chm13v2.0.fa"
19+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/CD123/hg38_CD123.fasta" #might still contain empty last row!
20+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/VIS_Magdeburg_withBasecalling/hg38_CD19.fasta"
21+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/pSLCAR-CD19-28z/hg38_CD19_28z.fa"
22+
#ucsc_repeats: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_RepeatMasker_02_24"
23+
#all bed files need to be sorted and without the header
24+
#ucsc_H3K4Me1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me1_02_24"
25+
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
26+
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
27+
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
28+
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
29+
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
30+
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
31+
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
32+
#ucsc exons
33+
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
34+
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
35+
#promoters
36+
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
37+
#sedb
38+
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
39+
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
40+
#tss
41+
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
42+
#miRNA
43+
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
44+
#cancer genes
45+
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
46+
#hiC
47+
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
48+
#VIS detection
49+
detection: "rules/detection.smk"
50+
#qc rule collection
51+
quality_control: "rules/qc.smk"
52+
#functional genomics rule collection
53+
functional_genomics: "rules/functional_genomics.smk"
54+
# downstream analysis rules; not as formally strict
55+
downstream: "rules/downstream.smk"
56+
#rules that are still under development WIP
57+
#development: "rules/development.smk"
58+
#epigenetics rule collection WIP
59+
#epigenetics: "rules/epigenetics.smk"
60+
#variants rule collection WIP
61+
#variants: "rules/variants.smk"
62+
cmod: "rules/cmod.smk"
63+
threads: 10

workflow/Snakefile

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,45 +12,58 @@ SAMPLES = expand(config["samples"])
1212
outdir = os.path.join(config["processing_dir"],str(config["experiment"]))
1313
fragmentsize=config["fragment_size"]
1414

15+
# needs to be improved somehow. Maybe even by removing the global import and replacing every single fucntion with a local import line
16+
#local functions - path to helper fucntions needs to be added to the sys path, otherwise import won't find the file
17+
rootpath = os.path.join("workflow/scripts")
18+
sys.path.append(rootpath)
19+
cwd=os.getcwd()
20+
1521
import VIS_helper_functions as vhf #custom functions to make snakemake pipeline leaner
1622

17-
#inmport rules
23+
#inmport default rules
1824
include: config["detection"]
19-
include: config["functional_genomics"]
2025
include: config["quality_control"]
2126

27+
#conditional rule all based on defined rules
28+
conditional_output = list()
2229

23-
#analysis-specific rules
24-
#include: config["downstream"]
30+
if "functional_genomics" in config:
31+
import VIS_functional_genomics_helper_functions as vhf_fg
32+
include: config["functional_genomics"]
33+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
34+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
35+
conditional_output.append(expand(f"{outdir}/intermediate/localization/annotation/Annotation_{{annotation}}_Insertions_{{sample}}.bed",
36+
annotation=[k.replace("annotate_", "") for k in config if k.startswith("annotate_")],
37+
sample=SAMPLES))
2538

26-
#development
27-
#include: config["epigenetics"]
28-
#include: config["variants"]
29-
#include: config["development"]
39+
if "plot_functional_genomics" in config:
40+
import VIS_plot_functional_genomics_helper_functions as vhf_pfg
41+
include: config["plot_functional_genomics"]
42+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES))
43+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES))
3044

45+
if "base_modifications" in config:
46+
include: config["base_modifications"]
47+
conditional_output.append(expand(f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
48+
conditional_output.append(expand(f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
3149

32-
#target rule
3350
rule all:
34-
input:
51+
input:
3552
#detection
3653
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
3754
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
3855
f"{outdir}/final/localization/Insertion_length.png",
39-
40-
#functional genomics
41-
expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES),
42-
expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES),
43-
expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES),
44-
expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES),
4556
#quality control
4657
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
4758
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
4859
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
4960
f"{outdir}/final/qc/multiqc_report.html",
5061
# process
5162
f"{outdir}/config_settings.yml",
52-
53-
# downstream
63+
# other output
64+
conditional_output
65+
# downstream
5466
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
5567
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),
56-
68+
# for msa
69+
#expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES)

workflow/envs/VIS_modkit_env.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: VIS_modkit_env
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
dependencies:
7+
- ont-modkit
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
######
2+
######
3+
###### Base modifications of reads with isnertions
4+
######
5+
######
6+
7+
# Notes: Methylartist works in principle, but there is one very interesting problem: SInce the Insertion are usually cut out from the optimal alignment, their modifications cannot be seen anymore.
8+
# This means that it is not possible to have an idea about what is going on at the exact insertiopn if we use the common tools! -> this makes manual labor necessary
9+
10+
rule readnames_final:
11+
input:
12+
final_insertions=f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed"
13+
log:
14+
log=f"{outdir}/intermediate/log/base_modifications/readnames_final/{{sample}}.log"
15+
output:
16+
readnames=temp(f"{outdir}/intermediate/base_modifications/final_insertion_readnames_{{sample}}.txt")
17+
conda:
18+
"../envs/VIS_dummy_env.yml"
19+
shell:
20+
"""
21+
(
22+
cut {input} -f 4 > {output}
23+
) > {log.log} 2>&1
24+
"""
25+
26+
rule only_keep_filtered_insertions:
27+
input:
28+
isobam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
29+
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
30+
log:
31+
log=f"{outdir}/intermediate/log/base_modifications/only_keep_filtered_insertions/{{sample}}.log"
32+
output:
33+
bam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
34+
conda:
35+
"../envs/VIS_samtools_env.yml"
36+
shell:
37+
"""
38+
(
39+
samtools view -h -b -N {input.readnames} {input.isobam} | samtools sort > {output.bam}
40+
samtools index {output.bam}
41+
) > {log.log} 2>&1
42+
"""
43+
44+
rule modkit:
45+
input:
46+
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
47+
log:
48+
log=f"{outdir}/intermediate/log/base_modifications/modkit/{{sample}}.log"
49+
output:
50+
tsv=f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv"
51+
conda:
52+
"../envs/VIS_modkit_env.yml"
53+
threads: config["threads"]
54+
shell:
55+
"""
56+
(
57+
modkit extract full -t {threads} {input.isobam} {output.tsv}
58+
) > {log.log} 2>&1
59+
"""
60+
61+
rule call_modkit:
62+
input:
63+
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
64+
log:
65+
log=f"{outdir}/intermediate/log/base_modifications/call_modkit/{{sample}}.log"
66+
output:
67+
tsv=f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv"
68+
conda:
69+
"../envs/VIS_modkit_env.yml"
70+
threads: config["threads"]
71+
shell:
72+
"""
73+
(
74+
modkit extract calls -t {threads} {input.isobam} {output.tsv}
75+
) > {log.log} 2>&1
76+
"""
77+
78+
'''
79+
rule specific_methylartist:
80+
input:
81+
bam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
82+
ref=f"{outdir}/intermediate/mapping/insertion_ref_genome.fa" #must be indexed!
83+
log:
84+
log=f"{outdir}/intermediate/log/cmod/specific_methylartist/{{sample}}.log"
85+
output:
86+
plot=f"{outdir}/intermediate/cmod/Region3_Reads_{{sample}}.png"
87+
conda:
88+
"../envs/VIS_methylartist_env.yml"
89+
shell:
90+
"""
91+
(
92+
methylartist locus -b {input.bam} -i chr17:31124037-31180287 -n C -r {input.ref} -l 31154037-31159287 -o {output.plot}
93+
) > {log.log} 2>&1
94+
"""
95+
96+
97+
rule inserted_seq:
98+
input:
99+
insertion=f"{outdir}/intermediate/fasta/Isolated_Reads_{{sample}}.fa",
100+
coordinates=f"{outdir}/intermediate/blastn/Coordinates_{fragmentsize}_InsertionMatches_{{sample}}.blastn"
101+
log:
102+
log=f"{outdir}/intermediate/log/cmod/inserted_seq/{{sample}}.log"
103+
output:
104+
fasta=f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa"
105+
run:
106+
try:
107+
vhf.get_inserted_fasta_seq(input.insertion, input.coordinates, output[0], log.log)
108+
except Exception as e:
109+
with open(log.log, "a") as log_file:
110+
log_file.write(f"Error: {str(e)}\n")
111+
'''

0 commit comments

Comments
 (0)