Skip to content

Commit dc2147d

Browse files
author
weichan
committed
Major changes to functional genomics. Flexibility increased
1 parent 249cb04 commit dc2147d

File tree

8 files changed

+566
-424
lines changed

8 files changed

+566
-424
lines changed

config/config_CD19_BBz.yml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Long-read data
2-
experiment: "test_cmod"
2+
experiment: "annotation_test_cmod"
33
samples:
44
MK028_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK028_BBz_pooled.bam"
55
MK014_BBz: "/home/weichan/permanent/Projects/VIS/Data/VIS_MPX_pooled/VIS_MPX_MK014_BBz_pooled.bam"
@@ -27,26 +27,26 @@ ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBa
2727
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
2828
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
2929
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
30-
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
30+
annotate_ucsc_tf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
3131
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
32-
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
32+
annotate_gencode: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
3333
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
3434
#ucsc exons
35-
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
36-
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
35+
annotate_ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
36+
annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
3737
#promoters
38-
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
38+
annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
3939
#sedb
4040
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
4141
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
4242
#tss
4343
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
4444
#miRNA
45-
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
45+
annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
4646
#cancer genes
47-
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
47+
annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
4848
#hiC
49-
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
49+
annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
5050
#VIS detection
5151
detection: "rules/detection.smk"
5252
#qc rule collection
@@ -62,4 +62,5 @@ downstream: "rules/downstream.smk"
6262
#variants rule collection WIP
6363
#variants: "rules/variants.smk"
6464
cmod: "rules/cmod.smk"
65-
threads: 25
65+
plot_functional_genomics: "rules/plot_functional_genomics.smk"
66+
threads: 10

config/config_Clone1.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Long-read data
2+
experiment: "Clone1_cmod_28z"
3+
samples:
4+
Clone1: "/home/weichan/permanent/Projects/VIS/Data/CAR_clonality/combined_first_clone_sup.bam"
5+
processing_dir: "/home/weichan/temporary/Projects/VIS_out"
6+
#source_dir: "Src/"
7+
insertion_fasta: "/home/weichan/permanent/Projects/VIS/Data/pSLCAR-CD19-28z/pSLCAR-CD19-28z.fa"
8+
blastn_db: "/home/weichan/blastNdb/blastNdb" #human nucleotides to see which parts of the human genome match the CAR construct
9+
# protein BLASTp database
10+
#proteindb: "/home/weichan/swissprot/swissprot"
11+
splitmode: "Buffer" #"Buffer", "Join", "Separated"
12+
fragment_size: 100 #input for custom function
13+
bridging_size: 300 #amount of bases that are bridged if fragments do not match between other fragments
14+
MinLength: 1
15+
MAPQ: 20 #mapping quality filter after pseudo-read generation
16+
MinInsertionLength: 500 #212 eef1a length,cd247 1600nt. 1182 in car123 maybe 1200 with buffer?
17+
ref_genome_ctrl: "/home/weichan/permanent/Projects/VIS/Data/VIS_Magdeburg_withBasecalling/hg38.fa"
18+
#ref_genome_ctrl: "/home/weichan/permanent/Src/T2T_ref/chm13v2.0.fa"
19+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/CD123/hg38_CD123.fasta" #might still contain empty last row!
20+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/VIS_Magdeburg_withBasecalling/hg38_CD19.fasta"
21+
#ref_genome: "/home/weichan/permanent/Projects/VIS/dev/pSLCAR-CD19-28z/hg38_CD19_28z.fa"
22+
#ucsc_repeats: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_RepeatMasker_02_24"
23+
#all bed files need to be sorted and without the header
24+
#ucsc_H3K4Me1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me1_02_24"
25+
#ucsc_H3K27Ac: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K27Ac_02_24"
26+
#ucsc_H3K4Me3: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_LayeredH3K4Me3_02_24"
27+
#ucsc_DNaseH: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_DNaseClusters_02_24_cut"
28+
ucsc_TF: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TFClusters_processed.bed"
29+
#annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODEV44_processed.bed"
30+
annotation_1: "/home/weichan/permanent/Projects/VIS/dev/UCSC/new_UCSC_GENCODEV44_processed.bed"
31+
#ucsc_Genes_gtf: "/home/weichan/permanent/Projects/VIS/dev/UCSC/gencode.v44.annotation.gtf"
32+
#ucsc exons
33+
ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Exons"
34+
ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
35+
#promoters
36+
ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
37+
#sedb
38+
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
39+
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
40+
#tss
41+
#ucsc_tss: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_TSS_peaks_processed_sorted.bed"
42+
#miRNA
43+
#ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_processed_sorted.bed"
44+
#cancer genes
45+
cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
46+
#hiC
47+
encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
48+
#VIS detection
49+
detection: "rules/detection.smk"
50+
#qc rule collection
51+
quality_control: "rules/qc.smk"
52+
#functional genomics rule collection
53+
functional_genomics: "rules/functional_genomics.smk"
54+
# downstream analysis rules; not as formally strict
55+
downstream: "rules/downstream.smk"
56+
#rules that are still under development WIP
57+
#development: "rules/development.smk"
58+
#epigenetics rule collection WIP
59+
#epigenetics: "rules/epigenetics.smk"
60+
#variants rule collection WIP
61+
#variants: "rules/variants.smk"
62+
cmod: "rules/cmod.smk"
63+
threads: 10

workflow/Snakefile

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import VIS_helper_functions as vhf #custom functions to make snakemake pipeline
2222

2323
#import rules
2424
include: config["detection"]
25-
include: config["functional_genomics"]
2625
include: config["quality_control"]
2726

2827
#analysis-specific rules
@@ -33,35 +32,48 @@ include: config["quality_control"]
3332
#include: config["variants"]
3433
#include: config["development"]
3534

36-
# Optional c-modification rule: Only works if input BAMs have MM tags
35+
36+
#conditional rule all based on defined rules
37+
conditional_output = list()
38+
39+
if "functional_genomics" in config:
40+
include: config["functional_genomics"]
41+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
42+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
43+
conditional_output.append(expand(f"{outdir}/intermediate/localization/annotation/Annotation_{{annotation}}_Insertions_{{sample}}.bed",
44+
annotation=[k.replace("annotate_", "") for k in config if k.startswith("annotate_")],
45+
sample=SAMPLES))
46+
47+
if "plot_functional_genomics" in config:
48+
49+
import VIS_plot_functional_annotation_helper_functions as vhf_fa
50+
include: config["plot_functional_genomics"]
51+
52+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES))
53+
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES))
54+
3755
if "cmod" in config:
3856
include: config["cmod"]
39-
rule all:
40-
input:
41-
expand(f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam", sample=SAMPLES),
42-
expand(f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES),
43-
#expand(f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES)
44-
else:
45-
rule all:
46-
input:
47-
#detection
48-
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
49-
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
50-
f"{outdir}/final/localization/Insertion_length.png",
51-
#functional genomics
52-
expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES),
53-
expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES),
54-
expand(f"{outdir}/intermediate/localization/annotation/Annotation_gene_Insertions_{{sample}}.bed", sample=SAMPLES),
55-
expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES),
56-
#quality control
57-
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
58-
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
59-
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
60-
f"{outdir}/final/qc/multiqc_report.html",
61-
# process
62-
f"{outdir}/config_settings.yml",
63-
# downstream
64-
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
65-
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),
66-
# for msa
67-
#expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES)
57+
conditional_output.append(expand(f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
58+
conditional_output.append(expand(f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
59+
60+
rule all:
61+
input:
62+
#detection
63+
expand(f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed", sample=SAMPLES),
64+
f"{outdir}/final/localization/Heatmap_Insertion_Chr.png",
65+
f"{outdir}/final/localization/Insertion_length.png",
66+
#quality control
67+
expand(f"{outdir}/final/qc/mapq/{{sample}}_mapq_heatmap_image.png", sample=SAMPLES),
68+
expand(f"{outdir}/final/qc/Fragmentation/Insertions/insertions_{fragmentsize}_{{sample}}", sample=SAMPLES),
69+
expand(f"{outdir}/final/qc/Fragmentation/Longest_Interval/{{sample}}/", sample=SAMPLES),
70+
f"{outdir}/final/qc/multiqc_report.html",
71+
# process
72+
f"{outdir}/config_settings.yml",
73+
# other output
74+
conditional_output
75+
# downstream
76+
#expand(f"{outdir}/intermediate/blastn/Filtered_Annotated_{fragmentsize}_InsertionMatches_{{sample}}.gff", sample=SAMPLES),
77+
#expand(f"{outdir}/final/functional_genomics/localization/{fragmentsize}_{{sample}}", sample=SAMPLES),
78+
# for msa
79+
#expand(f"{outdir}/intermediate/fasta/Inserted_sequence_{{sample}}.fa", sample=SAMPLES)

workflow/rules/cmod.smk

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ rule modkit:
5858
modkit extract full -t 20 {input.isobam} {output.tsv}
5959
) > {log.log} 2>&1
6060
"""
61-
''''
61+
6262
rule call_modkit:
6363
input:
6464
isobam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
@@ -71,11 +71,11 @@ rule call_modkit:
7171
shell:
7272
"""
7373
(
74-
modkit call-mods {input.isobam} - | modkit extract - {output.tsv}
74+
modkit extract calls -t 20 {input.isobam} {output.tsv}
7575
) > {log.log} 2>&1
7676
"""
7777

78-
'''
78+
7979
'''
8080
rule specific_methylartist:
8181
input:

0 commit comments

Comments
 (0)