Skip to content

Commit 6243f93

Browse files
author
weichan
committed
Reformatting the output
1 parent dc2147d commit 6243f93

9 files changed

+127
-179
lines changed

config/config_CD19_BBz.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ annotate_ucsc_exons: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE
3636
annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCODE_V44_Introns"
3737
#promoters
3838
annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
39+
#enhancer
40+
annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
3941
#sedb
4042
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
4143
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
@@ -61,6 +63,6 @@ downstream: "rules/downstream.smk"
6163
#epigenetics: "rules/epigenetics.smk"
6264
#variants rule collection WIP
6365
#variants: "rules/variants.smk"
64-
cmod: "rules/cmod.smk"
66+
base_modifications: "rules/base_modifications.smk"
6567
plot_functional_genomics: "rules/plot_functional_genomics.smk"
6668
threads: 10

workflow/Snakefile

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,15 @@ cwd=os.getcwd()
2020

2121
import VIS_helper_functions as vhf #custom functions to make snakemake pipeline leaner
2222

23-
#inmport rules
23+
#inmport default rules
2424
include: config["detection"]
2525
include: config["quality_control"]
2626

27-
#analysis-specific rules
28-
#include: config["downstream"]
29-
30-
#development
31-
#include: config["epigenetics"]
32-
#include: config["variants"]
33-
#include: config["development"]
34-
35-
3627
#conditional rule all based on defined rules
3728
conditional_output = list()
3829

3930
if "functional_genomics" in config:
31+
import VIS_functional_genomics_helper_functions as vhf_fg
4032
include: config["functional_genomics"]
4133
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
4234
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Functional_distances_to_Insertions_{{sample}}.bed", sample=SAMPLES))
@@ -45,17 +37,15 @@ if "functional_genomics" in config:
4537
sample=SAMPLES))
4638

4739
if "plot_functional_genomics" in config:
48-
49-
import VIS_plot_functional_annotation_helper_functions as vhf_fa
40+
import VIS_plot_functional_genomics_helper_functions as vhf_pfg
5041
include: config["plot_functional_genomics"]
51-
5242
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png", sample=SAMPLES))
5343
conditional_output.append(expand(f"{outdir}/final/functional_genomics/Insertion_Scoring_{{sample}}.svg", sample=SAMPLES))
5444

55-
if "cmod" in config:
56-
include: config["cmod"]
57-
conditional_output.append(expand(f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
58-
conditional_output.append(expand(f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
45+
if "base_modifications" in config:
46+
include: config["base_modifications"]
47+
conditional_output.append(expand(f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
48+
conditional_output.append(expand(f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv", sample=SAMPLES))
5949

6050
rule all:
6151
input:
Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
######
22
######
3-
###### C-modifications of reads with isnertions
3+
###### Base modifications of reads with isnertions
44
######
55
######
66

@@ -11,9 +11,9 @@ rule readnames_final:
1111
input:
1212
final_insertions=f"{outdir}/final/localization/ExactInsertions_{{sample}}.bed"
1313
log:
14-
log=f"{outdir}/intermediate/log/cmod/readnames_final/{{sample}}.log"
14+
log=f"{outdir}/intermediate/log/base_modifications/readnames_final/{{sample}}.log"
1515
output:
16-
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
16+
readnames=temp(f"{outdir}/intermediate/base_modifications/final_insertion_readnames_{{sample}}.txt")
1717
conda:
1818
"../envs/VIS_dummy_env.yml"
1919
shell:
@@ -23,15 +23,14 @@ rule readnames_final:
2323
) > {log.log} 2>&1
2424
"""
2525

26-
27-
rule only_keep_valid_insertions:
26+
rule only_keep_filtered_insertions:
2827
input:
2928
isobam=f"{outdir}/intermediate/mapping/Precut_{{sample}}_sorted.bam",
3029
readnames=f"{outdir}/intermediate/cmod/final_insertion_readnames_{{sample}}.txt"
3130
log:
32-
log=f"{outdir}/intermediate/log/cmod/filter/{{sample}}.log"
31+
log=f"{outdir}/intermediate/log/base_modifications/only_keep_filtered_insertions/{{sample}}.log"
3332
output:
34-
bam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
33+
bam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
3534
conda:
3635
"../envs/VIS_samtools_env.yml"
3736
shell:
@@ -42,40 +41,40 @@ rule only_keep_valid_insertions:
4241
) > {log.log} 2>&1
4342
"""
4443

45-
4644
rule modkit:
4745
input:
48-
isobam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
46+
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
4947
log:
50-
log=f"{outdir}/intermediate/log/cmod/modkit/{{sample}}.log"
48+
log=f"{outdir}/intermediate/log/base_modifications/modkit/{{sample}}.log"
5149
output:
52-
tsv=f"{outdir}/intermediate/cmod/Isolated_Reads_{{sample}}.tsv"
50+
tsv=f"{outdir}/final/base_modifications/Isolated_Reads_{{sample}}.tsv"
5351
conda:
5452
"../envs/VIS_modkit_env.yml"
53+
threads: config["threads"]
5554
shell:
5655
"""
5756
(
58-
modkit extract full -t 20 {input.isobam} {output.tsv}
57+
modkit extract full -t {threads} {input.isobam} {output.tsv}
5958
) > {log.log} 2>&1
6059
"""
6160

6261
rule call_modkit:
6362
input:
64-
isobam=f"{outdir}/intermediate/cmod/Final_Isolated_Reads_{{sample}}.bam"
63+
isobam=f"{outdir}/intermediate/base_modifications/Final_Isolated_Reads_{{sample}}.bam"
6564
log:
66-
log=f"{outdir}/intermediate/log/cmod/Calls_modkit/{{sample}}.log"
65+
log=f"{outdir}/intermediate/log/base_modifications/call_modkit/{{sample}}.log"
6766
output:
68-
tsv=f"{outdir}/intermediate/cmod/Calls_Isolated_Reads_{{sample}}.tsv"
67+
tsv=f"{outdir}/final/base_modifications/Calls_Isolated_Reads_{{sample}}.tsv"
6968
conda:
7069
"../envs/VIS_modkit_env.yml"
70+
threads: config["threads"]
7171
shell:
7272
"""
7373
(
74-
modkit extract calls -t 20 {input.isobam} {output.tsv}
74+
modkit extract calls -t {threads} {input.isobam} {output.tsv}
7575
) > {log.log} 2>&1
7676
"""
7777

78-
7978
'''
8079
rule specific_methylartist:
8180
input:
@@ -93,7 +92,7 @@ rule specific_methylartist:
9392
methylartist locus -b {input.bam} -i chr17:31124037-31180287 -n C -r {input.ref} -l 31154037-31159287 -o {output.plot}
9493
) > {log.log} 2>&1
9594
"""
96-
'''
95+
9796
9897
rule inserted_seq:
9998
input:
@@ -109,3 +108,4 @@ rule inserted_seq:
109108
except Exception as e:
110109
with open(log.log, "a") as log_file:
111110
log_file.write(f"Error: {str(e)}\n")
111+
'''

workflow/rules/detection.smk

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ rule fasta_insertion_reads:
114114
) > {log.log} 2>&1
115115
"""
116116

117-
118117
######
119118
######
120119
###### "Clean" BAM: Cut-out fasta to BAM via Mapping to reference
@@ -262,6 +261,7 @@ rule split_fasta:
262261
###### Insertion preparation: Fragmentation
263262
######
264263
######
264+
265265
rule prepare_insertion:
266266
input:
267267
config["insertion_fasta"] #insertion fasta sequence
@@ -484,7 +484,23 @@ rule calculate_exact_insertion_coordinates:
484484
except Exception as e:
485485
with open(log.log, "a") as log_file:
486486
log_file.write(f"Error: {str(e)}\n")
487-
487+
488+
rule insertion_points:
489+
input:
490+
f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed"
491+
output:
492+
f"{outdir}/final/localization/InsertionPoints_{{sample}}.bed"
493+
log:
494+
f"{outdir}/intermediate/log/detection/insertion_points/{{sample}}.log"
495+
conda:
496+
"../envs/VIS_dummy_env.yml"
497+
shell:
498+
"""
499+
(
500+
awk '{{OFS="\t"; $3 = $2 + 1; print $0}}' {input} > {output}
501+
) > {log} 2>&1
502+
"""
503+
488504
rule collect_outputs:
489505
input:
490506
coordinates=f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed",

workflow/rules/functional_genomics.smk

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
rule sort_insertion_file:
55
input:
66
exact=f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed",
7-
point=f"{outdir}/intermediate/localization/annotation/temp_Insertions_{{sample}}.bed"
7+
point=f"{outdir}/final/localization/InsertionPoints_{{sample}}.bed"
88
log:
99
log=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/{{sample}}.log",
10-
log2=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/temp_{{sample}}.log"
10+
log2=f"{outdir}/intermediate/log/functional_genomics/sort_insertion_file/points_{{sample}}.log"
1111
output:
1212
exact=temp(f"{outdir}/intermediate/localization/Sorted_ExactInsertions_{{sample}}.bed"),
13-
point=f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed"
13+
point=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed"
1414
conda:
1515
"../envs/VIS_dummy_env.yml"
1616
shell:
@@ -26,7 +26,7 @@ rule sort_insertion_file:
2626
rule calc_distance_to_elements:
2727
input:
2828
#insertions=f"{outdir}/intermediate/localization/Sorted_ExactInsertions_{{sample}}.bed",
29-
insertions=f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed",
29+
insertions=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed",
3030
ref=config["ref_genome_ctrl"]
3131
params:
3232
annotation_files={k: v for k, v in config.items() if k.startswith("annotate_")}
@@ -37,26 +37,10 @@ rule calc_distance_to_elements:
3737
run:
3838
vhf.calculate_element_distance(input.insertions, output[0], log.log, params.annotation_files)
3939

40-
rule modify_insertions:
41-
input:
42-
f"{outdir}/intermediate/localization/ExactInsertions_{{sample}}.bed"
43-
output:
44-
temp(f"{outdir}/intermediate/localization/annotation/temp_Insertions_{{sample}}.bed")
45-
log:
46-
f"{outdir}/intermediate/log/functional_genomics/modify_insertions/{{sample}}.log"
47-
conda:
48-
"../envs/VIS_dummy_env.yml"
49-
shell:
50-
"""
51-
(
52-
awk '{{OFS="\t"; $3 = $2 + 1; print $0}}' {input} > {output}
53-
) > {log} 2>&1
54-
"""
55-
5640
#kinda 'hacky' solution for flexibility: Now it does not matter if one annotation or 20 are defined.
5741
rule annotation_overlap_insertion:
5842
input:
59-
insertions_bed=f"{outdir}/intermediate/localization/temp_Sorted_ExactInsertions_{{sample}}.bed"
43+
insertions_bed=f"{outdir}/intermediate/localization/Sorted_InsertionPoints_{{sample}}.bed"
6044
params:
6145
annotation_files={k.replace("annotate_", ""): v for k, v in config.items() if k.startswith("annotate_")}
6246
log:
@@ -65,5 +49,5 @@ rule annotation_overlap_insertion:
6549
**{k.replace("annotate_", ""): f"{outdir}/intermediate/localization/annotation/Annotation_{k.replace('annotate_', '')}_Insertions_{{sample}}.bed"
6650
for k in config if k.startswith("annotate_")}
6751
run:
68-
vhf.run_bedtools_intersect(input.insertions_bed, output, log.log, params.annotation_files)
52+
vhf_fg.run_bedtools_intersect(input.insertions_bed, output, log.log, params.annotation_files)
6953

workflow/rules/plot_functional_genomics.smk

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ rule plot_distance_to_elements:
1111
scatter=report(f"{outdir}/final/functional_genomics/Plot_Distance_to_Genes_{fragmentsize}_{{sample}}.png"),
1212
run:
1313
try:
14-
vhf_fa.plot_element_distance(input.distancetable, params.distances, params.threshold, output.scatter, log.log1)
14+
vhf_pfg.plot_element_distance(input.distancetable, params.distances, params.threshold, output.scatter, log.log1)
1515
except Exception as e:
1616
with open(log.log1, "a") as log_file:
1717
log_file.write(f"Error: {str(e)}\n")
@@ -26,7 +26,7 @@ rule plot_scoring:
2626
data=(f"{outdir}/final/functional_genomics/Insertion_Scoring_Data_{{sample}}.txt")
2727
run:
2828
try:
29-
vhf_fa.scoring_insertions(input[0], output.plot, output.data, log.log)
29+
vhf_pfg.scoring_insertions(input[0], output.plot, output.data, log.log)
3030
except Exception as e:
3131
with open(log.log, "a") as log_file:
3232
log_file.write(f"Error: {str(e)}\n")
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python3
2+
3+
import pandas as pd
4+
import pybedtools
5+
6+
#wrapper
7+
from VIS_helper_functions import redirect_logging
8+
9+
#functional
10+
@redirect_logging(logfile_param="logfile")
11+
def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_files):
12+
"""
13+
Calculates distances between insertion sites and genomic annotations using bedtools closest.
14+
15+
Parameters:
16+
- insertions_bed (str): Path to the BED file containing the insertion sites.
17+
- output_bed (str): Path to save the output BED file.
18+
- annotation_files (dict): Dict with config_entry as keys and pathways as values.
19+
"""
20+
21+
# At least one annotation input necessary
22+
if not annotation_files:
23+
raise ValueError("At least one annotation file must be provided.")
24+
25+
# Create DataFrame for combined annotations
26+
combined_df = pd.DataFrame()
27+
28+
for tag, file in annotation_files.items():
29+
try:
30+
df = pd.read_csv(file, sep="\t", header=None, usecols=[0, 1, 2, 3, 4, 5])
31+
if df.iloc[0, 0].startswith("chr"):
32+
df["source"] = tag # Add source column
33+
print(f"Loaded {tag}: {df.head()}")
34+
combined_df = pd.concat([combined_df, df], ignore_index=True)
35+
except:
36+
print(f"Error reading {file})")
37+
print("BED files are expected to follow BED6 format convention. If more than 6 columns are provided, the first 6 will be used.")
38+
continue
39+
40+
# Convert combined DataFrame bed object
41+
combined_bed = pybedtools.BedTool.from_dataframe(combined_df)
42+
sorted_annotations = combined_bed.sort()
43+
44+
insertions = pybedtools.BedTool(insertions_bed)
45+
46+
#bedtools closest operation
47+
closest = insertions.closest(sorted_annotations, D="a", filenames=True)
48+
49+
print(type(closest))
50+
print(closest)
51+
closest.saveas(output_bed)
52+
print(f"Distances calculated and saved to {output_bed}")
53+
54+
@redirect_logging(logfile_param="logfile")
55+
def run_bedtools_intersect(insertions_bed, output_files, logfile, annotations):
56+
"""
57+
Runs bedtools intersect for each annotation using pybedtools.
58+
"""
59+
insertions = pybedtools.BedTool(insertions_bed)
60+
61+
for key, annotation_file in annotations.items():
62+
output_file = output_files[key]
63+
annotation = pybedtools.BedTool(annotation_file)
64+
65+
intersected = insertions.intersect(annotation, wb=True)
66+
67+
intersected.saveas(output_file)
68+
print(f"Completed: {key}")

0 commit comments

Comments
 (0)