Skip to content

Commit 6f5cbc1

Browse files
author
weichan
committed
Optimized bedtools closeness
1 parent 02dfe36 commit 6f5cbc1

File tree

4 files changed

+53
-28
lines changed

4 files changed

+53
-28
lines changed

config/config_CD19_28z.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCO
3737
#promoters
3838
annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
3939
#enhancer
40-
annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
40+
#annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
4141
#enhancer fantom5
4242
annotate_fantom5_enhancer: "/home/weichan/permanent/Projects/VIS/dev/FANTOM5/F5.hg38.enhancers.bed"
4343
#sedb
@@ -51,6 +51,8 @@ annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_p
5151
annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
5252
#hiC
5353
annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
54+
#ultraconservedregions
55+
annotate_ucne_conservation: "/home/weichan/permanent/Projects/VIS/dev/UCNE/hglft_genome_38268a_8d60.bed"
5456
#VIS detection
5557
detection: "rules/detection.smk"
5658
#qc rule collection

config/config_CD19_BBz.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Prom
3939
#enhancer fantom5
4040
annotate_fantom5_enhancer: "/home/weichan/permanent/Projects/VIS/dev/FANTOM5/F5.hg38.enhancers.bed"
4141
#enhancer refTSS
42-
annotate_refTSS_tss: "/home/weichan/permanent/Projects/VIS/dev/refTSS/refTSS_v4.1_human_coordinate.hg38.bed"
42+
#annotate_refTSS_tss: "/home/weichan/permanent/Projects/VIS/dev/refTSS/refTSS_v4.1_human_coordinate.hg38.bed"
4343
#enhancer
44-
annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
44+
#annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
4545
#sedb
4646
#sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
4747
#sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
@@ -53,6 +53,8 @@ annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_p
5353
annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
5454
#hiC
5555
annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
56+
#ultraconservedregions
57+
annotate_ucne_conservation: "/home/weichan/permanent/Projects/VIS/dev/UCNE/hglft_genome_38268a_8d60.bed"
5658
#VIS detection
5759
detection: "rules/detection.smk"
5860
#qc rule collection

workflow/rules/functional_genomics.smk

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,3 @@ rule annotation_overlap_insertion:
5050
for k in config if k.startswith("annotate_")}
5151
run:
5252
vhf_fg.run_bedtools_intersect(input.insertions_bed, output, log.log, params.annotation_files)
53-

workflow/scripts/VIS_functional_genomics_helper_functions.py

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,41 +15,63 @@ def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_f
1515
Parameters:
1616
- insertions_bed (str): Path to the BED file containing the insertion sites.
1717
- output_bed (str): Path to save the output BED file.
18-
- annotation_files (dict): Dict with config_entry as keys and pathways as values.
18+
- annotation_files (dict): Dict with config_entry as keys and file paths as values.
1919
"""
20-
21-
# At least one annotation input necessary
20+
21+
# Ensure at least one annotation file is provided
2222
if not annotation_files:
2323
raise ValueError("At least one annotation file must be provided.")
24-
25-
# Create DataFrame for combined annotations
26-
combined_df = pd.DataFrame()
24+
25+
# Load insertions as a BedTool object
26+
insertions = pybedtools.BedTool(insertions_bed)
27+
28+
# Store processed closest distances for all annotations
29+
closest_results = []
2730

2831
for tag, file in annotation_files.items():
2932
try:
30-
df = pd.read_csv(file, sep="\t", header=None, usecols=[0, 1, 2, 3, 4, 5])
33+
df = pd.read_csv(file, sep="\t", header=None)
34+
35+
# Check if the file has at least 4 columns
36+
if df.shape[1] < 4:
37+
raise ValueError(f"{file} has fewer than 4 columns. This is unexpected.")
38+
39+
# If the file has only 5 columns, add a dummy strand column (+)
40+
while df.shape[1] < 6:
41+
df[df.shape[[1]]] = "."
42+
43+
#makes sure no other irrelevant columns are introduced since the downstream scripts depend on the specific size
44+
df = df.iloc[:,:6]
45+
46+
# Ensure first column has "chr" to confirm it's a BED file
3147
if df.iloc[0, 0].startswith("chr"):
32-
df["source"] = tag # Add source column
48+
df["source"] = tag # Add annotation source
3349
print(f"Loaded {tag}: {df.head()}")
34-
combined_df = pd.concat([combined_df, df], ignore_index=True)
35-
except:
36-
print(f"Error reading {file})")
37-
print("BED files are expected to follow BED6 format convention. If more than 6 columns are provided, the first 6 will be used.")
38-
continue
39-
40-
# Convert combined DataFrame bed object
41-
combined_bed = pybedtools.BedTool.from_dataframe(combined_df)
42-
sorted_annotations = combined_bed.sort()
4350

44-
insertions = pybedtools.BedTool(insertions_bed)
51+
# df to bed and sort
52+
bed = pybedtools.BedTool.from_dataframe(df).sort()
53+
54+
# bedtools closest
55+
closest = insertions.closest(bed, D="a", t='first')
56+
57+
# Convert result to df and add to results
58+
closest_results.append(closest.to_dataframe(header=None))
59+
60+
except Exception as e:
61+
print(f"Error reading {file}: {e}")
62+
with open(logfile, "a") as log:
63+
log.write(f"Error reading {file}: {e}\n")
64+
continue
4565

46-
#bedtools closest operation
47-
closest = insertions.closest(sorted_annotations, D="a", filenames=True)
66+
print(closest_results)
67+
# Combine all results and save to output
68+
if closest_results:
69+
final_df = pd.concat(closest_results, ignore_index=True)
70+
final_df.to_csv(output_bed, sep="\t", index=False, header=None)
71+
print(f"Distances calculated and saved to {output_bed}")
72+
else:
73+
print("No valid annotations processed. Output file not created.")
4874

49-
print(type(closest))
50-
print(closest)
51-
closest.saveas(output_bed)
52-
print(f"Distances calculated and saved to {output_bed}")
5375

5476
@redirect_logging(logfile_param="logfile")
5577
def run_bedtools_intersect(insertions_bed, output_files, logfile, annotations):

0 commit comments

Comments
 (0)