Optimized bedtools closest (run per annotation file)

weichan · weichan · commit 6f5cbc1b96ff · 2025-03-11T16:17:42.000+01:00
diff --git a/config/config_CD19_28z.yml b/config/config_CD19_28z.yml
@@ -37,7 +37,7 @@ annotate_ucsc_introns: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_GENCO
 #promoters
 annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Promoters_EPDnew"
 #enhancer
-annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
+#annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
 #enhancer fantom5
 annotate_fantom5_enhancer: "/home/weichan/permanent/Projects/VIS/dev/FANTOM5/F5.hg38.enhancers.bed"
 #sedb
@@ -51,6 +51,8 @@ annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_p
 annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
 #hiC
 annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
+#ultraconserved regions (UCNE)
+annotate_ucne_conservation: "/home/weichan/permanent/Projects/VIS/dev/UCNE/hglft_genome_38268a_8d60.bed"
 #VIS detection
 detection: "rules/detection.smk"
 #qc rule collection
diff --git a/config/config_CD19_BBz.yml b/config/config_CD19_BBz.yml
@@ -39,9 +39,9 @@ annotate_ucsc_promoter: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_Prom
 #enhancer fantom5
 annotate_fantom5_enhancer: "/home/weichan/permanent/Projects/VIS/dev/FANTOM5/F5.hg38.enhancers.bed"
 #enhancer refTSS
-annotate_refTSS_tss: "/home/weichan/permanent/Projects/VIS/dev/refTSS/refTSS_v4.1_human_coordinate.hg38.bed"
+#annotate_refTSS_tss: "/home/weichan/permanent/Projects/VIS/dev/refTSS/refTSS_v4.1_human_coordinate.hg38.bed"
 #enhancer
-annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
+#annotate_ucsc_enhancer: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_VISTA_Enhancer_02_2025.bed"
 #sedb
 #sedb_cd4: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd4_closest_gene.bed"
 #sedb_cd8: "/home/weichan/permanent/Projects/VIS/dev/SEdb/sedb_cd8_closest_gene.bed"
@@ -53,6 +53,8 @@ annotate_ucsc_mirna: "/home/weichan/permanent/Projects/VIS/dev/UCSC/UCSC_miRNA_p
 annotate_cosmic_genes: "/home/weichan/permanent/Projects/VIS/dev/COSMIC/Cosmic_CancerGeneCensus_Tsv_v101_GRCh38/Cosmic_CancerGeneCensus_v101_GRCh38_processed.bed"
 #hiC
 annotate_encode_hic: "/home/weichan/permanent/Projects/VIS/dev/ENCODE/HiC_Tcells/HiC_processed.bed"
+#ultraconserved regions (UCNE)
+annotate_ucne_conservation: "/home/weichan/permanent/Projects/VIS/dev/UCNE/hglft_genome_38268a_8d60.bed"
 #VIS detection
 detection: "rules/detection.smk"
 #qc rule collection
diff --git a/workflow/rules/functional_genomics.smk b/workflow/rules/functional_genomics.smk
@@ -50,4 +50,3 @@ rule annotation_overlap_insertion:
            for k in config if k.startswith("annotate_")}
     run:
         vhf_fg.run_bedtools_intersect(input.insertions_bed, output, log.log, params.annotation_files)
-
diff --git a/workflow/scripts/VIS_functional_genomics_helper_functions.py b/workflow/scripts/VIS_functional_genomics_helper_functions.py
@@ -15,41 +15,63 @@ def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_f
     Parameters:
     - insertions_bed (str): Path to the BED file containing the insertion sites.
     - output_bed (str): Path to save the output BED file.
-    - annotation_files (dict): Dict with config_entry as keys and pathways as values.
+    - annotation_files (dict): Dict with config_entry as keys and file paths as values.
     """
-    
-    # At least one annotation input necessary
+
+    # Ensure at least one annotation file is provided
     if not annotation_files:
         raise ValueError("At least one annotation file must be provided.")
-    
-    # Create DataFrame for combined annotations
-    combined_df = pd.DataFrame()
+
+    # Load insertions as a BedTool object
+    insertions = pybedtools.BedTool(insertions_bed)
+
+    # Store processed closest distances for all annotations
+    closest_results = []
 
     for tag, file in annotation_files.items():
         try:
-            df = pd.read_csv(file, sep="\t", header=None, usecols=[0, 1, 2, 3, 4, 5])
+            df = pd.read_csv(file, sep="\t", header=None)
+
+            # Check if the file has at least 4 columns
+            if df.shape[1] < 4:
+                raise ValueError(f"{file} has fewer than 4 columns. This is unexpected.")
+
+            # Pad files with fewer than 6 columns using "." placeholders. NOTE(review): df.shape[[1]] below raises TypeError; should be df.shape[1]
+            while df.shape[1] < 6:
+                df[df.shape[[1]]] = "."
+            
+            # Keep only the first 6 columns; downstream scripts depend on this exact column count
+            df = df.iloc[:,:6] 
+            
+            # Only process the file if the first row's first column starts with "chr" (basic BED sanity check); otherwise it is silently skipped
             if df.iloc[0, 0].startswith("chr"): 
-                df["source"] = tag  # Add source column
+                df["source"] = tag  # Add annotation source
                 print(f"Loaded {tag}: {df.head()}")
-                combined_df = pd.concat([combined_df, df], ignore_index=True)
-        except:
-            print(f"Error reading {file})")
-            print("BED files are expected to follow BED6 format convention. If more than 6 columns are provided, the first 6 will be used.")
-            continue
-    
-    # Convert combined DataFrame bed object
-    combined_bed = pybedtools.BedTool.from_dataframe(combined_df)
-    sorted_annotations = combined_bed.sort()
 
-    insertions = pybedtools.BedTool(insertions_bed)
+                # df to bed and sort
+                bed = pybedtools.BedTool.from_dataframe(df).sort()
+            
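+                # bedtools closest: one nearest feature per insertion (t='first' breaks ties), distance reported relative to A (D="a")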
+                closest = insertions.closest(bed, D="a", t='first')
+            
+                # Convert result to df and add to results
+                closest_results.append(closest.to_dataframe(header=None))
+
+        except Exception as e:
+            print(f"Error reading {file}: {e}")
+            with open(logfile, "a") as log:
+                log.write(f"Error reading {file}: {e}\n")
+            continue
     
-    #bedtools closest operation
-    closest = insertions.closest(sorted_annotations, D="a", filenames=True)
+    print(closest_results)
+    # Combine all results and save to output
+    if closest_results:
+        final_df = pd.concat(closest_results, ignore_index=True)
+        final_df.to_csv(output_bed, sep="\t", index=False, header=None)
+        print(f"Distances calculated and saved to {output_bed}")
+    else:
+        print("No valid annotations processed. Output file not created.")
 
-    print(type(closest))
-    print(closest)
-    closest.saveas(output_bed)
-    print(f"Distances calculated and saved to {output_bed}")
 
 @redirect_logging(logfile_param="logfile")
 def run_bedtools_intersect(insertions_bed, output_files, logfile, annotations):