@@ -15,41 +15,63 @@ def calculate_element_distance(insertions_bed, output_bed, logfile, annotation_f
1515 Parameters:
1616 - insertions_bed (str): Path to the BED file containing the insertion sites.
1717 - output_bed (str): Path to save the output BED file.
18- - annotation_files (dict): Dict with config_entry as keys and pathways as values.
18+ - annotation_files (dict): Dict with config_entry as keys and file paths as values.
1919 """
20-
21- # At least one annotation input necessary
20+
21+ # Ensure at least one annotation file is provided
2222 if not annotation_files :
2323 raise ValueError ("At least one annotation file must be provided." )
24-
25- # Create DataFrame for combined annotations
26- combined_df = pd .DataFrame ()
24+
25+ # Load insertions as a BedTool object
26+ insertions = pybedtools .BedTool (insertions_bed )
27+
28+ # Store processed closest distances for all annotations
29+ closest_results = []
2730
2831 for tag , file in annotation_files .items ():
2932 try :
30- df = pd .read_csv (file , sep = "\t " , header = None , usecols = [0 , 1 , 2 , 3 , 4 , 5 ])
33+ df = pd .read_csv (file , sep = "\t " , header = None )
34+
35+ # Check if the file has at least 4 columns
36+ if df .shape [1 ] < 4 :
37+ raise ValueError (f"{ file } has fewer than 4 columns. This is unexpected." )
38+
39+ # If the file has only 5 columns, add a dummy strand column (+)
40+ while df .shape [1 ] < 6 :
41+ df [df .shape [[1 ]]] = "."
42+
43+ #makes sure no other irrelevant columns are introduced since the downstream scripts depend on the specific size
44+ df = df .iloc [:,:6 ]
45+
46+ # Ensure first column has "chr" to confirm it's a BED file
3147 if df .iloc [0 , 0 ].startswith ("chr" ):
32- df ["source" ] = tag # Add source column
48+ df ["source" ] = tag # Add annotation source
3349 print (f"Loaded { tag } : { df .head ()} " )
34- combined_df = pd .concat ([combined_df , df ], ignore_index = True )
35- except :
36- print (f"Error reading { file } )" )
37- print ("BED files are expected to follow BED6 format convention. If more than 6 columns are provided, the first 6 will be used." )
38- continue
39-
40- # Convert combined DataFrame bed object
41- combined_bed = pybedtools .BedTool .from_dataframe (combined_df )
42- sorted_annotations = combined_bed .sort ()
4350
44- insertions = pybedtools .BedTool (insertions_bed )
51+ # df to bed and sort
52+ bed = pybedtools .BedTool .from_dataframe (df ).sort ()
53+
54+ # bedtools closest
55+ closest = insertions .closest (bed , D = "a" , t = 'first' )
56+
57+ # Convert result to df and add to results
58+ closest_results .append (closest .to_dataframe (header = None ))
59+
60+ except Exception as e :
61+ print (f"Error reading { file } : { e } " )
62+ with open (logfile , "a" ) as log :
63+ log .write (f"Error reading { file } : { e } \n " )
64+ continue
4565
46- #bedtools closest operation
47- closest = insertions .closest (sorted_annotations , D = "a" , filenames = True )
66+ print (closest_results )
67+ # Combine all results and save to output
68+ if closest_results :
69+ final_df = pd .concat (closest_results , ignore_index = True )
70+ final_df .to_csv (output_bed , sep = "\t " , index = False , header = None )
71+ print (f"Distances calculated and saved to { output_bed } " )
72+ else :
73+ print ("No valid annotations processed. Output file not created." )
4874
49- print (type (closest ))
50- print (closest )
51- closest .saveas (output_bed )
52- print (f"Distances calculated and saved to { output_bed } " )
5375
5476@redirect_logging (logfile_param = "logfile" )
5577def run_bedtools_intersect (insertions_bed , output_files , logfile , annotations ):
0 commit comments