@@ -54,12 +54,15 @@ def run_cov_analyzer(
5454
5555 high_merged_regions = cov_stats .merge_windows (type = "high" , threshold = high_threshold , contig_lengths = contig_lengths , allowed_gap = allowed_gap )
5656 low_merged_regions = cov_stats .merge_windows (type = "low" , threshold = 1 / low_threshold , contig_lengths = contig_lengths , allowed_gap = allowed_gap )
57- low_merged_regions = filter_Ns_from_low_merged_regions (reference_fasta , low_merged_regions )
57+ low_merged_regions = filter_nonATGCs_from_low_merged_regions (reference_fasta , low_merged_regions )
5858
5959 if min_region_length > 0 :
6060 high_merged_regions = high_merged_regions .loc [high_merged_regions ["length" ] >= min_region_length ].reset_index (drop = True )
6161 low_merged_regions = low_merged_regions .loc [low_merged_regions ["length" ] >= min_region_length ].reset_index (drop = True )
6262
63+ high_merged_regions = annotate_zero_cov_bases (high_merged_regions , bam_file )
64+ low_merged_regions = annotate_zero_cov_bases (low_merged_regions , bam_file )
65+
6366 generate_outputs (reference_fasta , high_merged_regions , low_merged_regions ,
6467 cov_df , cov_stats , buffer , output_dir , contig_lengths , no_window_stats ,
6568 log_file )
@@ -211,7 +214,7 @@ def get_windows(self, type = "high", method = "fold", threshold = 10):
211214 return self .df .loc [mask ].reset_index (drop = True )
212215
213216
214- def merge_windows (self , * , type = "high" , method = "fold" , threshold = 10 , edge_pad = 200 , contig_lengths = None , allowed_gap = 0 ):
217+ def merge_windows (self , * , type = "high" , method = "fold" , threshold = 10 , edge_pad = 500 , contig_lengths = None , allowed_gap = 0 ):
215218
216219 df = self .get_windows (type = type , method = method , threshold = threshold )
217220
@@ -277,22 +280,64 @@ def _summarize_region(self, contig, start, end, covs):
277280 return (contig , start , end , region_length , region_mean_cov , percentile , zscore , signed_fold_diff , fold_diff , log2_fold_diff )
278281
279282
280- def filter_Ns_from_low_merged_regions (reference_fasta , low_merged_regions ):
281- # filtering out regions comprised of more than 10% Ns
282- max_Ns_fraction = 0.1
283+ def filter_nonATGCs_from_low_merged_regions (reference_fasta , low_merged_regions ):
284+ # filtering out regions comprised of >= 5% non-ATGCs
285+ # originally was doing just Ns, but some seqs had odd characters
286+ # e.g., NC_003070.9:15345880-15346580 in GCF_000001735.4
287+ max_nonATGCs_fraction = 0.05
283288 regions_to_keep = []
284289 with pysam .FastaFile (reference_fasta ) as fasta :
285290 for idx , row in low_merged_regions .iterrows ():
286291 seq = fasta .fetch (row .contig , row .start , row .end ).upper ()
287- ns_frac = seq .count ("N" ) / len (seq )
288- if ns_frac <= max_Ns_fraction :
292+ total_len = len (seq )
293+ non_atgc = total_len - (
294+ seq .count ("A" ) +
295+ seq .count ("T" ) +
296+ seq .count ("C" ) +
297+ seq .count ("G" )
298+ )
299+ non_atgc_frac = non_atgc / total_len
300+
301+ if non_atgc_frac < max_nonATGCs_fraction :
289302 regions_to_keep .append (idx )
290303
291304 low_merged_regions = low_merged_regions .loc [regions_to_keep ].reset_index (drop = True )
292305
293306 return low_merged_regions
294307
295308
309+ def annotate_zero_cov_bases (merged_regions , bam_file ):
310+
311+ zero_counts = []
312+
313+ with pysam .AlignmentFile (bam_file , "rb" ) as aln :
314+ for _ , row in merged_regions .iterrows ():
315+
316+ # count_coverage returns 4 arrays (A,C,G,T) of per-base counts
317+ cov_arrays = aln .count_coverage (
318+ row .contig ,
319+ start = int (row .start ),
320+ stop = int (row .end ),
321+ quality_threshold = 0
322+ )
323+
324+ # sum across nucleotides → total depth at each position
325+ depth = np .zeros (len (cov_arrays [0 ]), dtype = int )
326+
327+ for arr in cov_arrays :
328+ depth += np .array (arr , dtype = int )
329+ zero_counts .append (int ((depth == 0 ).sum ()))
330+
331+ merged_regions = merged_regions .copy ()
332+ merged_regions ['zero_cov_bases' ] = zero_counts
333+
334+ # reordering columns
335+ cols = merged_regions .columns .tolist ()
336+ cols .insert (4 , cols .pop (cols .index ('zero_cov_bases' )))
337+
338+ return merged_regions [cols ]
339+
340+
296341def generate_outputs (reference_fasta , high_merged_regions , low_merged_regions ,
297342 cov_df , cov_stats , buffer , output_dir , contig_lengths , no_window_stats ,
298343 log_file ):
0 commit comments