Skip to content

Commit 639ddfc

Browse files
Mike LeeMike Lee
authored andcommitted
bit-cov-analyzer tuning: default 500 minimum window length; added column for zero_cov_bases
1 parent 3015a83 commit 639ddfc

File tree

2 files changed

+54
-9
lines changed

2 files changed

+54
-9
lines changed

bit/cli/cov_analyzer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ def main():
7272
"--min-region-length",
7373
metavar="<INT>",
7474
type=int,
75-
help='Minimum region length (in bp) to be reported (default: 0; so reports all regions, even single windows)',
76-
default=0
75+
help='Minimum region length (in bp) to be reported (default: 500; set to 0 to report all regions, even single windows)',
76+
default=500
7777
)
7878
optional.add_argument(
7979
"-e",

bit/modules/cov_analyzer.py

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,15 @@ def run_cov_analyzer(
5454

5555
high_merged_regions = cov_stats.merge_windows(type="high", threshold = high_threshold, contig_lengths = contig_lengths, allowed_gap = allowed_gap)
5656
low_merged_regions = cov_stats.merge_windows(type="low", threshold = 1/low_threshold, contig_lengths = contig_lengths, allowed_gap = allowed_gap)
57-
low_merged_regions = filter_Ns_from_low_merged_regions(reference_fasta, low_merged_regions)
57+
low_merged_regions = filter_nonATGCs_from_low_merged_regions(reference_fasta, low_merged_regions)
5858

5959
if min_region_length > 0:
6060
high_merged_regions = high_merged_regions.loc[high_merged_regions["length"] >= min_region_length].reset_index(drop=True)
6161
low_merged_regions = low_merged_regions.loc[low_merged_regions["length"] >= min_region_length].reset_index(drop=True)
6262

63+
high_merged_regions = annotate_zero_cov_bases(high_merged_regions, bam_file)
64+
low_merged_regions = annotate_zero_cov_bases(low_merged_regions, bam_file)
65+
6366
generate_outputs(reference_fasta, high_merged_regions, low_merged_regions,
6467
cov_df, cov_stats, buffer, output_dir, contig_lengths, no_window_stats,
6568
log_file)
@@ -211,7 +214,7 @@ def get_windows(self, type = "high", method = "fold", threshold = 10):
211214
return self.df.loc[mask].reset_index(drop=True)
212215

213216

214-
def merge_windows(self, *, type = "high", method = "fold", threshold = 10, edge_pad = 200, contig_lengths = None, allowed_gap = 0):
217+
def merge_windows(self, *, type = "high", method = "fold", threshold = 10, edge_pad = 500, contig_lengths = None, allowed_gap = 0):
215218

216219
df = self.get_windows(type=type, method=method, threshold=threshold)
217220

@@ -277,22 +280,64 @@ def _summarize_region(self, contig, start, end, covs):
277280
return (contig, start, end, region_length, region_mean_cov, percentile, zscore, signed_fold_diff, fold_diff, log2_fold_diff)
278281

279282

280-
def filter_Ns_from_low_merged_regions(reference_fasta, low_merged_regions):
281-
# filtering out regions comprised of more than 10% Ns
282-
max_Ns_fraction = 0.1
283+
def filter_nonATGCs_from_low_merged_regions(reference_fasta, low_merged_regions):
284+
# filtering out regions comprised of >= 5% non-ATGCs
285+
# originally was doing just Ns, but some seqs had odd characters
286+
# e.g., NC_003070.9:15345880-15346580 in GCF_000001735.4
287+
max_nonATGCs_fraction = 0.05
283288
regions_to_keep = []
284289
with pysam.FastaFile(reference_fasta) as fasta:
285290
for idx, row in low_merged_regions.iterrows():
286291
seq = fasta.fetch(row.contig, row.start, row.end).upper()
287-
ns_frac = seq.count("N") / len(seq)
288-
if ns_frac <= max_Ns_fraction:
292+
total_len = len(seq)
293+
non_atgc = total_len - (
294+
seq.count("A") +
295+
seq.count("T") +
296+
seq.count("C") +
297+
seq.count("G")
298+
)
299+
non_atgc_frac = non_atgc / total_len
300+
301+
if non_atgc_frac < max_nonATGCs_fraction:
289302
regions_to_keep.append(idx)
290303

291304
low_merged_regions = low_merged_regions.loc[regions_to_keep].reset_index(drop=True)
292305

293306
return low_merged_regions
294307

295308

309+
def annotate_zero_cov_bases(merged_regions, bam_file):
310+
311+
zero_counts = []
312+
313+
with pysam.AlignmentFile(bam_file, "rb") as aln:
314+
for _, row in merged_regions.iterrows():
315+
316+
# count_coverage returns 4 arrays (A,C,G,T) of per-base counts
317+
cov_arrays = aln.count_coverage(
318+
row.contig,
319+
start = int(row.start),
320+
stop = int(row.end),
321+
quality_threshold = 0
322+
)
323+
324+
# sum across nucleotides → total depth at each position
325+
depth = np.zeros(len(cov_arrays[0]), dtype=int)
326+
327+
for arr in cov_arrays:
328+
depth += np.array(arr, dtype=int)
329+
zero_counts.append(int((depth == 0).sum()))
330+
331+
merged_regions = merged_regions.copy()
332+
merged_regions['zero_cov_bases'] = zero_counts
333+
334+
# reordering columns
335+
cols = merged_regions.columns.tolist()
336+
cols.insert(4, cols.pop(cols.index('zero_cov_bases')))
337+
338+
return merged_regions[cols]
339+
340+
296341
def generate_outputs(reference_fasta, high_merged_regions, low_merged_regions,
297342
cov_df, cov_stats, buffer, output_dir, contig_lengths, no_window_stats,
298343
log_file):

0 commit comments

Comments
 (0)