Skip to content

Commit 95c1fb6

Browse files
jkbonfieldpd3
authored andcommitted
Tweak the condition for calling a 2nd consensus sequence in a deletion.
CONS_CUTOFF_DEL and CONS_CUTOFF_INC are now 35% instead of 40% (also DEL is new as it previously shared the score with SNPs). Plus when computing the ratio we no longer apply this as a percentage to the entire depth, but as a percentage of the alignments overlapping any STRs spanning this region (so meaningful alignments) that aren't already allocated to the "type" being diagnosed. Both of these make it slightly more likely to emit a second consensus sequence potentially containing indel variations. This fixes the specific problem raised in #2117, but like all heuristics it benefits some cases and harms others. We will always have some cases that fall just either side of the thresholds and look odd. Overall it seems a slim benefit and the work to track the number of meaningful spanning alignments may have potential value in subsequent updates. Fixes #2117
1 parent fd4f19f commit 95c1fb6

File tree

1 file changed

+43
-10
lines changed

1 file changed

+43
-10
lines changed

bam2bcf_edlib.c

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
317317
int ref_len, int left, int right,
318318
int sample, int type, int biggest_del,
319319
int *left_shift, int *right_shift,
320-
int *band, int *tcon_len, int *cpos_pos) {
320+
int *band, int *tcon_len, int *cpos_pos,
321+
int pos_l, int pos_r) {
321322
// Map ASCII ACGTN* to 012345
322323
static uint8_t base6[256] = {
323324
4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
@@ -357,6 +358,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
357358
//--------------------------------------------------
358359
// Accumulate sequences into cons_base and cons_ins arrays
359360
int local_band_max = 0; // maximum absolute deviation from diagonal
361+
int total_span_str = 0;
362+
int type_depth = 0;
360363
for (i = 0; i < n_plp[s]; i++) {
361364
const bam_pileup1_t *p = plp[s] + i;
362365
bam1_t *b = p->b;
@@ -425,6 +428,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
425428
if (bcf_cgp_append_cons(&cons_ins[x-left], ins,
426429
ilen, 1) < 0)
427430
goto err;
431+
type_depth += (x == pos+1);
428432
} else if (x != pos+1){
429433
if (bcf_cgp_append_cons(&ref_ins[x-left], ins,
430434
ilen, 1) < 0)
@@ -446,9 +450,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
446450
if (x < left) continue;
447451
if (x >= right) break;
448452
if ((p->indel == type && !p->is_del) || // starts here
449-
(p->indel == 0 && p->is_del && len == -type)) // left
453+
(p->indel == 0 && p->is_del && len == -type)) { // left
450454
cons_base[x-left][5]++;
451-
else if (x+len <= pos+1 || (skip_to && x > skip_to))
455+
type_depth += (x == pos+1);
456+
} else if (x+len <= pos+1 || (skip_to && x > skip_to))
452457
ref_base[x-left][5]++;
453458
else if (x <= pos && x+len > pos+1) {
454459
// we have a deletion which overlaps pos, but
@@ -466,6 +471,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
466471
}
467472
}
468473

474+
if (b->core.pos <= pos_l && x >= pos_r)
475+
total_span_str++;
476+
469477
// Also track the biggest deviation +/- from diagonal. We use
470478
// this band observation in our BAQ alignment step.
471479
if (*band < local_band_max)
@@ -622,10 +630,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
622630
}
623631
}
624632

625-
#define CONS_CUTOFF .40 // 40% needed for base vs N
626-
#define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1]
627-
#define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0]
628-
#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] HOM
633+
#define CONS_CUTOFF .40 // % needed for base vs N
634+
#define CONS_CUTOFF_DEL .35 // % to include any het del
635+
#define CONS_CUTOFF2 .80 // % needed for gap in cons[1]
636+
#define CONS_CUTOFF_INC .35 // % to include any insertion cons[0]
637+
#define CONS_CUTOFF_INC2 .80 // % to include any insertion cons[1] HOM
629638
#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N
630639

631640
//--------------------------------------------------
@@ -723,14 +732,18 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
723732
if (!always_del && cons_base[i][5] >= bca->min_support) {
724733
// Candidate HET del.
725734
if (cnum == 0) {
726-
het_del = cons_base[i][5] >= CONS_CUTOFF * tot;
735+
int tot2 = tot;
736+
if (i > pos-left && i <= pos-left-biggest_del)
737+
tot2 = total_span_str - type_depth;
738+
het_del = cons_base[i][5] >= CONS_CUTOFF_DEL * tot2;
739+
727740
if (i < 1024) {
728741
if (i > pos-left && i <= pos-left-biggest_del)
729742
hetd[i] = 0;
730743
else
731744
hetd[i] = het_del
732745
? 1
733-
: (cons_base[i][5] >= .3 * tot ? -1 : 0);
746+
: (cons_base[i][5] >= .3 * tot2 ? -1 : 0);
734747
}
735748
} else {
736749
// HET del uncalled on cnum 0
@@ -1391,6 +1404,26 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13911404
}
13921405
int band = biggest_ins - biggest_del; // NB del is -ve
13931406

1407+
// Find left & right extents of STR covering pos, from ref
1408+
int pos_l = pos, pos_r = pos;
1409+
{
1410+
rep_ele *reps, *elt, *tmp;
1411+
int pstart = MAX(0, pos-30);
1412+
int pmid = pos-pstart;
1413+
int pend = MIN(ref_len, pos+30);
1414+
reps = find_STR((char *)&ref[pstart], pend-pstart, 0);
1415+
DL_FOREACH_SAFE(reps, elt, tmp) {
1416+
if (elt->end >= pmid && elt->start <= pmid) {
1417+
if (pos_l > pstart + elt->start)
1418+
pos_l = pstart + elt->start;
1419+
if (pos_r < pstart + elt->end)
1420+
pos_r = pstart + elt->end;
1421+
}
1422+
DL_DELETE(reps, elt);
1423+
free(elt);
1424+
}
1425+
}
1426+
13941427
int str_len1 = l_run, str_len2 = l_run/4;
13951428
for (t = 0; t < n_types; ++t) {
13961429
int l, ir;
@@ -1420,7 +1453,7 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
14201453
tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len,
14211454
left, right, s, types[t], biggest_del,
14221455
&left_shift, &right_shift, &band,
1423-
tcon_len, &cpos_pos);
1456+
tcon_len, &cpos_pos, pos_l, pos_r);
14241457
// TODO: Consensus for a deletion shouldn't match the
14251458
// consensus for type 0. Eg consider
14261459
// vv vv

0 commit comments

Comments
 (0)