Skip to content

Commit 3399140

Browse files
committed
Fix endless loop or incorrect AF estimates
This problemm occurs would occur when missing genotypes were present and the `--estimate-AF -` option was used. Fixes #1687
1 parent b866bd7 commit 3399140

File tree

2 files changed

+38
-29
lines changed

2 files changed

+38
-29
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919

2020
- Fix the loss of phasing in half-missing genotypes in variant atomization (#1689)
2121

22+
* bcftools roh
23+
24+
- Fix a bug that could result in an endless loop or incorrect AF estimate when
25+
missing genotypes are present and the `--estimate-AF -` option was used (#1687)
26+
2227
* bcftools +split-vep
2328

2429
- VEP fields with characters disallowed in VCF tag names by the specification (such as '-'

vcfroh.c

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ static void init_data(args_t *args)
153153
args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
154154
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
155155
error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n");
156-
if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
156+
if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
157157
error("Error: The FORMAT/PL tag not defined as Integer in the header\n");
158158
}
159159

@@ -279,15 +279,15 @@ static void init_data(args_t *args)
279279
MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
280280
MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
281281
MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
282-
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
282+
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
283283

284284
args->hmm = hmm_init(2, tprob, 10000);
285-
if ( args->genmap_fname )
285+
if ( args->genmap_fname )
286286
hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
287287
else if ( args->rec_rate > 0 )
288288
hmm_set_tprob_func(args->hmm, set_tprob_rrate, args);
289289

290-
args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
290+
args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
291291
if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
292292

293293
// print header
@@ -509,7 +509,7 @@ static void flush_viterbi(args_t *args, int ismpl)
509509

510510
if ( !args->vi_training ) // single viterbi pass
511511
{
512-
hmm_restore(args->hmm, smpl->snapshot);
512+
hmm_restore(args->hmm, smpl->snapshot);
513513
int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
514514
if ( end < smpl->nsites )
515515
smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]);
@@ -535,7 +535,7 @@ static void flush_viterbi(args_t *args, int ismpl)
535535

536536
if ( args->output_type & OUTPUT_RG )
537537
{
538-
if ( state!=smpl->rg.state )
538+
if ( state!=smpl->rg.state )
539539
{
540540
if ( !state ) // the region ends, flush
541541
{
@@ -599,7 +599,7 @@ static void flush_viterbi(args_t *args, int ismpl)
599599
MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
600600
MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW;
601601
MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ;
602-
MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
602+
MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
603603
hmm_set_tprob(args->hmm, tprob_arr, 10000);
604604

605605
int niter = 0;
@@ -627,14 +627,14 @@ static void flush_viterbi(args_t *args, int ismpl)
627627
delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev);
628628
niter++;
629629
args->str.l = 0;
630-
ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
630+
ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
631631
name,niter,deltaz,delthw,
632632
1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW),
633633
1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ));
634634
if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
635635
}
636636
while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th );
637-
637+
638638
// output the results
639639
for (i=0; i<smpl->nrid; i++)
640640
{
@@ -671,7 +671,7 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
671671

672672
char *tmp, *str = tgt->line.s;
673673
i = 0;
674-
while ( *str && i<3 )
674+
while ( *str && i<3 )
675675
{
676676
if ( *str=='\t' ) i++;
677677
str++;
@@ -722,7 +722,11 @@ int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq)
722722
int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr);
723723
while ( gt < end )
724724
{
725-
if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
725+
if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) )
726+
{
727+
gt += 2;
728+
continue;
729+
}
726730

727731
if ( bcf_gt_allele(gt[0]) ) nalt++;
728732
else nref++;
@@ -746,7 +750,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr
746750

747751
int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
748752
if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
749-
753+
750754
if ( args->af_smpl ) // subset samples for AF estimate
751755
{
752756
#define BRANCH(type_t) \
@@ -838,7 +842,7 @@ int process_line(args_t *args, bcf1_t *line, int ial)
838842
if ( ret==-2 )
839843
error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1);
840844
}
841-
else if ( args->af_fname )
845+
else if ( args->af_fname )
842846
{
843847
// Read AF from a file
844848
ret = read_AF(args->files->targets, line, &alt_freq);
@@ -875,9 +879,9 @@ int process_line(args_t *args, bcf1_t *line, int ial)
875879
if ( ret>0 )
876880
AC = args->itmp[0];
877881
}
878-
if ( AN<=0 || AC<0 )
882+
if ( AN<=0 || AC<0 )
879883
ret = -1;
880-
else
884+
else
881885
alt_freq = (double) AC/AN;
882886
}
883887

@@ -962,12 +966,12 @@ int process_line(args_t *args, bcf1_t *line, int ial)
962966
smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2);
963967
if ( !smpl->eprob ) error("Error: failed to alloc %"PRIu64" bytes\n", (uint64_t)(sizeof(*smpl->eprob)*smpl->msites*2));
964968
}
965-
969+
966970
// Calculate emission probabilities P(D|AZ) and P(D|HW)
967971
double *eprob = &smpl->eprob[2*smpl->nsites];
968972
eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
969973
eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
970-
974+
971975
smpl->sites[smpl->nsites] = line->pos;
972976
smpl->nsites++;
973977

@@ -994,12 +998,12 @@ static void vcfroh(args_t *args, bcf1_t *line)
994998

995999
// Are we done?
9961000
if ( !line )
997-
{
1001+
{
9981002
for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
999-
return;
1003+
return;
10001004
}
10011005

1002-
// Skip unwanted lines, for simplicity we consider only biallelic sites
1006+
// Skip unwanted lines, for simplicity we consider only biallelic sites
10031007
if ( line->rid == args->skip_rid ) return;
10041008

10051009
// This can be raw callable VCF with the symbolic unseen allele <*>
@@ -1043,7 +1047,7 @@ static void vcfroh(args_t *args, bcf1_t *line)
10431047
args->prev_pos = line->pos;
10441048
skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
10451049
}
1046-
else if ( args->prev_pos == line->pos )
1050+
else if ( args->prev_pos == line->pos )
10471051
{
10481052
args->ndup++;
10491053
return; // skip duplicate positions
@@ -1161,7 +1165,7 @@ int main_vcfroh(int argc, char *argv[])
11611165
switch (c) {
11621166
case 0: args->af_tag = optarg; naf_opts++; break;
11631167
case 1: args->af_fname = optarg; naf_opts++; break;
1164-
case 2:
1168+
case 2:
11651169
args->dflt_AF = strtod(optarg,&tmp);
11661170
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
11671171
break;
@@ -1173,7 +1177,7 @@ int main_vcfroh(int argc, char *argv[])
11731177
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
11741178
case 5: args->include_noalt_sites = 1; break;
11751179
case 'o': args->output_fname = optarg; break;
1176-
case 'O':
1180+
case 'O':
11771181
if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
11781182
if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG;
11791183
if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ;
@@ -1183,10 +1187,10 @@ int main_vcfroh(int argc, char *argv[])
11831187
case 'i': args->skip_homref = 1; break;
11841188
case 'I': args->snps_only = 1; break;
11851189
case 'G':
1186-
args->fake_PLs = 1;
1190+
args->fake_PLs = 1;
11871191
args->unseen_PL = strtod(optarg,&tmp);
11881192
if ( *tmp ) error("Could not parse: -G %s\n", optarg);
1189-
args->unseen_PL = pow(10,-args->unseen_PL/10.);
1193+
args->unseen_PL = pow(10,-args->unseen_PL/10.);
11901194
break;
11911195
case 'm': args->genmap_fname = optarg; break;
11921196
case 'M':
@@ -1216,12 +1220,12 @@ int main_vcfroh(int argc, char *argv[])
12161220
if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
12171221
break;
12181222
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
1219-
case 'V':
1220-
args->vi_training = 1;
1221-
args->baum_welch_th = strtod(optarg,&tmp);
1223+
case 'V':
1224+
args->vi_training = 1;
1225+
args->baum_welch_th = strtod(optarg,&tmp);
12221226
if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg);
12231227
break;
1224-
case 'h':
1228+
case 'h':
12251229
case '?': usage(args); break;
12261230
default: error("Unknown argument: %s\n", optarg);
12271231
}

0 commit comments

Comments
 (0)