Skip to content

Commit 39a81be

Browse files
committed
Modify the interpretation of -E, --error-probability
The --error-probability option is now interpreted as the probability of an erroneous allele rather than of an erroneous genotype. In other words, the calculation of the discordance score now considers the probability of genotyping error to be different for HOM and HET genotypes, i.e. P(0/1|dsg=0) > P(1/1|dsg=0).
1 parent bd85368 commit 39a81be

File tree

9 files changed

+43
-22
lines changed

9 files changed

+43
-22
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ Changes affecting specific commands:
4545

4646
- adds a new column for the number of matching genotypes
4747

48+
- The --error-probability option is now interpreted as the probability of an erroneous
49+
allele rather than genotype. In other words, the calculation of the discordance
50+
score now considers the probability of genotyping error to be different
51+
for HOM and HET genotypes, i.e. P(0/1|dsg=0) > P(1/1|dsg=0).
52+
4853
- fixes in HWE score calculation plus output average HWE score rather
4954
than absolute HWE score
5055

test/gtcheck.10.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
DCv2 s1 s1 4.002001e-03 3.465736e-01 2 2
1+
DCv2 s1 s1 0.000000e+00 3.465736e-01 2 2

test/gtcheck.3.1.out

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
DCv2 A D 2.302585e+00 0.000000e+00 1 1
2+
DCv2 A E 4.605170e+00 0.000000e+00 1 1
3+
DCv2 D E 2.302585e+00 0.000000e+00 1 1

test/gtcheck.5.1.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ INFO sites-used-PL-vs-PL 0
1010
INFO sites-used-PL-vs-GT 1
1111
INFO sites-used-GT-vs-PL 0
1212
INFO sites-used-GT-vs-GT 1
13-
DCv2 A A 3.000150e-04 0.000000e+00 2 2
13+
DCv2 A A 1.000089e-12 0.000000e+00 2 2

test/gtcheck.6.1.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
DCv2 A B 2.000100e-04 5.753641e-01 1 1
2-
DCv2 B C 9.903588e+00 0.000000e+00 1 1
1+
DCv2 A B 0.000000e+00 5.753641e-01 1 1
2+
DCv2 B C 9.210340e+00 0.000000e+00 1 1

test/gtcheck.ntop.1.out

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
DCv2 smpl x1 4.951814e+01 1.351550e-01 6 6
2-
DCv2 smpl x2 9.904588e+00 7.931820e-01 6 6
3-
DCv2 smpl x3 1.000050e-03 9.518185e-01 5 5
4-
DCv2 smpl x4 2.971136e+01 3.662041e-01 6 6
5-
DCv2 smpl x5 1.200060e-03 9.634573e-01 6 6
6-
DCv2 smpl x6 3.961475e+01 2.310491e-01 6 6
1+
DCv2 smpl x1 9.210340e+01 1.351550e-01 6 6
2+
DCv2 smpl x2 1.842068e+01 7.931820e-01 6 6
3+
DCv2 smpl x3 0.000000e+00 9.518185e-01 5 5
4+
DCv2 smpl x4 5.526204e+01 3.662041e-01 6 6
5+
DCv2 smpl x5 0.000000e+00 9.634573e-01 6 6
6+
DCv2 smpl x6 7.368272e+01 2.310491e-01 6 6

test/gtcheck.ntop.2.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
DCv2 smpl x5 1.200060e-03 9.634573e-01 6 6
2-
DCv2 smpl x3 1.000050e-03 9.518185e-01 5 5
1+
DCv2 smpl x3 0.000000e+00 9.518185e-01 5 5
2+
DCv2 smpl x5 0.000000e+00 9.634573e-01 6 6

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,7 @@
921921
run_test(\&test_gtcheck,$opts,in=>'gtcheck.ntop',gts=>'gtcheck.ntop.gts',out=>'gtcheck.ntop.2.out',args=>q[--n-matches 2]);
922922
run_test(\&test_gtcheck,$opts,in=>'gtcheck.5',gts=>'gtcheck.5.gts',out=>'gtcheck.5.1.out',args=>q[],grep=>'grep -v Time');
923923
run_test(\&test_gtcheck,$opts,in=>'gtcheck.6',out=>'gtcheck.6.1.out',args=>q[-p A,B,B,C]);
924+
run_test(\&test_gtcheck,$opts,in=>'gtcheck.3',out=>'gtcheck.3.1.out',args=>q[-t 11:33 -p A,D,A,E,D,E -u GT -e 10]);
924925
925926
print "\nNumber of tests:\n";
926927
printf " total .. %d\n", $$opts{nok}+$$opts{nfailed};

vcfgtcheck.c

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -395,21 +395,33 @@ static void init_data(args_t *args)
395395
args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
396396
args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
397397

398+
// Convert genotypes to genotype likelihoods given by -E, the probability of reading one allele incorrectly. In this
399+
// simple model we have:
400+
// - probability of reading an allele incorrectly, eg. 0 as 1 or 1 as 0
401+
// P(0|1) = P(1|0) = e
402+
// - probability of genotype G={00,01,11} being correct given observed dosage {0,1,2} and the
403+
// genotyping error probability e:
404+
// P(00|0) = 1 P(00|1) = e P(00|2) = e^2
405+
// P(01|0) = e P(01|1) = 1 P(01|2) = e
406+
// P(11|0) = e^2 P(11|1) = e P(11|2) = 1
407+
//
398408
// dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<1,1<<2 are set, accessing
399-
// anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
409+
// anything else indicates an error, this is just to reuse gt_to_dsg()); the second index selects the corresponding
400410
// probabilities of 0/0, 0/1, and 1/1 genotypes
411+
//
401412
for (i=0; i<8; i++)
402413
for (j=0; j<3; j++)
403414
args->dsg2prob[i][j] = HUGE_VAL;
404-
args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
405-
args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
406-
args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
407-
args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
408-
args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
409-
args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
410-
args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
411-
args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
412-
args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
415+
double eprob = pow(10,-0.1*args->use_PLs); // convert from phred score to probability
416+
args->dsg2prob[1][0] = 0; // P(00|0) = 1
417+
args->dsg2prob[1][1] = -log(eprob); // P(01|0) = e
418+
args->dsg2prob[1][2] = -2*log(eprob); // P(11|0) = e^2
419+
args->dsg2prob[2][0] = -log(eprob); // P(00|1) = e
420+
args->dsg2prob[2][1] = 0; // P(01|1) = 1
421+
args->dsg2prob[2][2] = -log(eprob); // P(11|1) = e
422+
args->dsg2prob[4][0] = -2*log(eprob); // P(00|2) = e^2
423+
args->dsg2prob[4][1] = -log(eprob); // P(01|2) = e
424+
args->dsg2prob[4][2] = 0; // P(11|2) = 1
413425

414426
// lookup table to avoid exponentiation
415427
for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);

0 commit comments

Comments
 (0)