Modify the interpretation of -E, --error-probability

pd3 · pd3 · commit 39a81bea6a76 · 2023-11-07T15:00:22.000Z
The --error-probability is now interpreted as the probability of an erroneous
allele rather than of an erroneous genotype. In other words, the calculation of the discordance
score now considers the probability of genotyping error to be different for HOM
and HET genotypes, i.e. P(0/1|dsg=0) > P(1/1|dsg=0).
diff --git a/NEWS b/NEWS
@@ -45,6 +45,11 @@ Changes affecting specific commands:
 
         - adds a new column for the number of matching genotypes
 
+        - The --error-probability is newly interpreted as the probability of erroneous
+          allele rather than genotype. In other words, the calculation of the discordance
+          score now considers the probability of genotyping error to be different
+          for HOM and HET genotypes, i.e. P(0/1|dsg=0) > P(1/1|dsg=0).
+
         - fixes in HWE score calculation plus output average HWE score rather
           than absolute HWE score
 
diff --git a/test/gtcheck.10.out b/test/gtcheck.10.out
@@ -1 +1 @@
-DCv2	s1	s1	4.002001e-03	3.465736e-01	2	2
+DCv2	s1	s1	0.000000e+00	3.465736e-01	2	2
diff --git a/test/gtcheck.3.1.out b/test/gtcheck.3.1.out
@@ -0,0 +1,3 @@
+DCv2	A	D	2.302585e+00	0.000000e+00	1	1
+DCv2	A	E	4.605170e+00	0.000000e+00	1	1
+DCv2	D	E	2.302585e+00	0.000000e+00	1	1
diff --git a/test/gtcheck.5.1.out b/test/gtcheck.5.1.out
@@ -10,4 +10,4 @@ INFO	sites-used-PL-vs-PL	0
 INFO	sites-used-PL-vs-GT	1
 INFO	sites-used-GT-vs-PL	0
 INFO	sites-used-GT-vs-GT	1
-DCv2	A	A	3.000150e-04	0.000000e+00	2	2
+DCv2	A	A	1.000089e-12	0.000000e+00	2	2
diff --git a/test/gtcheck.6.1.out b/test/gtcheck.6.1.out
@@ -1,2 +1,2 @@
-DCv2	A	B	2.000100e-04	5.753641e-01	1	1
-DCv2	B	C	9.903588e+00	0.000000e+00	1	1
+DCv2	A	B	0.000000e+00	5.753641e-01	1	1
+DCv2	B	C	9.210340e+00	0.000000e+00	1	1
diff --git a/test/gtcheck.ntop.1.out b/test/gtcheck.ntop.1.out
@@ -1,6 +1,6 @@
-DCv2	smpl	x1	4.951814e+01	1.351550e-01	6	6
-DCv2	smpl	x2	9.904588e+00	7.931820e-01	6	6
-DCv2	smpl	x3	1.000050e-03	9.518185e-01	5	5
-DCv2	smpl	x4	2.971136e+01	3.662041e-01	6	6
-DCv2	smpl	x5	1.200060e-03	9.634573e-01	6	6
-DCv2	smpl	x6	3.961475e+01	2.310491e-01	6	6
+DCv2	smpl	x1	9.210340e+01	1.351550e-01	6	6
+DCv2	smpl	x2	1.842068e+01	7.931820e-01	6	6
+DCv2	smpl	x3	0.000000e+00	9.518185e-01	5	5
+DCv2	smpl	x4	5.526204e+01	3.662041e-01	6	6
+DCv2	smpl	x5	0.000000e+00	9.634573e-01	6	6
+DCv2	smpl	x6	7.368272e+01	2.310491e-01	6	6
diff --git a/test/gtcheck.ntop.2.out b/test/gtcheck.ntop.2.out
@@ -1,2 +1,2 @@
-DCv2	smpl	x5	1.200060e-03	9.634573e-01	6	6
-DCv2	smpl	x3	1.000050e-03	9.518185e-01	5	5
+DCv2	smpl	x3	0.000000e+00	9.518185e-01	5	5
+DCv2	smpl	x5	0.000000e+00	9.634573e-01	6	6
diff --git a/test/test.pl b/test/test.pl
@@ -921,6 +921,7 @@
 run_test(\&test_gtcheck,$opts,in=>'gtcheck.ntop',gts=>'gtcheck.ntop.gts',out=>'gtcheck.ntop.2.out',args=>q[--n-matches 2]);
 run_test(\&test_gtcheck,$opts,in=>'gtcheck.5',gts=>'gtcheck.5.gts',out=>'gtcheck.5.1.out',args=>q[],grep=>'grep -v Time');
 run_test(\&test_gtcheck,$opts,in=>'gtcheck.6',out=>'gtcheck.6.1.out',args=>q[-p A,B,B,C]);
+run_test(\&test_gtcheck,$opts,in=>'gtcheck.3',out=>'gtcheck.3.1.out',args=>q[-t 11:33 -p A,D,A,E,D,E -u GT -e 10]);
 
 print "\nNumber of tests:\n";
 printf "    total   .. %d\n", $$opts{nok}+$$opts{nfailed};
diff --git a/vcfgtcheck.c b/vcfgtcheck.c
@@ -395,21 +395,33 @@ static void init_data(args_t *args)
         args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
         args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
 
+        // Convert genotypes to genotype likelihoods given by -E, the probability of reading one allele incorrectly. In this
+        // simple model we have:
+        //     - probability of reading an allele incorrectly, eg. 0 as 1 or 1 as 0
+        //         P(0|1) = P(1|0) = e
+        //     - probability of genotype G={00,01,11} being correct given observed dosage {0,1,2} and the
+        //       genotyping error probability e:
+        //          P(00|0) = 1       P(00|1) = e       P(00|2) = e^2
+        //          P(01|0) = e       P(01|1) = 1       P(01|2) = e
+        //          P(11|0) = e^2     P(11|1) = e       P(11|2) = 1
+        //
         // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
+        // anything else indicated an error, this is just to reuse gt_to_dsg(); the second index are the corresponding
         // probabilities of 0/0, 0/1, and 1/1 genotypes
+        //
         for (i=0; i<8; i++)
             for (j=0; j<3; j++)
                 args->dsg2prob[i][j] = HUGE_VAL;
-        args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
+        double eprob = pow(10,-0.1*args->use_PLs);      // convert from phred score to probability
+        args->dsg2prob[1][0] = 0;               // P(00|0) = 1
+        args->dsg2prob[1][1] = -log(eprob);     // P(01|0) = e
+        args->dsg2prob[1][2] = -2*log(eprob);   // P(11|0) = e^2
+        args->dsg2prob[2][0] = -log(eprob);     // P(00|1) = e
+        args->dsg2prob[2][1] = 0;               // P(01|1) = 1
+        args->dsg2prob[2][2] = -log(eprob);     // P(11|1) = e
+        args->dsg2prob[4][0] = -2*log(eprob);   // P(00|2) = e^2
+        args->dsg2prob[4][1] = -log(eprob);     // P(01|2) = e
+        args->dsg2prob[4][2] = 0;               // P(11|2) = 1
 
         // lookup table to avoid exponentiation
         for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-DCv2 s1 s1 4.002001e-03 3.465736e-01 2 2`
	`1`	`+DCv2 s1 s1 0.000000e+00 3.465736e-01 2 2`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+DCv2 A D 2.302585e+00 0.000000e+00 1 1`
	`2`	`+DCv2 A E 4.605170e+00 0.000000e+00 1 1`
	`3`	`+DCv2 D E 2.302585e+00 0.000000e+00 1 1`