@@ -395,21 +395,33 @@ static void init_data(args_t *args)
395395 args -> qry_prob = (double * ) malloc (3 * args -> nqry_smpl * sizeof (* args -> qry_prob ));
396396 args -> gt_prob = args -> cross_check ? args -> qry_prob : (double * ) malloc (3 * args -> ngt_smpl * sizeof (* args -> gt_prob ));
397397
398+ // Convert genotypes to genotype likelihoods given by -E, the probability of reading one allele incorrectly. In this
399+ // simple model we have:
400+ // - probability of reading an allele incorrectly, eg. 0 as 1 or 1 as 0
401+ // P(0|1) = P(1|0) = e
402+ // - probability of genotype G={00,01,11} being correct given observed dosage {0,1,2} and the
403+ // genotyping error probability e:
404+ // P(00|0) = 1 P(00|1) = e P(00|2) = e^2
405+ // P(01|0) = e P(01|1) = 1 P(01|2) = e
406+ // P(11|0) = e^2 P(11|1) = e P(11|2) = 1
407+ //
398408 // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
399- // anything else indicated an error, this is just to reuse gt_to_dsg()) ; the second index are the corresponding
409+ // anything else indicated an error, this is just to reuse gt_to_dsg(); the second index are the corresponding
400410 // probabilities of 0/0, 0/1, and 1/1 genotypes
411+ //
401412 for (i = 0 ; i < 8 ; i ++ )
402413 for (j = 0 ; j < 3 ; j ++ )
403414 args -> dsg2prob [i ][j ] = HUGE_VAL ;
404- args -> dsg2prob [1 ][0 ] = - log (1 - pow (10 ,-0.1 * args -> use_PLs ));
405- args -> dsg2prob [1 ][1 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
406- args -> dsg2prob [1 ][2 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
407- args -> dsg2prob [2 ][0 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
408- args -> dsg2prob [2 ][1 ] = - log (1 - pow (10 ,-0.1 * args -> use_PLs ));
409- args -> dsg2prob [2 ][2 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
410- args -> dsg2prob [4 ][0 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
411- args -> dsg2prob [4 ][1 ] = - log (0.5 * pow (10 ,-0.1 * args -> use_PLs ));
412- args -> dsg2prob [4 ][2 ] = - log (1 - pow (10 ,-0.1 * args -> use_PLs ));
415+ double eprob = pow (10 ,-0.1 * args -> use_PLs ); // convert from phred score to probability
416+ args -> dsg2prob [1 ][0 ] = 0 ; // P(00|0) = 1
417+ args -> dsg2prob [1 ][1 ] = - log (eprob ); // P(01|0) = e
418+ args -> dsg2prob [1 ][2 ] = -2 * log (eprob ); // P(11|0) = e^2
419+ args -> dsg2prob [2 ][0 ] = - log (eprob ); // P(00|1) = e
420+ args -> dsg2prob [2 ][1 ] = 0 ; // P(01|1) = 1
421+ args -> dsg2prob [2 ][2 ] = - log (eprob ); // P(11|1) = e
422+ args -> dsg2prob [4 ][0 ] = -2 * log (eprob ); // P(00|2) = e^2
423+ args -> dsg2prob [4 ][1 ] = - log (eprob ); // P(01|2) = e
424+ args -> dsg2prob [4 ][2 ] = 0 ; // P(11|2) = 1
413425
414426 // lookup table to avoid exponentiation
415427 for (i = 0 ; i < 256 ; i ++ ) args -> pl2prob [i ] = pow (10 ,-0.1 * i );
0 commit comments