|
1 | 1 | /* vcfgtcheck.c -- Check sample identity. |
2 | 2 |
|
3 | | - Copyright (C) 2013-2021 Genome Research Ltd. |
| 3 | + Copyright (C) 2013-2023 Genome Research Ltd. |
4 | 4 |
|
5 | 5 | Author: Petr Danecek <[email protected]> |
6 | 6 |
|
@@ -59,6 +59,7 @@ typedef struct |
59 | 59 | int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; |
60 | 60 | int regions_overlap, targets_overlap; |
61 | 61 | int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; |
| 62 | + int nused[2][2]; |
62 | 63 | double *pdiff, *qry_prob, *gt_prob; |
63 | 64 | uint32_t *ndiff,*ncnt,ncmp, npairs; |
64 | 65 | int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; |
@@ -309,7 +310,7 @@ static void init_data(args_t *args) |
309 | 310 | init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); |
310 | 311 | } |
311 | 312 | if ( args->gt_samples ) |
312 | | - { |
| 313 | + { |
313 | 314 | init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, |
314 | 315 | args->gt_hdr ? args->gt_hdr : args->qry_hdr, |
315 | 316 | args->gt_fname ? args->gt_fname : args->qry_fname); |
@@ -377,7 +378,7 @@ static void init_data(args_t *args) |
377 | 378 | args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); |
378 | 379 |
|
379 | 380 | // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing |
380 | | - // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding |
| 381 | + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding |
381 | 382 | // probabilities of 0/0, 0/1, and 1/1 genotypes |
382 | 383 | for (i=0; i<8; i++) |
383 | 384 | for (j=0; j<3; j++) |
@@ -555,7 +556,9 @@ static void process_line(args_t *args) |
555 | 556 | args->gt_arr = args->qry_arr; |
556 | 557 | } |
557 | 558 |
|
| 559 | + // stats: number of compared sites, and used tags |
558 | 560 | args->ncmp++; |
| 561 | + args->nused[qry_use_GT][gt_use_GT]++; |
559 | 562 |
|
560 | 563 | double af,hwe_dsg[8]; |
561 | 564 | if ( args->calc_hwe_prob ) |
@@ -636,7 +639,7 @@ static void process_line(args_t *args) |
636 | 639 | gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); |
637 | 640 | if ( !gt_dsg ) continue; // missing value |
638 | 641 | if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom |
639 | | - |
| 642 | + |
640 | 643 | ptr = args->qry_arr + args->pairs[i].iqry*nqry1; |
641 | 644 | qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); |
642 | 645 | if ( !qry_dsg ) continue; // missing value |
@@ -797,11 +800,15 @@ static void report(args_t *args) |
797 | 800 | fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); |
798 | 801 | fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); |
799 | 802 | fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); |
| 803 | + fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]); |
| 804 | + fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]); |
| 805 | + fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]); |
| 806 | + fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]); |
800 | 807 | fprintf(args->fp,"# DC, discordance:\n"); |
801 | 808 | fprintf(args->fp,"# - query sample\n"); |
802 | 809 | fprintf(args->fp,"# - genotyped sample\n"); |
803 | | - fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); |
804 | | - fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); |
| 810 | + fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n"); |
| 811 | + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n"); |
805 | 812 | fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); |
806 | 813 | fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); |
807 | 814 |
|
@@ -1023,7 +1030,7 @@ static int is_input_okay(args_t *args, int nmatch) |
1023 | 1030 | return 1; |
1024 | 1031 |
|
1025 | 1032 | not_okay: |
1026 | | - fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", |
| 1033 | + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", |
1027 | 1034 | bcf_seqname(hdr,rec),rec->pos+1,msg); |
1028 | 1035 | return 0; |
1029 | 1036 | } |
@@ -1097,7 +1104,7 @@ int main_vcfgtcheck(int argc, char *argv[]) |
1097 | 1104 | args->es_max_mem = strdup("500M"); |
1098 | 1105 |
|
1099 | 1106 | // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 |
1100 | | - // - min_inter: pairs with smaller err value will be considered identical |
| 1107 | + // - min_inter: pairs with smaller err value will be considered identical |
1101 | 1108 | // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered |
1102 | 1109 | // different. If negative, the cutoff may be heuristically lowered |
1103 | 1110 | args->min_inter_err = 0.23; |
@@ -1169,7 +1176,7 @@ int main_vcfgtcheck(int argc, char *argv[]) |
1169 | 1176 | case 3 : args->calc_hwe_prob = 0; break; |
1170 | 1177 | case 4 : error("The option -S, --target-sample has been deprecated\n"); break; |
1171 | 1178 | case 5 : args->dry_run = 1; break; |
1172 | | - case 6 : |
| 1179 | + case 6 : |
1173 | 1180 | args->distinctive_sites = strtod(optarg,&tmp); |
1174 | 1181 | if ( *tmp ) |
1175 | 1182 | { |
@@ -1202,7 +1209,7 @@ int main_vcfgtcheck(int argc, char *argv[]) |
1202 | 1209 | else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; |
1203 | 1210 | else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); |
1204 | 1211 | break; |
1205 | | - case 'S': |
| 1212 | + case 'S': |
1206 | 1213 | if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; |
1207 | 1214 | else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; |
1208 | 1215 | else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); |
|
0 commit comments