Skip to content

Commit 5202e44

Browse files
committed
Add new --target-gt r:FLOAT option to randomly select a proportion of genotypes
Resolves #1850
1 parent 868aefb commit 5202e44

File tree

3 files changed

+54
-21
lines changed

3 files changed

+54
-21
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ Changes affecting specific commands:
105105

106106
- Add new `--new-gt X` option (#1800)
107107

108+
- Add new `--target-gt r:FLOAT` option to randomly select a proportion of genotypes (#1850)
109+
108110
- Fix a bug where `-t ./x` mode was advertised as selecting both phased and unphased
109111
half-missing genotypes, but was in fact selecting only unphased genotypes (#1844)
110112

plugins/setGT.c

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */
2727
#include <htslib/vcf.h>
2828
#include <htslib/vcfutils.h>
2929
#include <htslib/kfunc.h>
30+
#include <htslib/hts_os.h>
3031
#include <inttypes.h>
3132
#include <getopt.h>
3233
#include <ctype.h>
@@ -58,9 +59,9 @@ typedef struct
5859
int m_allele, M_allele, *gt, *phased, ploidy;
5960
char *gt_str;
6061
} custom;
61-
int filter_logic;
62+
int filter_logic, rand_seed;
6263
uint8_t *smpl_pass;
63-
double binom_val;
64+
double binom_val, rand_frac;
6465
char *binom_tag;
6566
cmp_f binom_cmp;
6667
}
@@ -80,6 +81,7 @@ args_t *args = NULL;
8081
#define GT_MINOR (1<<9)
8182
#define GT_CUSTOM (1<<10)
8283
#define GT_X_VAF (1<<11)
84+
#define GT_RAND (1<<12)
8385

8486
#define MINOR_ALLELE -1
8587
#define MAJOR_ALLELE -2
@@ -93,30 +95,32 @@ const char *usage(void)
9395
{
9496
return
9597
"About: Sets genotypes. The target genotypes can be specified as:\n"
96-
" ./. .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
97-
" ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
98-
" . .. partially or completely missing\n"
99-
" a .. all genotypes\n"
100-
" b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
101-
" q .. select genotypes using -i/-e options\n"
98+
" ./. .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
99+
" ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
100+
" . .. partially or completely missing\n"
101+
" a .. all genotypes\n"
102+
" b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
103+
" q .. select genotypes using -i/-e options\n"
104+
" r:FLOAT .. select randomly a proportion of FLOAT genotypes (can be combined with other modes)\n"
102105
" and the new genotype can be one of:\n"
103-
" . .. missing (\".\" or \"./.\", keeps ploidy)\n"
104-
" 0 .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
105-
" c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
106-
" m .. minor (the second most common) allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
107-
" M .. major allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
108-
" X .. allele with bigger read depth as determined from FMT/AD\n"
109-
" p .. phase genotype (0/1 becomes 0|1)\n"
110-
" u .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
106+
" . .. missing (\".\" or \"./.\", keeps ploidy)\n"
107+
" 0 .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
108+
" c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
109+
" m .. minor (the second most common) allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
110+
" M .. major allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
111+
" X .. allele with bigger read depth as determined from FMT/AD\n"
112+
" p .. phase genotype (0/1 becomes 0|1)\n"
113+
" u .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
111114
"Usage: bcftools +setGT [General Options] -- [Plugin Options]\n"
112115
"Options:\n"
113116
" run \"bcftools plugin\" for a list of common options\n"
114117
"\n"
115118
"Plugin options:\n"
116-
" -e, --exclude <expr> Exclude a genotype if true (requires -t q)\n"
117-
" -i, --include <expr> include a genotype if true (requires -t q)\n"
118-
" -n, --new-gt <type> Genotypes to set, see above\n"
119-
" -t, --target-gt <type> Genotypes to change, see above\n"
119+
" -e, --exclude EXPR Exclude a genotype if true (requires -t q)\n"
120+
" -i, --include EXPR include a genotype if true (requires -t q)\n"
121+
" -n, --new-gt TYPE Genotypes to set, see above\n"
122+
" -s, --seed INT Random seed to use with -t r [0]\n"
123+
" -t, --target-gt TYPE Genotypes to change, see above\n"
120124
"\n"
121125
"Example:\n"
122126
" # set missing genotypes (\"./.\") to phased ref genotypes (\"0|0\")\n"
@@ -196,19 +200,25 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
196200
args->in_hdr = in;
197201
args->out_hdr = out;
198202

203+
char *tmp;
199204
int c;
200205
static struct option loptions[] =
201206
{
202207
{"include",required_argument,NULL,'i'},
203208
{"exclude",required_argument,NULL,'e'},
204209
{"new-gt",required_argument,NULL,'n'},
205210
{"target-gt",required_argument,NULL,'t'},
211+
{"seed",required_argument,NULL,'s'},
206212
{NULL,0,NULL,0}
207213
};
208-
while ((c = getopt_long(argc, argv, "?hn:t:i:e:",loptions,NULL)) >= 0)
214+
while ((c = getopt_long(argc, argv, "?hn:t:i:e:s:",loptions,NULL)) >= 0)
209215
{
210216
switch (c)
211217
{
218+
case 's':
219+
args->rand_seed = strtol(optarg,&tmp,10);
220+
if ( *tmp ) error("Could not parse: -s %s\n",optarg);
221+
break;
212222
case 'e':
213223
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
214224
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
@@ -233,6 +243,13 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
233243
if ( !strcmp(optarg,"a") ) args->tgt_mask |= GT_ALL;
234244
if ( !strcmp(optarg,"q") ) args->tgt_mask |= GT_QUERY;
235245
if ( !strcmp(optarg,"?") ) args->tgt_mask |= GT_QUERY; // for backward compatibility
246+
if ( !strncmp(optarg,"r:",2) )
247+
{
248+
args->rand_frac = strtod(optarg+2,&tmp);
249+
if ( *tmp ) error("Could not parse: -t %s\n", optarg);
250+
if ( args->rand_frac<=0 || args->rand_frac>=1 ) error("Expected value between 0 and 1 with -t\n");
251+
args->tgt_mask |= GT_RAND;
252+
}
236253
if ( strchr(optarg,'b') ) parse_binom_expr(args, strchr(optarg,'b'));
237254
if ( args->tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
238255
break;
@@ -244,6 +261,11 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
244261

245262
if ( !args->new_mask ) error("Expected -n option\n");
246263
if ( !args->tgt_mask ) error("Expected -t option\n");
264+
if ( args->tgt_mask & GT_RAND )
265+
{
266+
if ( args->tgt_mask==GT_RAND ) args->tgt_mask |= GT_ALL;
267+
hts_srand48(args->rand_seed);
268+
}
247269

248270
if ( args->new_mask & GT_MISSING ) args->new_gt = bcf_gt_missing;
249271
if ( args->new_mask & GT_REF ) args->new_gt = args->new_mask&GT_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
@@ -389,6 +411,11 @@ static inline double calc_binom(int na, int nb)
389411
return prob;
390412
}
391413

414+
static inline int random_draw(args_t *args)
415+
{
416+
return hts_drand48() > args->rand_frac ? 1 : 0; // reversed random draw
417+
}
418+
392419
bcf1_t *process(bcf1_t *rec)
393420
{
394421
if ( !rec->n_sample ) return rec;
@@ -510,6 +537,7 @@ bcf1_t *process(bcf1_t *rec)
510537

511538
double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]);
512539
if ( !args->binom_cmp(prob,args->binom_val) ) continue;
540+
if ( args->tgt_mask&GT_RAND && random_draw(args) ) continue;
513541

514542
if ( args->new_mask&GT_UNPHASED )
515543
changed += unphase_gt(ptr, ngts);
@@ -548,6 +576,7 @@ bcf1_t *process(bcf1_t *rec)
548576
for (i=0; i<rec->n_sample; i++)
549577
{
550578
if ( args->smpl_pass && !args->smpl_pass[i] ) continue;
579+
if ( args->tgt_mask&GT_RAND && random_draw(args) ) continue;
551580
if ( args->new_mask&GT_UNPHASED )
552581
changed += unphase_gt(args->gts + i*ngts, ngts);
553582
else if ( args->new_mask==GT_PHASED )
@@ -579,6 +608,7 @@ bcf1_t *process(bcf1_t *rec)
579608
else if ( args->tgt_mask&GT_MISSING && ploidy==nmiss ) do_set = 1;
580609

581610
if ( !do_set ) continue;
611+
if ( args->tgt_mask&GT_RAND && random_draw(args) ) continue;
582612

583613
if ( args->new_mask&GT_UNPHASED )
584614
changed += unphase_gt(ptr, ngts);

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@
524524
run_test(\&test_vcf_plugin,$opts,in=>'setGT.5',out=>'setGT.5.1.out',cmd=>'+setGT --no-version',args=>q[-- -t a -n X]);
525525
run_test(\&test_vcf_plugin,$opts,in=>'setGT.6',out=>'setGT.6.1.out',cmd=>'+setGT --no-version',args=>q[-- -t ./x -n .]);
526526
run_test(\&test_vcf_plugin,$opts,in=>'setGT.6',out=>'setGT.6.1.out',cmd=>'+setGT --no-version',args=>q[-- -t . -n .]);
527+
run_test(\&test_vcf_plugin,$opts,in=>'setGT.6',out=>'setGT.6.2.out',cmd=>'+setGT --no-version',args=>q[-- -t r:0.5 -n .]);
527528
run_test(\&test_vcf_plugin,$opts,in=>'plugin1',out=>'fill-AN-AC.out',cmd=>'+fill-AN-AC --no-version');
528529
run_test(\&test_vcf_plugin,$opts,in=>'dosage',out=>'dosage.1.out',cmd=>'+dosage',args=>'-- -t PL');
529530
run_test(\&test_vcf_plugin,$opts,in=>'dosage',out=>'dosage.2.out',cmd=>'+dosage',args=>'-- -t GL');

0 commit comments

Comments
 (0)