@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */
2727#include <htslib/vcf.h>
2828#include <htslib/vcfutils.h>
2929#include <htslib/kfunc.h>
30+ #include <htslib/hts_os.h>
3031#include <inttypes.h>
3132#include <getopt.h>
3233#include <ctype.h>
@@ -58,9 +59,9 @@ typedef struct
5859 int m_allele , M_allele , * gt , * phased , ploidy ;
5960 char * gt_str ;
6061 } custom ;
61- int filter_logic ;
62+ int filter_logic , rand_seed ;
6263 uint8_t * smpl_pass ;
63- double binom_val ;
64+ double binom_val , rand_frac ;
6465 char * binom_tag ;
6566 cmp_f binom_cmp ;
6667}
@@ -80,6 +81,7 @@ args_t *args = NULL;
8081#define GT_MINOR (1<<9)
8182#define GT_CUSTOM (1<<10)
8283#define GT_X_VAF (1<<11)
84+ #define GT_RAND (1<<12)
8385
8486#define MINOR_ALLELE -1
8587#define MAJOR_ALLELE -2
@@ -93,30 +95,32 @@ const char *usage(void)
9395{
9496 return
9597 "About: Sets genotypes. The target genotypes can be specified as:\n"
96- " ./. .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
97- " ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
98- " . .. partially or completely missing\n"
99- " a .. all genotypes\n"
100- " b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
101- " q .. select genotypes using -i/-e options\n"
98+ " ./. .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
99+ " ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
100+ " . .. partially or completely missing\n"
101+ " a .. all genotypes\n"
102+ " b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
103+ " q .. select genotypes using -i/-e options\n"
104+ " r:FLOAT .. select randomly a proportion of FLOAT genotypes (can be combined with other modes)\n"
102105 " and the new genotype can be one of:\n"
103- " . .. missing (\".\" or \"./.\", keeps ploidy)\n"
104- " 0 .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
105- " c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
106- " m .. minor (the second most common) allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
107- " M .. major allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
108- " X .. allele with bigger read depth as determined from FMT/AD\n"
109- " p .. phase genotype (0/1 becomes 0|1)\n"
110- " u .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
106+ " . .. missing (\".\" or \"./.\", keeps ploidy)\n"
107+ " 0 .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
108+ " c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
109+ " m .. minor (the second most common) allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
110+ " M .. major allele as determined from INFO/AC or FMT/GT (e.g. 1/1 or 1, keeps ploidy)\n"
111+ " X .. allele with bigger read depth as determined from FMT/AD\n"
112+ " p .. phase genotype (0/1 becomes 0|1)\n"
113+ " u .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
111114 "Usage: bcftools +setGT [General Options] -- [Plugin Options]\n"
112115 "Options:\n"
113116 " run \"bcftools plugin\" for a list of common options\n"
114117 "\n"
115118 "Plugin options:\n"
116- " -e, --exclude <expr> Exclude a genotype if true (requires -t q)\n"
117- " -i, --include <expr> include a genotype if true (requires -t q)\n"
118- " -n, --new-gt <type> Genotypes to set, see above\n"
119- " -t, --target-gt <type> Genotypes to change, see above\n"
119+ " -e, --exclude EXPR Exclude a genotype if true (requires -t q)\n"
120+ " -i, --include EXPR include a genotype if true (requires -t q)\n"
121+ " -n, --new-gt TYPE Genotypes to set, see above\n"
122+ " -s, --seed INT Random seed to use with -t r [0]\n"
123+ " -t, --target-gt TYPE Genotypes to change, see above\n"
120124 "\n"
121125 "Example:\n"
122126 " # set missing genotypes (\"./.\") to phased ref genotypes (\"0|0\")\n"
@@ -196,19 +200,25 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
196200 args -> in_hdr = in ;
197201 args -> out_hdr = out ;
198202
203+ char * tmp ;
199204 int c ;
200205 static struct option loptions [] =
201206 {
202207 {"include" ,required_argument ,NULL ,'i' },
203208 {"exclude" ,required_argument ,NULL ,'e' },
204209 {"new-gt" ,required_argument ,NULL ,'n' },
205210 {"target-gt" ,required_argument ,NULL ,'t' },
211+ {"seed" ,required_argument ,NULL ,'s' },
206212 {NULL ,0 ,NULL ,0 }
207213 };
208- while ((c = getopt_long (argc , argv , "?hn:t:i:e:" ,loptions ,NULL )) >= 0 )
214+ while ((c = getopt_long (argc , argv , "?hn:t:i:e:s: " ,loptions ,NULL )) >= 0 )
209215 {
210216 switch (c )
211217 {
218+ case 's' :
219+ args -> rand_seed = strtol (optarg ,& tmp ,10 );
220+ if ( * tmp ) error ("Could not parse: -s %s\n" ,optarg );
221+ break ;
212222 case 'e' :
213223 if ( args -> filter_str ) error ("Error: only one -i or -e expression can be given, and they cannot be combined\n" );
214224 args -> filter_str = optarg ; args -> filter_logic |= FLT_EXCLUDE ; break ;
@@ -233,6 +243,13 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
233243 if ( !strcmp (optarg ,"a" ) ) args -> tgt_mask |= GT_ALL ;
234244 if ( !strcmp (optarg ,"q" ) ) args -> tgt_mask |= GT_QUERY ;
235245 if ( !strcmp (optarg ,"?" ) ) args -> tgt_mask |= GT_QUERY ; // for backward compatibility
246+ if ( !strncmp (optarg ,"r:" ,2 ) )
247+ {
248+ args -> rand_frac = strtod (optarg + 2 ,& tmp );
249+ if ( * tmp ) error ("Could not parse: -t %s\n" , optarg );
250+ if ( args -> rand_frac <=0 || args -> rand_frac >=1 ) error ("Expected value between 0 and 1 with -t\n" );
251+ args -> tgt_mask |= GT_RAND ;
252+ }
236253 if ( strchr (optarg ,'b' ) ) parse_binom_expr (args , strchr (optarg ,'b' ));
237254 if ( args -> tgt_mask == 0 ) error ("Unknown parameter to --target-gt: %s\n" , optarg );
238255 break ;
@@ -244,6 +261,11 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
244261
245262 if ( !args -> new_mask ) error ("Expected -n option\n" );
246263 if ( !args -> tgt_mask ) error ("Expected -t option\n" );
264+ if ( args -> tgt_mask & GT_RAND )
265+ {
266+ if ( args -> tgt_mask == GT_RAND ) args -> tgt_mask |= GT_ALL ;
267+ hts_srand48 (args -> rand_seed );
268+ }
247269
248270 if ( args -> new_mask & GT_MISSING ) args -> new_gt = bcf_gt_missing ;
249271 if ( args -> new_mask & GT_REF ) args -> new_gt = args -> new_mask & GT_PHASED ? bcf_gt_phased (0 ) : bcf_gt_unphased (0 );
@@ -389,6 +411,11 @@ static inline double calc_binom(int na, int nb)
389411 return prob ;
390412}
391413
414+ static inline int random_draw (args_t * args )
415+ {
416+ return hts_drand48 () > args -> rand_frac ? 1 : 0 ; // reversed random draw
417+ }
418+
392419bcf1_t * process (bcf1_t * rec )
393420{
394421 if ( !rec -> n_sample ) return rec ;
@@ -510,6 +537,7 @@ bcf1_t *process(bcf1_t *rec)
510537
511538 double prob = calc_binom (args -> iarr [i * nbinom + ia ],args -> iarr [i * nbinom + ib ]);
512539 if ( !args -> binom_cmp (prob ,args -> binom_val ) ) continue ;
540+ if ( args -> tgt_mask & GT_RAND && random_draw (args ) ) continue ;
513541
514542 if ( args -> new_mask & GT_UNPHASED )
515543 changed += unphase_gt (ptr , ngts );
@@ -548,6 +576,7 @@ bcf1_t *process(bcf1_t *rec)
548576 for (i = 0 ; i < rec -> n_sample ; i ++ )
549577 {
550578 if ( args -> smpl_pass && !args -> smpl_pass [i ] ) continue ;
579+ if ( args -> tgt_mask & GT_RAND && random_draw (args ) ) continue ;
551580 if ( args -> new_mask & GT_UNPHASED )
552581 changed += unphase_gt (args -> gts + i * ngts , ngts );
553582 else if ( args -> new_mask == GT_PHASED )
@@ -579,6 +608,7 @@ bcf1_t *process(bcf1_t *rec)
579608 else if ( args -> tgt_mask & GT_MISSING && ploidy == nmiss ) do_set = 1 ;
580609
581610 if ( !do_set ) continue ;
611+ if ( args -> tgt_mask & GT_RAND && random_draw (args ) ) continue ;
582612
583613 if ( args -> new_mask & GT_UNPHASED )
584614 changed += unphase_gt (ptr , ngts );
0 commit comments