Skip to content

Commit 4014f7e

Browse files
committed
Fix the lost ability to filter on subfield names
- Numeric subfields could not be filtered with e.g. -c 'MAX_AF:Float' -i 'MAX_AF<0.001' - Interpret default column type names (--columns-types) as entire strings, rather than substrings to avoid unexpected spurious matches (i.e. internally add ^ and $ to all field names). - Remove duplicate subfields Resolves #2039
1 parent 92f911b commit 4014f7e

File tree

6 files changed

+58
-24
lines changed

6 files changed

+58
-24
lines changed

NEWS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,14 @@ Changes affecting specific commands:
101101

102102
- Include sample name in the output header with `-H` whenever it makes sense (#1992)
103103

104+
* bcftools +split-vep
105+
106+
- Fix on-the-fly filtering involving numeric subfields, e.g. `-i 'MAX_AF<0.001'` (#2039)
107+
108+
- Interpret default column type names (--columns-types) as entire strings, rather than
109+
substrings to avoid unexpected spurious matches (i.e. internally add ^ and $ to all
110+
field names)
111+
104112
* bcftools view
105113

106114
- Add new `-A, --trim-unseen-allele` option to remove the unseen allele <*> or <NON_REF>

plugins/split-vep.c

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ typedef struct
6060
{
6161
regex_t *regex;
6262
char *type;
63+
int bcf_ht_type;
6364
}
6465
col2type_t;
6566

@@ -170,7 +171,8 @@ static const char *default_column_types(void)
170171
{
171172
return
172173
"# Default CSQ subfield types, unlisted fields are type String.\n"
173-
"# Note the use of regular expressions.\n"
174+
"# Note that the name search is done using regular expressions, with\n"
175+
"# \"^\" and \"$\" appended automatically\n"
174176
"cDNA_position Integer\n"
175177
"CDS_position Integer\n"
176178
"Protein_position Integer\n"
@@ -179,6 +181,7 @@ static const char *default_column_types(void)
179181
"TSL Integer\n"
180182
"GENE_PHENO Integer\n"
181183
"HGVS_OFFSET Integer\n"
184+
".*_POPS String\n" // e.g. MAX_AF_POPS
182185
"AF Float\n"
183186
".*_AF Float\n"
184187
"MAX_AF_.* Float\n"
@@ -344,32 +347,34 @@ static void init_column2type(args_t *args)
344347
free(str);
345348
}
346349
if ( !type || !ntype ) error("Failed to parse the column types\n");
350+
kstring_t tmp = {0,0,0};
347351
for (i=0; i<ntype; i++)
348352
{
349353
if ( type[i][0]=='#' ) continue;
350-
char *tmp = strdup(type[i]);
351-
char *ptr = tmp;
354+
tmp.l = 0;
355+
kputc('^',&tmp);
356+
char *ptr = type[i];
352357
while ( *ptr && !isspace(*ptr) ) ptr++;
353358
if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
354-
*ptr = 0;
355-
ptr++;
359+
kputsn(type[i],ptr-type[i],&tmp);
360+
kputc('$',&tmp);
356361
while ( *ptr && isspace(*ptr) ) ptr++;
357362
if ( !*ptr ) error("Error: failed to parse the column type \"%s\"\n",type[i]);
358363
args->ncolumn2type++;
359364
args->column2type = (col2type_t*) realloc(args->column2type,sizeof(*args->column2type)*args->ncolumn2type);
360365
col2type_t *ct = &args->column2type[args->ncolumn2type-1];
361366
ct->regex = (regex_t *) malloc(sizeof(regex_t));
362-
if ( regcomp(ct->regex, tmp, REG_NOSUB) )
363-
error("Error: fail to compile the column type regular expression \"%s\": %s\n", tmp,type[i]);
364-
int type_ok = 0;
365-
if ( !strcmp(ptr,"Float") ) type_ok = 1;
366-
else if ( !strcmp(ptr,"Integer") ) type_ok = 1;
367-
else if ( !strcmp(ptr,"Flag") ) type_ok = 1;
368-
else if ( !strcmp(ptr,"String") ) type_ok = 1;
369-
if ( !type_ok ) error("Error: the column type \"%s\" is not supported: %s\n",ptr,type[i]);
367+
if ( regcomp(ct->regex, tmp.s, REG_NOSUB) )
368+
error("Error: fail to compile the column type regular expression \"%s\": %s\n", tmp.s,type[i]);
369+
ct->bcf_ht_type = -1;
370+
if ( !strcmp(ptr,"Float") ) ct->bcf_ht_type = BCF_HT_REAL;
371+
else if ( !strcmp(ptr,"Integer") ) ct->bcf_ht_type = BCF_HT_INT;
372+
else if ( !strcmp(ptr,"Flag") ) ct->bcf_ht_type = BCF_HT_FLAG;
373+
else if ( !strcmp(ptr,"String") ) ct->bcf_ht_type = BCF_HT_STR;
374+
if ( ct->bcf_ht_type==-1 ) error("Error: the column type \"%s\" is not supported: %s\n",ptr,type[i]);
370375
ct->type = strdup(ptr);
371-
free(tmp);
372376
}
377+
free(tmp.s);
373378
if ( !args->ncolumn2type ) error("Failed to parse the column types\n");
374379
for (i=0; i<ntype; i++) free(type[i]);
375380
free(type);
@@ -387,15 +392,20 @@ static void destroy_column2type(args_t *args)
387392
args->ncolumn2type = 0;
388393
args->column2type = NULL;
389394
}
390-
static const char *get_column_type(args_t *args, char *field)
395+
static const char *get_column_type(args_t *args, char *field, int *type)
391396
{
392397
if ( !args->column2type ) init_column2type(args);
393398
int i;
394399
for (i=0; i<args->ncolumn2type; i++)
395400
{
396401
int match = regexec(args->column2type[i].regex, field, 0,NULL,0) ? 0 : 1;
397-
if ( match ) return args->column2type[i].type;
402+
if ( match )
403+
{
404+
*type = args->column2type[i].bcf_ht_type;
405+
return args->column2type[i].type;
406+
}
398407
}
408+
*type = BCF_HT_STR;
399409
return "String";
400410
}
401411

@@ -647,7 +657,20 @@ static void parse_column_str(args_t *args)
647657
ep++;
648658
}
649659

650-
// Now add each column to the VCF header and reconstruct the column_str in case it will be needed later
660+
// Prune duplicates
661+
for (i=0; i<args->nannot; i++)
662+
{
663+
for (j=0; j<i; j++)
664+
if ( !strcmp(args->field[column[i]],args->field[column[j]]) ) break;
665+
if ( i==j ) continue; // unique tag, no action needed
666+
args->nannot--;
667+
if ( i==args->nannot ) break; // the last one is to be skipped, we are done
668+
memmove(&column[i],&column[i+1],sizeof(*column)*(args->nannot-i));
669+
i--;
670+
}
671+
672+
// Now initialize each annotation, add each column to the VCF header, and reconstruct
673+
// the column_str in case it will be needed later
651674
free(args->column_str);
652675
kstring_t str = {0,0,0};
653676
args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot));
@@ -664,7 +687,7 @@ static void parse_column_str(args_t *args)
664687
else if ( ann->type==BCF_HT_INT ) type = "Integer";
665688
else if ( ann->type==BCF_HT_FLAG ) type = "Flag";
666689
else if ( ann->type==BCF_HT_STR ) type = "String";
667-
else if ( ann->type==-1 ) type = get_column_type(args, args->field[j]);
690+
else if ( ann->type==-1 ) type = get_column_type(args, args->field[j], &ann->type);
668691
ksprintf(&args->kstr,"##INFO=<ID=%%s,Number=.,Type=%s,Description=\"The %%s field from INFO/%%s\">",type);
669692
bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag);
670693
if ( str.l ) kputc(',',&str);

test/split-vep.18.out

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)