@@ -60,6 +60,7 @@ typedef struct
6060{
6161 regex_t * regex ;
6262 char * type ;
63+ int bcf_ht_type ;
6364}
6465col2type_t ;
6566
@@ -170,7 +171,8 @@ static const char *default_column_types(void)
170171{
171172 return
172173 "# Default CSQ subfield types, unlisted fields are type String.\n"
173- "# Note the use of regular expressions.\n"
174+ "# Note that the name search is done using regular expressions, with\n"
175+ "# \"^\" and \"$\" appended automatically\n"
174176 "cDNA_position Integer\n"
175177 "CDS_position Integer\n"
176178 "Protein_position Integer\n"
@@ -179,6 +181,7 @@ static const char *default_column_types(void)
179181 "TSL Integer\n"
180182 "GENE_PHENO Integer\n"
181183 "HGVS_OFFSET Integer\n"
184+ ".*_POPS String\n" // e.g. MAX_AF_POPS
182185 "AF Float\n"
183186 ".*_AF Float\n"
184187 "MAX_AF_.* Float\n"
@@ -344,32 +347,34 @@ static void init_column2type(args_t *args)
344347 free (str );
345348 }
346349 if ( !type || !ntype ) error ("Failed to parse the column types\n" );
350+ kstring_t tmp = {0 ,0 ,0 };
347351 for (i = 0 ; i < ntype ; i ++ )
348352 {
349353 if ( type [i ][0 ]== '#' ) continue ;
350- char * tmp = strdup (type [i ]);
351- char * ptr = tmp ;
354+ tmp .l = 0 ;
355+ kputc ('^' ,& tmp );
356+ char * ptr = type [i ];
352357 while ( * ptr && !isspace (* ptr ) ) ptr ++ ;
353358 if ( !* ptr ) error ("Error: failed to parse the column type \"%s\"\n" ,type [i ]);
354- * ptr = 0 ;
355- ptr ++ ;
359+ kputsn ( type [ i ], ptr - type [ i ], & tmp ) ;
360+ kputc ( '$' , & tmp ) ;
356361 while ( * ptr && isspace (* ptr ) ) ptr ++ ;
357362 if ( !* ptr ) error ("Error: failed to parse the column type \"%s\"\n" ,type [i ]);
358363 args -> ncolumn2type ++ ;
359364 args -> column2type = (col2type_t * ) realloc (args -> column2type ,sizeof (* args -> column2type )* args -> ncolumn2type );
360365 col2type_t * ct = & args -> column2type [args -> ncolumn2type - 1 ];
361366 ct -> regex = (regex_t * ) malloc (sizeof (regex_t ));
362- if ( regcomp (ct -> regex , tmp , REG_NOSUB ) )
363- error ("Error: fail to compile the column type regular expression \"%s\": %s\n" , tmp ,type [i ]);
364- int type_ok = 0 ;
365- if ( !strcmp (ptr ,"Float" ) ) type_ok = 1 ;
366- else if ( !strcmp (ptr ,"Integer" ) ) type_ok = 1 ;
367- else if ( !strcmp (ptr ,"Flag" ) ) type_ok = 1 ;
368- else if ( !strcmp (ptr ,"String" ) ) type_ok = 1 ;
369- if ( ! type_ok ) error ("Error: the column type \"%s\" is not supported: %s\n" ,ptr ,type [i ]);
367+ if ( regcomp (ct -> regex , tmp . s , REG_NOSUB ) )
368+ error ("Error: fail to compile the column type regular expression \"%s\": %s\n" , tmp . s ,type [i ]);
369+ ct -> bcf_ht_type = -1 ;
370+ if ( !strcmp (ptr ,"Float" ) ) ct -> bcf_ht_type = BCF_HT_REAL ;
371+ else if ( !strcmp (ptr ,"Integer" ) ) ct -> bcf_ht_type = BCF_HT_INT ;
372+ else if ( !strcmp (ptr ,"Flag" ) ) ct -> bcf_ht_type = BCF_HT_FLAG ;
373+ else if ( !strcmp (ptr ,"String" ) ) ct -> bcf_ht_type = BCF_HT_STR ;
374+ if ( ct -> bcf_ht_type == -1 ) error ("Error: the column type \"%s\" is not supported: %s\n" ,ptr ,type [i ]);
370375 ct -> type = strdup (ptr );
371- free (tmp );
372376 }
377+ free (tmp .s );
373378 if ( !args -> ncolumn2type ) error ("Failed to parse the column types\n" );
374379 for (i = 0 ; i < ntype ; i ++ ) free (type [i ]);
375380 free (type );
@@ -387,15 +392,20 @@ static void destroy_column2type(args_t *args)
387392 args -> ncolumn2type = 0 ;
388393 args -> column2type = NULL ;
389394}
390- static const char * get_column_type (args_t * args , char * field )
395+ static const char * get_column_type (args_t * args , char * field , int * type )
391396{
392397 if ( !args -> column2type ) init_column2type (args );
393398 int i ;
394399 for (i = 0 ; i < args -> ncolumn2type ; i ++ )
395400 {
396401 int match = regexec (args -> column2type [i ].regex , field , 0 ,NULL ,0 ) ? 0 : 1 ;
397- if ( match ) return args -> column2type [i ].type ;
402+ if ( match )
403+ {
404+ * type = args -> column2type [i ].bcf_ht_type ;
405+ return args -> column2type [i ].type ;
406+ }
398407 }
408+ * type = BCF_HT_STR ;
399409 return "String" ;
400410}
401411
@@ -647,7 +657,20 @@ static void parse_column_str(args_t *args)
647657 ep ++ ;
648658 }
649659
650- // Now add each column to the VCF header and reconstruct the column_str in case it will be needed later
660+ // Prune duplicates
661+ for (i = 0 ; i < args -> nannot ; i ++ )
662+ {
663+ for (j = 0 ; j < i ; j ++ )
664+ if ( !strcmp (args -> field [column [i ]],args -> field [column [j ]]) ) break ;
665+ if ( i == j ) continue ; // unique tag, no action needed
666+ args -> nannot -- ;
667+ if ( i == args -> nannot ) break ; // the last one is to be skipped, we are done
668+ memmove (& column [i ],& column [i + 1 ],sizeof (* column )* (args -> nannot - i ));
669+ i -- ;
670+ }
671+
672+ // Now initialize each annotation, add each column to the VCF header, and reconstruct
673+ // the column_str in case it will be needed later
651674 free (args -> column_str );
652675 kstring_t str = {0 ,0 ,0 };
653676 args -> annot = (annot_t * )calloc (args -> nannot ,sizeof (* args -> annot ));
@@ -664,7 +687,7 @@ static void parse_column_str(args_t *args)
664687 else if ( ann -> type == BCF_HT_INT ) type = "Integer" ;
665688 else if ( ann -> type == BCF_HT_FLAG ) type = "Flag" ;
666689 else if ( ann -> type == BCF_HT_STR ) type = "String" ;
667- else if ( ann -> type == -1 ) type = get_column_type (args , args -> field [j ]);
690+ else if ( ann -> type == -1 ) type = get_column_type (args , args -> field [j ], & ann -> type );
668691 ksprintf (& args -> kstr ,"##INFO=<ID=%%s,Number=.,Type=%s,Description=\"The %%s field from INFO/%%s\">" ,type );
669692 bcf_hdr_printf (args -> hdr_out , args -> kstr .s , ann -> tag ,ann -> field ,args -> vep_tag );
670693 if ( str .l ) kputc (',' ,& str );
0 commit comments