@@ -93,7 +93,7 @@ typedef struct
9393 int32_t * int32_arr ;
9494 int ntmp_arr1 , ntmp_arr2 , nint32_arr ;
9595 kstring_t * tmp_str ;
96- kstring_t * tmp_als , * tmp_sym , tmp_kstr ;
96+ kstring_t * tmp_als , * tmp_sym , tmp_kstr , old_rec_tag_kstr ;
9797 int ntmp_als , ntmp_sym ;
9898 rbuf_t rbuf ;
9999 int buf_win ; // maximum distance between two records to consider
@@ -127,6 +127,42 @@ typedef struct
127127}
128128args_t ;
129129
130+ static void old_rec_tag_init (args_t * args , bcf1_t * line )
131+ {
132+ if ( !args -> old_rec_tag ) return ;
133+
134+ args -> old_rec_tag_kstr .l = 0 ;
135+ ksprintf (& args -> old_rec_tag_kstr ,"%s|%" PRIhts_pos "|%s|" ,bcf_seqname (args -> hdr ,line ),line -> pos + 1 ,line -> d .allele [0 ]);
136+ int i ;
137+ for (i = 1 ; i < line -> n_allele ; i ++ )
138+ {
139+ kputs (line -> d .allele [i ],& args -> old_rec_tag_kstr );
140+ if ( i + 1 < line -> n_allele ) kputc (',' ,& args -> old_rec_tag_kstr );
141+ }
142+ }
143+ static void old_rec_tag_set (args_t * args , bcf1_t * line , int ialt )
144+ {
145+ if ( !args -> old_rec_tag || !args -> old_rec_tag_kstr .l ) return ;
146+
147+ // only update if the tag is not present already, there can be multiple normalization steps
148+ int i , id = bcf_hdr_id2int (args -> out_hdr , BCF_DT_ID , args -> old_rec_tag );
149+ bcf_unpack (line , BCF_UN_INFO );
150+ for (i = 0 ; i < line -> n_info ; i ++ )
151+ {
152+ bcf_info_t * inf = & line -> d .info [i ];
153+ if ( inf && inf -> key == id ) return ;
154+ }
155+
156+ if ( ialt > 0 )
157+ {
158+ kputc ('|' ,& args -> old_rec_tag_kstr );
159+ kputw (ialt ,& args -> old_rec_tag_kstr );
160+ }
161+ if ( (bcf_update_info_string (args -> out_hdr , line , args -> old_rec_tag , args -> old_rec_tag_kstr .s ))!= 0 )
162+ error ("An error occurred while updating INFO/%s\n" ,args -> old_rec_tag );
163+ args -> old_rec_tag_kstr .l = 0 ;
164+ }
165+
130166static inline int replace_iupac_codes (char * seq , int nseq )
131167{
132168 // Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end
@@ -159,7 +195,8 @@ static void seq_to_upper(char *seq, int len)
159195 for (i = 0 ; seq [i ]; i ++ ) seq [i ] = nt_to_upper (seq [i ]);
160196}
161197
162- static void fix_ref (args_t * args , bcf1_t * line )
198+ // returns 0 when no fix was needed, 1 otherwise
199+ static int fix_ref (args_t * args , bcf1_t * line )
163200{
164201 bcf_unpack (line , BCF_UN_STR );
165202 int reflen = strlen (line -> d .allele [0 ]);
@@ -177,7 +214,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
177214 args -> nref .tot ++ ;
178215
179216 // is the REF different? If not, we are done
180- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
217+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return 0 ; }
181218
182219 // is the REF allele missing?
183220 if ( reflen == 1 && line -> d .allele [0 ][0 ]== '.' )
@@ -186,11 +223,11 @@ static void fix_ref(args_t *args, bcf1_t *line)
186223 args -> nref .set ++ ;
187224 free (ref );
188225 bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
189- return ;
226+ return 1 ;
190227 }
191228
192229 // does REF or ALT contain non-standard bases?
193- int has_non_acgtn = 0 ;
230+ int ret = 0 , has_non_acgtn = 0 ;
194231 for (i = 0 ; i < line -> n_allele ; i ++ )
195232 {
196233 if ( line -> d .allele [i ][0 ]== '<' ) continue ;
@@ -200,7 +237,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
200237 {
201238 args -> nref .set ++ ;
202239 bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
203- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
240+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return 1 ; }
241+ ret = 1 ;
204242 }
205243
206244 // does the REF allele contain N's ?
@@ -221,12 +259,12 @@ static void fix_ref(args_t *args, bcf1_t *line)
221259 }
222260 if ( fix )
223261 {
262+ ret = 1 ;
224263 args -> nref .set ++ ;
225264 bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
226- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
265+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ret ; }
227266 }
228267
229-
230268 // is it swapped?
231269 for (i = 1 ; i < line -> n_allele ; i ++ )
232270 {
@@ -237,6 +275,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
237275 kstring_t str = {0 ,0 ,0 };
238276 if ( i == line -> n_allele ) // none of the alternate alleles matches the reference
239277 {
278+ ret = 1 ;
240279 args -> nref .set ++ ;
241280 kputsn (ref ,reflen ,& str );
242281 for (i = 1 ; i < line -> n_allele ; i ++ )
@@ -247,7 +286,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
247286 bcf_update_alleles_str (args -> out_hdr ,line ,str .s );
248287 free (ref );
249288 free (str .s );
250- return ;
289+ return ret ;
251290 }
252291
253292 // one of the alternate alleles matches the reference, assume it's a simple swap
@@ -289,6 +328,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
289328 ac [i - 1 ] = ni ;
290329 bcf_update_info_int32 (args -> out_hdr , line , "AC" , ac , nac );
291330 }
331+ return 1 ;
292332}
293333
294334static void fix_dup_alt (args_t * args , bcf1_t * line )
@@ -334,34 +374,35 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
334374 if ( changed ) bcf_update_genotypes (args -> out_hdr ,line ,gts ,ngts );
335375}
336376
337- static void set_old_rec_tag (args_t * args , bcf1_t * dst , bcf1_t * src , int ialt )
338- {
339- if ( !args -> old_rec_tag ) return ;
340-
341- // only update if the tag is not present already, there can be multiple normalization steps
342- int i , id = bcf_hdr_id2int (args -> out_hdr , BCF_DT_ID , args -> old_rec_tag );
343- bcf_unpack (dst , BCF_UN_INFO );
344- for (i = 0 ; i < dst -> n_info ; i ++ )
345- {
346- bcf_info_t * inf = & dst -> d .info [i ];
347- if ( inf && inf -> key == id ) return ;
348- }
349-
350- args -> tmp_kstr .l = 0 ;
351- ksprintf (& args -> tmp_kstr ,"%s|%" PRIhts_pos "|%s|" ,bcf_seqname (args -> hdr ,src ),src -> pos + 1 ,src -> d .allele [0 ]);
352- for (i = 1 ; i < src -> n_allele ; i ++ )
353- {
354- kputs (src -> d .allele [i ],& args -> tmp_kstr );
355- if ( i + 1 < src -> n_allele ) kputc (',' ,& args -> tmp_kstr );
356- }
357- if ( ialt > 0 )
358- {
359- kputc ('|' ,& args -> tmp_kstr );
360- kputw (ialt ,& args -> tmp_kstr );
361- }
362- if ( (bcf_update_info_string (args -> out_hdr , dst , args -> old_rec_tag , args -> tmp_kstr .s ))!= 0 )
363- error ("An error occurred while updating INFO/%s\n" ,args -> old_rec_tag );
364- }
377+ // static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
378+ // {
379+ // fprintf(stderr,"remove me\n");
380+ // if ( !args->old_rec_tag ) return;
381+ //
382+ // // only update if the tag is not present already, there can be multiple normalization steps
383+ // int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
384+ // bcf_unpack(dst, BCF_UN_INFO);
385+ // for (i=0; i<dst->n_info; i++)
386+ // {
387+ // bcf_info_t *inf = &dst->d.info[i];
388+ // if ( inf && inf->key == id ) return;
389+ // }
390+ //
391+ // args->tmp_kstr.l = 0;
392+ // ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
393+ // for (i=1; i<src->n_allele; i++)
394+ // {
395+ // kputs(src->d.allele[i],&args->tmp_kstr);
396+ // if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);
397+ // }
398+ // if ( ialt>0 )
399+ // {
400+ // kputc('|',&args->tmp_kstr);
401+ // kputw(ialt,&args->tmp_kstr);
402+ // }
403+ // if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
404+ // error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
405+ // }
365406
366407static int is_left_align (args_t * args , bcf1_t * line )
367408{
@@ -523,6 +564,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line)
523564static int realign (args_t * args , bcf1_t * line )
524565{
525566 bcf_unpack (line , BCF_UN_STR );
567+ old_rec_tag_init (args ,line );
526568
527569 // Sanity check REF
528570 int i , nref , reflen = strlen (line -> d .allele [0 ]);
@@ -655,7 +697,7 @@ static int realign(args_t *args, bcf1_t *line)
655697 }
656698 if ( new_pos == line -> pos && !strcasecmp (line -> d .allele [0 ],als [0 ].s ) ) return ERR_OK ;
657699
658- set_old_rec_tag (args , line , line , 0 );
700+ old_rec_tag_set (args , line , 0 );
659701
660702 // Create new block of alleles and update
661703 args -> tmp_kstr .l = 0 ;
@@ -1247,6 +1289,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
12471289 if ( !args -> tmp_lines [i ] ) args -> tmp_lines [i ] = bcf_init1 ();
12481290 bcf1_t * dst = args -> tmp_lines [i ];
12491291 bcf_clear (dst );
1292+ old_rec_tag_init (args ,line );
12501293
12511294 dst -> rid = line -> rid ;
12521295 dst -> pos = line -> pos ;
@@ -1271,7 +1314,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
12711314 else if ( type == BCF_HT_FLAG ) split_info_flag (args , line , info , i , dst );
12721315 else split_info_string (args , line , info , i , dst );
12731316 }
1274- set_old_rec_tag (args , dst , line , i + 1 ); // 1-based indexes
1317+ old_rec_tag_set (args , dst , i + 1 ); // 1-based indexes
12751318
12761319 dst -> n_sample = line -> n_sample ;
12771320 for (j = 0 ; j < line -> n_fmt ; j ++ )
@@ -2246,6 +2289,7 @@ static void destroy_data(args_t *args)
22462289 free (args -> tmp_als );
22472290 free (args -> tmp_sym );
22482291 free (args -> tmp_kstr .s );
2292+ free (args -> old_rec_tag_kstr .s );
22492293 if ( args -> tmp_str )
22502294 {
22512295 for (i = 0 ; i < bcf_hdr_nsamples (args -> hdr ); i ++ ) free (args -> tmp_str [i ].s );
@@ -2269,7 +2313,11 @@ static void normalize_line(args_t *args, bcf1_t *line)
22692313{
22702314 if ( args -> fai )
22712315 {
2272- if ( args -> filter_pass && (args -> check_ref & CHECK_REF_FIX ) ) fix_ref (args , line );
2316+ if ( args -> filter_pass && (args -> check_ref & CHECK_REF_FIX ) )
2317+ {
2318+ old_rec_tag_init (args ,line );
2319+ if ( fix_ref (args ,line ) ) old_rec_tag_set (args ,line ,0 );
2320+ }
22732321 if ( args -> do_indels )
22742322 {
22752323 int ret = args -> filter_pass ? realign (args , line ) : ERR_OK ;
0 commit comments