2626#include "vcf.h"
2727#include "alignment-file.h"
2828#include "snp-sites.h"
29-
30-
29+ #include <assert.h>
3130
3231void create_vcf_file (char filename [], int snp_locations [],int number_of_snps , char * * bases_for_snps , char * * sequence_names , int number_of_samples )
3332{
@@ -56,7 +55,7 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num
5655{
5756 int i ;
5857 fprintf ( vcf_file_pointer , "##fileformat=VCFv4.1\n" );
59- fprintf ( vcf_file_pointer , "##INFO =<ID=AB ,Number=1,Type=String,Description=\"Alt Base \">\n" );
58+ fprintf ( vcf_file_pointer , "##FORMAT =<ID=GT ,Number=1,Type=String,Description=\"Genotype \">\n" );
6059 fprintf ( vcf_file_pointer , "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" );
6160
6261 for (i = 0 ; i < number_of_samples ; i ++ )
@@ -69,7 +68,6 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num
6968void output_vcf_row (FILE * vcf_file_pointer , char * bases_for_snp , int snp_location , int number_of_samples )
7069{
7170 char reference_base = bases_for_snp [0 ];
72- char alt_bases [30 ];
7371 if (reference_base == '\0' )
7472 {
7573 return ;
@@ -90,53 +88,94 @@ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_locat
9088 // ALT
9189 // Need to look through list and find unique characters
9290
93- alternative_bases (reference_base , bases_for_snp , alt_bases , number_of_samples );
94- fprintf ( vcf_file_pointer , "%s\t" , alt_bases );
91+ char * alt_bases = alternative_bases (reference_base , bases_for_snp , number_of_samples );
92+ char * alternative_bases_string = format_alternative_bases (alt_bases );
93+ fprintf ( vcf_file_pointer , "%s\t" , alternative_bases_string );
94+ free (alternative_bases_string );
9595
9696 // QUAL
9797 fprintf ( vcf_file_pointer , ".\t" );
9898
9999 // FILTER
100100 fprintf ( vcf_file_pointer , ".\t" );
101101
102- // FORMAT
103- fprintf ( vcf_file_pointer , "AB\t" );
104-
105102 // INFO
106103 fprintf ( vcf_file_pointer , ".\t" );
107104
105+ // FORMAT
106+ fprintf ( vcf_file_pointer , "GT\t" );
107+
108108 // Bases for each sample
109- output_vcf_row_samples_bases (vcf_file_pointer , reference_base , bases_for_snp , number_of_samples );
109+ output_vcf_row_samples_bases (vcf_file_pointer , reference_base , alt_bases , bases_for_snp , number_of_samples );
110+ free (alt_bases );
110111
111112 fprintf ( vcf_file_pointer , "\n" );
112113}
113114
114115
115- void alternative_bases (char reference_base , char * bases_for_snp , char alt_bases [] , int number_of_samples )
116+ char * alternative_bases (char reference_base , char * bases_for_snp , int number_of_samples )
116117{
117118 int i ;
118119 int num_alt_bases = 0 ;
120+ char * alt_bases = calloc (MAXIMUM_NUMBER_OF_ALT_BASES + 1 , sizeof (char ));
119121 for (i = 0 ; i < number_of_samples ; i ++ )
120122 {
121123 if ((bases_for_snp [i ] != reference_base ) && (bases_for_snp [i ] != '-' ) && (toupper (bases_for_snp [i ]) != 'N' ) )
122124 {
123125 if (check_if_char_in_string (alt_bases , bases_for_snp [i ], num_alt_bases ) == 0 )
124126 {
127+ if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES )
128+ {
129+ fprintf (stderr , "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n" );
130+ fflush (stderr );
131+ exit (EXIT_FAILURE );
132+ }
125133 alt_bases [num_alt_bases ] = bases_for_snp [i ];
126134 num_alt_bases ++ ;
127- alt_bases [num_alt_bases ] = ',' ;
128- num_alt_bases ++ ;
129135 }
130136 }
131137 }
132- if (num_alt_bases > 0 && alt_bases [num_alt_bases - 1 ] == ',' )
138+ return alt_bases ;
139+ }
140+
141+ char * format_allele_index (char base , char reference_base , char * alt_bases )
142+ {
143+ int length_of_alt_bases = strlen (alt_bases );
144+ assert (length_of_alt_bases < 100 );
145+ char * result = calloc (3 , sizeof (char ));
146+ int index ;
147+ if (reference_base == base || toupper (base ) == 'N' || base == '-' )
133148 {
134- alt_bases [ num_alt_bases - 1 ] = '\0' ;
149+ sprintf ( result , "0" ) ;
135150 }
136151 else
137152 {
138- alt_bases [num_alt_bases ] = '\0' ;
153+ sprintf (result , "." );
154+ for (index = 1 ; index <= length_of_alt_bases ; index ++ )
155+ {
156+ if (alt_bases [index - 1 ] == base )
157+ {
158+ sprintf (result , "%i" , index );
159+ break ;
160+ }
161+ }
162+ }
163+ return result ;
164+ }
165+
166+ char * format_alternative_bases (char * alt_bases )
167+ {
168+ int number_of_alt_bases = strlen (alt_bases );
169+ assert ( number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES );
170+ char * formatted_alt_bases = calloc (number_of_alt_bases * 2 + 1 , sizeof (char ));
171+ int i ;
172+ formatted_alt_bases [0 ] = alt_bases [0 ];
173+ for (i = 1 ; i < number_of_alt_bases ; i ++ )
174+ {
175+ formatted_alt_bases [i * 2 - 1 ] = ',' ;
176+ formatted_alt_bases [i * 2 ] = alt_bases [i ];
139177 }
178+ return formatted_alt_bases ;
140179}
141180
142181int check_if_char_in_string (char search_string [], char target_char , int search_string_length )
@@ -152,20 +191,16 @@ int check_if_char_in_string(char search_string[], char target_char, int search_s
152191 return 0 ;
153192}
154193
155- void output_vcf_row_samples_bases (FILE * vcf_file_pointer , char reference_base , char * bases_for_snp , int number_of_samples )
194+ void output_vcf_row_samples_bases (FILE * vcf_file_pointer , char reference_base , char * alt_bases , char * bases_for_snp , int number_of_samples )
156195{
157196 int i ;
197+ char * format ;
158198
159199 for (i = 0 ; i < number_of_samples ; i ++ )
160200 {
161- if ((bases_for_snp [i ] == reference_base ) || (bases_for_snp [i ] == '-' ) || (toupper (bases_for_snp [i ]) == 'N' ) )
162- {
163- fprintf ( vcf_file_pointer , "." );
164- }
165- else
166- {
167- fprintf ( vcf_file_pointer , "%c" , (char ) bases_for_snp [i ] );
168- }
201+ format = format_allele_index (bases_for_snp [i ], reference_base , alt_bases );
202+ fprintf ( vcf_file_pointer , "%s" , format );
203+ free (format );
169204 if (i + 1 != number_of_samples )
170205 {
171206 fprintf ( vcf_file_pointer , "\t" );
0 commit comments