Skip to content

Commit 3d4bb61

Browse files
committed
Merge pull request #50 from andrewjpage/only_acgt
allow pure bases and monomorphic, making it work with BEAST
2 parents 0581ad0 + f3fff50 commit 3d4bb61

19 files changed

+228
-37
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.2
1+
2.3.0

snp-sites.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ SNP-SITES(1)
55

66
NAME
77
----
8-
snp-sites - finds snp sites from a multi fasta alignment file
8+
snp-sites - finds SNP sites from a multi FASTA alignment file
99

1010
SYNOPSIS
1111
--------
@@ -33,6 +33,12 @@ OPTIONS
3333
*-p*::
3434
Output a phylip file
3535

36+
*-c*::
37+
Only output columns containing exclusively ACGT
38+
39+
*-b*::
40+
Output monomorphic sites, useful for BEAST
41+
3642
*-o*::
3743
Specify an output filename
3844

@@ -49,6 +55,8 @@ EXAMPLES
4955
snp-sites my-alignment.aln
5056

5157
snp-sites my-gzipped-alignment.aln.gz
58+
59+
snp-sites -cb -o output_beast_format.aln inputfile.aln
5260

5361

5462
FORMAT OF THE INPUT FILE

src/alignment-file.c

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ void get_bases_for_each_snp(char filename[], char ** bases_for_snps)
105105
gzclose(fp);
106106
}
107107

108-
void detect_snps(char filename[])
108+
void detect_snps(char filename[], int pure_mode, int output_monomorphic)
109109
{
110110
int i;
111111
int l;
@@ -134,16 +134,27 @@ void detect_snps(char filename[])
134134
}
135135
for(i = 0; i < length_of_genome; i++)
136136
{
137+
if(first_sequence[i] == '#')
138+
{
139+
continue;
140+
}
141+
137142
if(first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i]))
138143
{
139144
first_sequence[i] = toupper(seq->seq.s[i]);
140145
pseudo_reference_sequence[i] = toupper(seq->seq.s[i]);
141146
}
142-
143-
if(first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && first_sequence[i] != toupper(seq->seq.s[i]))
147+
148+
// in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded
149+
if(pure_mode && !is_pure(seq->seq.s[i]))
150+
{
151+
first_sequence[i] = '#';
152+
continue;
153+
}
154+
155+
if(first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && first_sequence[i] != toupper(seq->seq.s[i]))
144156
{
145157
first_sequence[i] = '>';
146-
number_of_snps++;
147158
}
148159
}
149160

@@ -157,18 +168,26 @@ void detect_snps(char filename[])
157168
number_of_samples++;
158169
}
159170

171+
for(i = 0; i < length_of_genome; i++)
172+
{
173+
if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
174+
{
175+
number_of_snps++;
176+
}
177+
}
178+
160179
if(number_of_snps == 0)
161180
{
162181
fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n");
163182
fflush(stderr);
164183
exit(EXIT_FAILURE);
165184
}
166-
185+
167186
int current_snp_index = 0;
168187
snp_locations = calloc(number_of_snps, sizeof(int));
169188
for(i = 0; i < length_of_genome; i++)
170189
{
171-
if(first_sequence[i] == '>')
190+
if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
172191
{
173192
snp_locations[current_snp_index] = i;
174193
current_snp_index++;
@@ -192,3 +211,20 @@ int is_unknown(char base)
192211
return 0;
193212
}
194213
}
214+
215+
int is_pure(char base)
216+
{
217+
switch (base) {
218+
case 'A':
219+
case 'C':
220+
case 'G':
221+
case 'T':
222+
case 'a':
223+
case 'c':
224+
case 'g':
225+
case 't':
226+
return 1;
227+
default:
228+
return 0;
229+
}
230+
}

src/alignment-file.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
#include "kseq.h"
2424

25-
void detect_snps( char filename[]);
25+
void detect_snps( char filename[], int pure_mode, int output_monomorphic);
2626
void get_bases_for_each_snp(char filename[], char ** bases_for_snps);
2727
int is_unknown(char base);
2828
int get_length_of_genome();
@@ -31,6 +31,7 @@ int get_number_of_snps();
3131
char ** get_sequence_names();
3232
int * get_snp_locations();
3333
char * get_pseudo_reference_sequence();
34+
int is_pure(char base);
3435

3536
#define MAX_SAMPLE_NAME_SIZE 2048
3637
#define DEFAULT_NUM_SAMPLES 65536

src/main.c

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,21 @@
3232

3333
static void print_usage()
3434
{
35-
printf("Usage: snp-sites [-mvph] [-o output_filename] <file>\n");
36-
printf("This program finds snp sites from a multi FASTA alignment file.\n");
37-
printf(" -r output internal pseudo reference sequence\n");
38-
printf(" -m output a multi fasta alignment file (default)\n");
39-
printf(" -v output a VCF file\n");
40-
printf(" -p output a phylip file\n");
41-
printf(" -o specify an output filename\n");
42-
printf(" -h this help message\n");
43-
printf(" -V print version and exit\n");
44-
printf(" <file> input alignment file which can optionally be gzipped\n\n");
35+
printf("Usage: snp-sites [-rmvpcbhV] [-o output_filename] <file>\n");
36+
printf("This program finds snp sites from a multi FASTA alignment file.\n");
37+
printf(" -r output internal pseudo reference sequence\n");
38+
printf(" -m output a multi fasta alignment file (default)\n");
39+
printf(" -v output a VCF file\n");
40+
printf(" -p output a phylip file\n");
41+
printf(" -o STR specify an output filename\n");
42+
printf(" -c only output columns containing exclusively ACGT\n");
43+
printf(" -b output monomorphic sites, used for BEAST\n");
44+
printf(" -h this help message\n");
45+
printf(" -V print version and exit\n");
46+
printf(" <file> input alignment file which can optionally be gzipped\n\n");
47+
48+
printf("Example: creating files for BEAST\n");
49+
printf(" snp-sites -cb -o outputfile.aln inputfile.aln\n\n");
4550

4651
printf("If you use this program, please cite:\n");
4752
printf("\"SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments\",\n");
@@ -65,8 +70,10 @@ int main (int argc, char **argv) {
6570
int output_vcf_file = 0;
6671
int output_phylip_file = 0;
6772
int output_reference = 0;
68-
69-
while ((c = getopt (argc, argv, "mvrpo:V")) != -1)
73+
int pure_mode = 0;
74+
int output_monomorphic =0;
75+
76+
while ((c = getopt (argc, argv, "mvrbpco:V")) != -1)
7077
switch (c)
7178
{
7279
case 'm':
@@ -83,6 +90,12 @@ int main (int argc, char **argv) {
8390
break;
8491
case 'r':
8592
output_reference = 1;
93+
break;
94+
case 'c':
95+
pure_mode = 1;
96+
break;
97+
case 'b':
98+
output_monomorphic = 1;
8699
break;
87100
case 'o':
88101
strncpy(output_filename, optarg, FILENAME_MAX);
@@ -105,7 +118,15 @@ int main (int argc, char **argv) {
105118
}
106119

107120
strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX);
108-
if (output_reference) {
121+
122+
if( pure_mode || output_monomorphic)
123+
{
124+
generate_snp_sites_with_ref_pure_mono(multi_fasta_filename,
125+
output_multi_fasta_file,
126+
output_vcf_file, output_phylip_file,
127+
output_filename,output_reference,pure_mode,output_monomorphic);
128+
}
129+
else if(output_reference ) {
109130
generate_snp_sites_with_ref(multi_fasta_filename,
110131
output_multi_fasta_file,
111132
output_vcf_file, output_phylip_file,

src/snp-sites.c

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ static int generate_snp_sites_generic(char filename[],
3232
int output_vcf_file,
3333
int output_phylip_file,
3434
char output_filename[],
35-
int output_reference)
35+
int output_reference, int pure_mode, int output_monomorphic)
3636
{
3737
int i;
38-
detect_snps(filename);
38+
detect_snps(filename, pure_mode, output_monomorphic);
3939

4040
char* bases_for_snps[get_number_of_snps()];
4141

@@ -112,7 +112,7 @@ int generate_snp_sites(char filename[], int output_multi_fasta_file,
112112
{
113113
return generate_snp_sites_generic(filename, output_multi_fasta_file,
114114
output_vcf_file, output_phylip_file,
115-
output_filename, 0);
115+
output_filename, 0,0,0);
116116
}
117117

118118
int generate_snp_sites_with_ref(char filename[], int output_multi_fasta_file,
@@ -121,9 +121,24 @@ int generate_snp_sites_with_ref(char filename[], int output_multi_fasta_file,
121121
{
122122
return generate_snp_sites_generic(filename, output_multi_fasta_file,
123123
output_vcf_file, output_phylip_file,
124-
output_filename, 1);
124+
output_filename, 1,0,0);
125125
}
126126

127+
int generate_snp_sites_with_ref_pure_mono(char filename[],
128+
int output_multi_fasta_file,
129+
int output_vcf_file,
130+
int output_phylip_file,
131+
char output_filename[],
132+
int output_reference,
133+
int pure_mode,
134+
int output_monomorphic)
135+
{
136+
return generate_snp_sites_generic(filename, output_multi_fasta_file,
137+
output_vcf_file, output_phylip_file,
138+
output_filename, output_reference, pure_mode, output_monomorphic);
139+
}
140+
141+
127142
// Inefficient
128143
void strip_directory_from_filename(char * input_filename, char * output_filename)
129144
{

src/snp-sites.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ int generate_snp_sites_with_ref(char filename[],
3333
int output_vcf_file,
3434
int output_phylip_file,
3535
char output_filename[]);
36+
37+
int generate_snp_sites_with_ref_pure_mono(char filename[],
38+
int output_multi_fasta_file,
39+
int output_vcf_file,
40+
int output_phylip_file,
41+
char output_filename[],
42+
int output_reference,
43+
int pure_mode,
44+
int output_monomorphic);
45+
3646
void strip_directory_from_filename(char *input_filename,
3747
char *output_filename);
3848

src/vcf.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_locat
113113
free(alt_bases);
114114

115115
fprintf( vcf_file_pointer, "\n");
116+
116117
}
117118

118119

@@ -145,14 +146,15 @@ char * alternative_bases(char reference_base, char * bases_for_snp, int number_o
145146
}
146147
}
147148
}
149+
alt_bases[num_alt_bases] = '\0';
148150
return alt_bases;
149151
}
150152

151153
char * format_allele_index(char base, char reference_base, char * alt_bases)
152154
{
153155
int length_of_alt_bases = strlen(alt_bases);
154156
assert(length_of_alt_bases < 100);
155-
char * result = calloc(3, sizeof(char));
157+
char * result = calloc(5, sizeof(char));
156158
int index;
157159

158160
if(is_unknown(base))
@@ -183,6 +185,14 @@ char * format_alternative_bases(char * alt_bases)
183185
{
184186
int number_of_alt_bases = strlen(alt_bases);
185187
assert( number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES );
188+
189+
if(number_of_alt_bases ==0)
190+
{
191+
char * formatted_alt_bases = calloc(2 + 1, sizeof(char));
192+
formatted_alt_bases[0] = '.';
193+
return formatted_alt_bases;
194+
}
195+
186196
char * formatted_alt_bases = calloc(number_of_alt_bases*2 + 1, sizeof(char));
187197
int i;
188198
formatted_alt_bases[0] = alt_bases[0];

0 commit comments

Comments
 (0)