Skip to content

Commit 8e2bb64

Browse files
committed
Merge pull request #35 from bewt85/bt5_496468_qm_bases
496468: Allow '?' in fasta input
2 parents cc4c259 + 527d09f commit 8e2bb64

File tree

5 files changed

+24
-11
lines changed

5 files changed

+24
-11
lines changed

src/alignment-file.c

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -181,6 +181,18 @@ int build_reference_sequence_and_truncate(char reference_sequence[], char filena
181181
return 1;
182182
}
183183

184+
/*
 * Report whether a base is a placeholder rather than a called nucleotide.
 *
 * base: a single character taken from an alignment.
 *
 * Returns 1 when base is 'N' (in either case), '-' or '?', and 0 for any
 * other character.
 */
int is_unknown(char base)
{
    /*
     * toupper() is only defined for values representable as unsigned char
     * (or EOF). Plain char may be signed, so bytes >= 0x80 would be passed
     * as negative values; convert to unsigned char first.
     */
    switch (toupper((unsigned char) base)) {
        case 'N':
        case '-':
        case '?':
            return 1;
        default:
            return 0;
    }
}
195+
184196
int detect_snps(char reference_sequence[], char filename[], size_t length_of_genome)
185197
{
186198
int i;
@@ -199,12 +211,12 @@ int detect_snps(char reference_sequence[], char filename[], size_t length_of_gen
199211
for(i = 0; i < length_of_genome; i++)
200212
{
201213
// If there is an indel in the reference sequence, replace with the first proper base you find
202-
if((reference_sequence[i] == '-' && seq->seq.s[i] != '-' ) || (toupper(reference_sequence[i]) == 'N' && seq->seq.s[i] != 'N' ))
214+
if(is_unknown(reference_sequence[i]) && !is_unknown(seq->seq.s[i]))
203215
{
204216
reference_sequence[i] = toupper(seq->seq.s[i]);
205217
}
206218

207-
if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && toupper(seq->seq.s[i]) != 'N' && reference_sequence[i] != toupper(seq->seq.s[i]))
219+
if(! is_unknown(reference_sequence[i]) && reference_sequence[i] != '*' && ! is_unknown(seq->seq.s[i]) && (reference_sequence[i] != toupper(seq->seq.s[i])))
208220
{
209221
reference_sequence[i] = '*';
210222
number_of_snps++;

src/alignment-file.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222

2323
#include "kseq.h"
2424

25+
/* Returns 1 when base is 'N' (in either case), '-' or '?'; otherwise 0. */
int is_unknown(char base);
2526
int detect_snps(char reference_sequence[], char filename[], size_t length_of_genome);
2627
int line_length(FILE * alignment_file_pointer);
2728
int build_reference_sequence(char reference_sequence[], char filename[]);

src/parse-phylip.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ int does_column_contain_snps(int snp_column, char reference_base)
4242
return 0;
4343
}
4444

45-
if(sequences[i][snp_column] != '-' && toupper(sequences[i][snp_column]) != 'N' && sequences[i][snp_column] != reference_base)
45+
if(!is_unknown(sequences[i][snp_column]) && sequences[i][snp_column] != reference_base)
4646
{
4747
return 1;
4848
}

src/vcf.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ char * alternative_bases(char reference_base, char * bases_for_snp, int number_o
121121
char * alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES+1, sizeof(char));
122122
for(i=0; i< number_of_samples; i++ )
123123
{
124-
if((bases_for_snp[i] != reference_base) && (bases_for_snp[i] != '-') && (toupper(bases_for_snp[i]) != 'N') )
124+
if(!is_unknown(bases_for_snp[i]) && (bases_for_snp[i] != reference_base))
125125
{
126126
if(check_if_char_in_string(alt_bases, bases_for_snp[i], num_alt_bases) == 0)
127127
{
@@ -145,7 +145,7 @@ char * format_allele_index(char base, char reference_base, char * alt_bases)
145145
assert(length_of_alt_bases < 100);
146146
char * result = calloc(3, sizeof(char));
147147
int index;
148-
if (reference_base == base || toupper(base) == 'N' || base == '-')
148+
if (reference_base == base || is_unknown(base))
149149
{
150150
sprintf(result, "0");
151151
}

0 commit comments

Comments
 (0)