@@ -378,6 +378,9 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
378
378
Read the sites in the vcf and add them to the samples object, reordering the
379
379
alleles to put the ancestral allele first, if it is available.
380
380
"""
381
+ # You may want to change the following line, e.g. here we allow * (a spanning
382
+ # deletion) to be a valid allele state
383
+ allowed_allele_chars = set (" ATGCatgc*" )
381
384
pos = 0
382
385
for variant in vcf: # Loop over variants, each assumed at a unique site
383
386
if pos == variant.POS :
@@ -387,9 +390,20 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
387
390
if any ([not phased for _, _, phased in variant.genotypes]):
388
391
raise ValueError (" Unphased genotypes for variant at position" , pos)
389
392
alleles = [variant.REF ] + variant.ALT
390
- ancestral = variant.INFO .get(" AA" , variant.REF )
393
+ ancestral = variant.INFO .get(" AA" , " ." ) # "." means unknown
394
+ # some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
395
+ ancestral = ancestral.split(" |" )[0 ]
396
+ if ancestral == " ." :
397
+ # use the reference as ancestral, if unknown (NB: you may not want this)
398
+ ancestral = variant.REF
391
399
# Ancestral state must be first in the allele list.
392
400
ordered_alleles = [ancestral] + list (set (alleles) - {ancestral})
401
+ # Check we have ATCG alleles
402
+ for allele in ordered_alleles:
403
+ if len (set (allele) - allowed_allele_chars) > 0 :
404
+ raise ValueError (
405
+ " Site at pos {pos} : allele {allele} not in {allowed_allele_chars} "
406
+ )
393
407
allele_index = {
394
408
old_index: ordered_alleles.index(allele)
395
409
for old_index, allele in enumerate (alleles)
0 commit comments