@@ -380,33 +380,32 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
380
380
"""
381
381
# You may want to change the following line, e.g. here we allow * (a spanning
382
382
# deletion) to be a valid allele state
383
- allowed_allele_chars = set (" ATGCatgc*" )
383
+ allele_chars = set (" ATGCatgc*" )
384
384
pos = 0
385
385
for variant in vcf: # Loop over variants, each assumed at a unique site
386
386
if pos == variant.POS :
387
- raise ValueError (" Duplicate positions for variant at position" , pos)
387
+ print (f " Duplicate entries at position { pos} , ignoring all but the first " )
388
+ continue
388
389
else :
389
390
pos = variant.POS
390
391
if any ([not phased for _, _, phased in variant.genotypes]):
391
392
raise ValueError (" Unphased genotypes for variant at position" , pos)
392
- alleles = [variant.REF ] + variant.ALT
393
+ alleles = [variant.REF .upper() ] + [v.upper() for v in variant.ALT ]
393
394
ancestral = variant.INFO .get(" AA" , " ." ) # "." means unknown
394
395
# some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
395
- ancestral = ancestral.split(" |" )[0 ]
396
- if ancestral == " ." :
396
+ ancestral = ancestral.split(" |" )[0 ].upper()
397
+ if ancestral == " ." or ancestral == " " :
397
398
# use the reference as ancestral, if unknown (NB: you may not want this)
398
- ancestral = variant.REF
399
+ ancestral = variant.REF .upper()
399
400
# Ancestral state must be first in the allele list.
400
401
ordered_alleles = [ancestral] + list (set (alleles) - {ancestral})
401
402
# Check we have ATCG alleles
402
- for allele in ordered_alleles:
403
- if len (set (allele) - allowed_allele_chars) > 0 :
404
- raise ValueError (
405
- " Site at pos {pos} : allele {allele} not in {allowed_allele_chars} "
406
- )
403
+ for a in ordered_alleles:
404
+ if len (set (a) - allele_chars) > 0 :
405
+ print (f " Ignoring site at pos { pos} : allele { a} not in { allele_chars} " )
406
+ continue
407
407
allele_index = {
408
- old_index: ordered_alleles.index(allele)
409
- for old_index, allele in enumerate (alleles)
408
+ old_index: ordered_alleles.index(a) for old_index, a in enumerate (alleles)
410
409
}
411
410
# Map original allele indexes to their indexes in the new alleles list.
412
411
genotypes = [
0 commit comments