Skip to content

Commit 1f63180

Browse files
hyanwongmergify[bot]
authored andcommitted
Allow VCF to read a 1000G file
This should allow reading of 1000G VCFs, printing warnings rather than raising errors when there are problems. I guess that's more useful for some people?
1 parent b87fbea commit 1f63180

File tree

1 file changed

+12
-13
lines changed

1 file changed

+12
-13
lines changed

docs/tutorial.rst

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -380,33 +380,32 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
380380
"""
381381
# You may want to change the following line, e.g. here we allow * (a spanning
382382
# deletion) to be a valid allele state
383-
allowed_allele_chars = set("ATGCatgc*")
383+
allele_chars = set("ATGCatgc*")
384384
pos = 0
385385
for variant in vcf: # Loop over variants, each assumed at a unique site
386386
if pos == variant.POS:
387-
raise ValueError("Duplicate positions for variant at position", pos)
387+
print(f"Duplicate entries at position {pos}, ignoring all but the first")
388+
continue
388389
else:
389390
pos = variant.POS
390391
if any([not phased for _, _, phased in variant.genotypes]):
391392
raise ValueError("Unphased genotypes for variant at position", pos)
392-
alleles = [variant.REF] + variant.ALT
393+
alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]
393394
ancestral = variant.INFO.get("AA", ".") # "." means unknown
394395
# some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
395-
ancestral = ancestral.split("|")[0]
396-
if ancestral == ".":
396+
ancestral = ancestral.split("|")[0].upper()
397+
if ancestral == "." or ancestral == "":
397398
# use the reference as ancestral, if unknown (NB: you may not want this)
398-
ancestral = variant.REF
399+
ancestral = variant.REF.upper()
399400
# Ancestral state must be first in the allele list.
400401
ordered_alleles = [ancestral] + list(set(alleles) - {ancestral})
401402
# Check we have ATCG alleles
402-
for allele in ordered_alleles:
403-
if len(set(allele) - allowed_allele_chars) > 0:
404-
raise ValueError(
405-
"Site at pos {pos}: allele {allele} not in {allowed_allele_chars}"
406-
)
403+
for a in ordered_alleles:
404+
if len(set(a) - allele_chars) > 0:
405+
print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}")
406+
continue
407407
allele_index = {
408-
old_index: ordered_alleles.index(allele)
409-
for old_index, allele in enumerate(alleles)
408+
old_index: ordered_alleles.index(a) for old_index, a in enumerate(alleles)
410409
}
411410
# Map original allele indexes to their indexes in the new alleles list.
412411
genotypes = [

0 commit comments

Comments
 (0)