Skip to content

Commit 269d8d1

Browse files
hyanwongmergify[bot]
authored andcommitted
Add in extra sanity checks in VCF reading code example
1 parent 2a06def commit 269d8d1

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

docs/tutorial.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,9 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
378378
Read the sites in the vcf and add them to the samples object, reordering the
379379
alleles to put the ancestral allele first, if it is available.
380380
"""
381+
# You may want to change the following line, e.g. here we allow * (a spanning
382+
# deletion) to be a valid allele state
383+
allowed_allele_chars = set("ATGCatgc*")
381384
pos = 0
382385
for variant in vcf: # Loop over variants, each assumed at a unique site
383386
if pos == variant.POS:
@@ -387,9 +390,20 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
387390
if any([not phased for _, _, phased in variant.genotypes]):
388391
raise ValueError("Unphased genotypes for variant at position", pos)
389392
alleles = [variant.REF] + variant.ALT
390-
ancestral = variant.INFO.get("AA", variant.REF)
393+
ancestral = variant.INFO.get("AA", ".") # "." means unknown
394+
# some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
395+
ancestral = ancestral.split("|")[0]
396+
if ancestral == ".":
397+
# use the reference as ancestral, if unknown (NB: you may not want this)
398+
ancestral = variant.REF
391399
# Ancestral state must be first in the allele list.
392400
ordered_alleles = [ancestral] + list(set(alleles) - {ancestral})
401+
# Check we have ATCG alleles
402+
for allele in ordered_alleles:
403+
if len(set(allele) - allowed_allele_chars) > 0:
404+
raise ValueError(
405+
"Site at pos {pos}: allele {allele} not in {allowed_allele_chars}"
406+
)
393407
allele_index = {
394408
old_index: ordered_alleles.index(allele)
395409
for old_index, allele in enumerate(alleles)

0 commit comments

Comments
 (0)