Skip to content

Commit 0eb2471

Browse files
Add annotations example
1 parent f25dae0 commit 0eb2471

File tree

3 files changed

+245
-2
lines changed

3 files changed

+245
-2
lines changed
8.45 KB
Binary file not shown.
101 Bytes
Binary file not shown.

tests/test_vcf_examples.py

Lines changed: 245 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import xarray.testing as xt
44
import pytest
55
import sgkit as sg
6-
import zarr
76

87
from bio2zarr import vcf
98

@@ -412,13 +411,257 @@ def test_call_PID(self, ds):
412411
assert call_PGT.shape == (23, 3)
413412

414413

414+
class Test1000G2020AnnotationsExample:
415+
data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf"
416+
417+
@pytest.fixture(scope="class")
418+
def ds(self, tmp_path_factory):
419+
out = tmp_path_factory.mktemp("data") / "example.zarr"
420+
# TODO capture warnings from htslib here
421+
vcf.convert_vcf([self.data_path], out, worker_processes=0)
422+
return sg.load_dataset(out)
423+
424+
def test_position(self, ds):
425+
# fmt: off
426+
pos = [
427+
60070, 60083, 60114, 60116, 60137, 60138, 60149, 60181, 60183,
428+
60254, 60280, 60280, 60286, 60286, 60291, 60291, 60291, 60291,
429+
60291, 60329, 60331
430+
]
431+
# fmt: on
432+
nt.assert_array_equal(ds.variant_position.values, pos)
433+
434+
def test_alleles(self, ds):
435+
alleles = [
436+
["G", "A"],
437+
["T", "C"],
438+
["T", "C"],
439+
["A", "G"],
440+
["T", "C"],
441+
["T", "A"],
442+
["C", "T"],
443+
["A", "G"],
444+
["A", "G"],
445+
["C", "A"],
446+
["TTTCCA", "T"],
447+
["T", "TTTCCA"],
448+
["T", "G"],
449+
["TTCCAG", "T"],
450+
["G", "T"],
451+
["G", "GTCCAT"],
452+
["GTCCATTCCAT", "G"],
453+
["GTCCAT", "G"],
454+
["G", "GTCCATTCCAT"],
455+
["C", "G"],
456+
["T", "C"],
457+
]
458+
nt.assert_array_equal(ds.variant_allele.values, alleles)
459+
460+
def test_info_fields(self, ds):
461+
info_vars = [
462+
"variant_1000Gp3_AA_GF",
463+
"variant_1000Gp3_AF",
464+
"variant_1000Gp3_HomC",
465+
"variant_1000Gp3_RA_GF",
466+
"variant_1000Gp3_RR_GF",
467+
"variant_AC",
468+
"variant_ACMG_GENE",
469+
"variant_ACMG_INHRT",
470+
"variant_ACMG_MIM_GENE",
471+
"variant_ACMG_PATH",
472+
"variant_AC_AFR",
473+
"variant_AC_AFR_unrel",
474+
"variant_AC_AMR",
475+
"variant_AC_AMR_unrel",
476+
"variant_AC_EAS",
477+
"variant_AC_EAS_unrel",
478+
"variant_AC_EUR",
479+
"variant_AC_EUR_unrel",
480+
"variant_AC_Het",
481+
"variant_AC_Het_AFR",
482+
"variant_AC_Het_AFR_unrel",
483+
"variant_AC_Het_AMR",
484+
"variant_AC_Het_AMR_unrel",
485+
"variant_AC_Het_EAS",
486+
"variant_AC_Het_EAS_unrel",
487+
"variant_AC_Het_EUR",
488+
"variant_AC_Het_EUR_unrel",
489+
"variant_AC_Het_SAS",
490+
"variant_AC_Het_SAS_unrel",
491+
"variant_AC_Hom",
492+
"variant_AC_Hom_AFR",
493+
"variant_AC_Hom_AFR_unrel",
494+
"variant_AC_Hom_AMR",
495+
"variant_AC_Hom_AMR_unrel",
496+
"variant_AC_Hom_EAS",
497+
"variant_AC_Hom_EAS_unrel",
498+
"variant_AC_Hom_EUR",
499+
"variant_AC_Hom_EUR_unrel",
500+
"variant_AC_Hom_SAS",
501+
"variant_AC_Hom_SAS_unrel",
502+
"variant_AC_SAS",
503+
"variant_AC_SAS_unrel",
504+
"variant_AF",
505+
"variant_AF_AFR",
506+
"variant_AF_AFR_unrel",
507+
"variant_AF_AMR",
508+
"variant_AF_AMR_unrel",
509+
"variant_AF_EAS",
510+
"variant_AF_EAS_unrel",
511+
"variant_AF_EUR",
512+
"variant_AF_EUR_unrel",
513+
"variant_AF_SAS",
514+
"variant_AF_SAS_unrel",
515+
"variant_AN",
516+
"variant_ANN",
517+
"variant_AN_AFR",
518+
"variant_AN_AFR_unrel",
519+
"variant_AN_AMR",
520+
"variant_AN_AMR_unrel",
521+
"variant_AN_EAS",
522+
"variant_AN_EAS_unrel",
523+
"variant_AN_EUR",
524+
"variant_AN_EUR_unrel",
525+
"variant_AN_SAS",
526+
"variant_AN_SAS_unrel",
527+
"variant_AR_GENE",
528+
"variant_BaseQRankSum",
529+
"variant_CADD_phred",
530+
"variant_CLNDBN",
531+
"variant_CLNDSDB",
532+
"variant_CLNDSDBID",
533+
"variant_CLNSIG",
534+
"variant_COSMIC_CNT",
535+
"variant_ClippingRankSum",
536+
"variant_DP",
537+
"variant_DS",
538+
"variant_END",
539+
"variant_Entrez_gene_id",
540+
"variant_Essential_gene",
541+
"variant_ExAC_AF",
542+
"variant_ExcHet",
543+
"variant_ExcHet_AFR",
544+
"variant_ExcHet_AMR",
545+
"variant_ExcHet_EAS",
546+
"variant_ExcHet_EUR",
547+
"variant_ExcHet_SAS",
548+
"variant_FS",
549+
"variant_GDI",
550+
"variant_GDI-Phred",
551+
"variant_GERP++_NR",
552+
"variant_GERP++_RS",
553+
"variant_HWE",
554+
"variant_HWE_AFR",
555+
"variant_HWE_AFR_unrel",
556+
"variant_HWE_AMR",
557+
"variant_HWE_AMR_unrel",
558+
"variant_HWE_EAS",
559+
"variant_HWE_EAS_unrel",
560+
"variant_HWE_EUR",
561+
"variant_HWE_EUR_unrel",
562+
"variant_HWE_SAS",
563+
"variant_HWE_SAS_unrel",
564+
"variant_HaplotypeScore",
565+
"variant_InbreedingCoeff",
566+
"variant_LOF",
567+
"variant_LoFtool_score",
568+
"variant_ME",
569+
"variant_MGI_mouse_gene",
570+
"variant_MLEAC",
571+
"variant_MLEAF",
572+
"variant_MQ",
573+
"variant_MQ0",
574+
"variant_MQRankSum",
575+
"variant_MutationAssessor_pred",
576+
"variant_MutationTaster_pred",
577+
"variant_NEGATIVE_TRAIN_SITE",
578+
"variant_NMD",
579+
"variant_POSITIVE_TRAIN_SITE",
580+
"variant_Pathway(BioCarta)_short",
581+
"variant_Polyphen2_HDIV_pred",
582+
"variant_Polyphen2_HVAR_pred",
583+
"variant_QD",
584+
"variant_RAW_MQ",
585+
"variant_RVIS",
586+
"variant_RVIS_percentile",
587+
"variant_ReadPosRankSum",
588+
"variant_Regulome_dbSNP141",
589+
"variant_SIFT_pred",
590+
"variant_SOR",
591+
"variant_Uniprot_aapos_Polyphen2",
592+
"variant_Uniprot_id_Polyphen2",
593+
"variant_VQSLOD",
594+
"variant_VariantType",
595+
"variant_ZFIN_zebrafish_gene",
596+
"variant_ZFIN_zebrafish_phenotype_tag",
597+
"variant_culprit",
598+
"variant_dbSNPBuildID",
599+
"variant_phastCons20way_mammalian",
600+
"variant_phyloP20way_mammalian",
601+
"variant_repeats",
602+
]
603+
# Verified with bcftools view -H | grep INFO
604+
assert len(info_vars) == 140
605+
standard_vars = [
606+
"variant_filter",
607+
"variant_contig",
608+
"variant_position",
609+
"variant_allele",
610+
"variant_id",
611+
"variant_id_mask",
612+
"variant_quality",
613+
"contig_id",
614+
"contig_length",
615+
"filter_id",
616+
"sample_id",
617+
]
618+
assert sorted(list(ds)) == sorted(info_vars + standard_vars)
619+
620+
def test_variant_ANN(self, ds):
621+
print(repr(ds.variant_ANN.values))
622+
variant_ANN = [
623+
"A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60070G>A||||||",
624+
"C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60083T>C||||||",
625+
"C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60114T>C||||||",
626+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60116A>G||||||",
627+
"C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60137T>C||||||",
628+
"A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60138T>A||||||",
629+
"T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60149C>T||||||",
630+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60181A>G||||||",
631+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60183A>G||||||",
632+
"A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60254C>A||||||",
633+
"T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60281_60285delTTCCA||||||",
634+
"TTTCCA|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60280_60281insTTCCA||||||",
635+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60286T>G||||||",
636+
"T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60287_60291delTCCAG||||||",
637+
"T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291G>T||||||",
638+
"GTCCAT|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291_60292insTCCAT||||||",
639+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60292_60301delTCCATTCCAT||||||",
640+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60292_60296delTCCAT||||||",
641+
"GTCCATTCCAT|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291_60292insTCCATTCCAT||||||",
642+
"G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60329C>G||||||",
643+
"C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60331T>C||||||",
644+
]
645+
nt.assert_array_equal(ds.variant_ANN.values, variant_ANN)
646+
647+
648+
class TestGeneratedFieldsExample:
649+
data_path = "tests/data/vcf/field_type_combos.vcf.gz"
650+
651+
@pytest.fixture(scope="class")
652+
def ds(self, tmp_path_factory):
653+
out = tmp_path_factory.mktemp("data") / "vcf.zarr"
654+
vcf.convert_vcf([self.data_path], out)
655+
return sg.load_dataset(out)
656+
657+
415658
@pytest.mark.parametrize(
416659
"name",
417660
[
418661
"sample.vcf.gz",
419662
"sample_no_genotypes.vcf.gz",
420-
"info_field_type_combos.vcf.gz",
421663
"1kg_2020_chrM.vcf.gz",
664+
"field_type_combos.vcf.gz",
422665
],
423666
)
424667
def test_by_validating(name, tmp_path):

0 commit comments

Comments
 (0)