|
3 | 3 | import xarray.testing as xt
|
4 | 4 | import pytest
|
5 | 5 | import sgkit as sg
|
6 |
| -import zarr |
7 | 6 |
|
8 | 7 | from bio2zarr import vcf
|
9 | 8 |
|
@@ -412,13 +411,257 @@ def test_call_PID(self, ds):
|
412 | 411 | assert call_PGT.shape == (23, 3)
|
413 | 412 |
|
414 | 413 |
|
| 414 | +class Test1000G2020AnnotationsExample: |
| 415 | + data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" |
| 416 | + |
| 417 | + @pytest.fixture(scope="class") |
| 418 | + def ds(self, tmp_path_factory): |
| 419 | + out = tmp_path_factory.mktemp("data") / "example.zarr" |
| 420 | + # TODO capture warnings from htslib here |
| 421 | + vcf.convert_vcf([self.data_path], out, worker_processes=0) |
| 422 | + return sg.load_dataset(out) |
| 423 | + |
| 424 | + def test_position(self, ds): |
| 425 | + # fmt: off |
| 426 | + pos = [ |
| 427 | + 60070, 60083, 60114, 60116, 60137, 60138, 60149, 60181, 60183, |
| 428 | + 60254, 60280, 60280, 60286, 60286, 60291, 60291, 60291, 60291, |
| 429 | + 60291, 60329, 60331 |
| 430 | + ] |
| 431 | + # fmt: on |
| 432 | + nt.assert_array_equal(ds.variant_position.values, pos) |
| 433 | + |
| 434 | + def test_alleles(self, ds): |
| 435 | + alleles = [ |
| 436 | + ["G", "A"], |
| 437 | + ["T", "C"], |
| 438 | + ["T", "C"], |
| 439 | + ["A", "G"], |
| 440 | + ["T", "C"], |
| 441 | + ["T", "A"], |
| 442 | + ["C", "T"], |
| 443 | + ["A", "G"], |
| 444 | + ["A", "G"], |
| 445 | + ["C", "A"], |
| 446 | + ["TTTCCA", "T"], |
| 447 | + ["T", "TTTCCA"], |
| 448 | + ["T", "G"], |
| 449 | + ["TTCCAG", "T"], |
| 450 | + ["G", "T"], |
| 451 | + ["G", "GTCCAT"], |
| 452 | + ["GTCCATTCCAT", "G"], |
| 453 | + ["GTCCAT", "G"], |
| 454 | + ["G", "GTCCATTCCAT"], |
| 455 | + ["C", "G"], |
| 456 | + ["T", "C"], |
| 457 | + ] |
| 458 | + nt.assert_array_equal(ds.variant_allele.values, alleles) |
| 459 | + |
| 460 | + def test_info_fields(self, ds): |
| 461 | + info_vars = [ |
| 462 | + "variant_1000Gp3_AA_GF", |
| 463 | + "variant_1000Gp3_AF", |
| 464 | + "variant_1000Gp3_HomC", |
| 465 | + "variant_1000Gp3_RA_GF", |
| 466 | + "variant_1000Gp3_RR_GF", |
| 467 | + "variant_AC", |
| 468 | + "variant_ACMG_GENE", |
| 469 | + "variant_ACMG_INHRT", |
| 470 | + "variant_ACMG_MIM_GENE", |
| 471 | + "variant_ACMG_PATH", |
| 472 | + "variant_AC_AFR", |
| 473 | + "variant_AC_AFR_unrel", |
| 474 | + "variant_AC_AMR", |
| 475 | + "variant_AC_AMR_unrel", |
| 476 | + "variant_AC_EAS", |
| 477 | + "variant_AC_EAS_unrel", |
| 478 | + "variant_AC_EUR", |
| 479 | + "variant_AC_EUR_unrel", |
| 480 | + "variant_AC_Het", |
| 481 | + "variant_AC_Het_AFR", |
| 482 | + "variant_AC_Het_AFR_unrel", |
| 483 | + "variant_AC_Het_AMR", |
| 484 | + "variant_AC_Het_AMR_unrel", |
| 485 | + "variant_AC_Het_EAS", |
| 486 | + "variant_AC_Het_EAS_unrel", |
| 487 | + "variant_AC_Het_EUR", |
| 488 | + "variant_AC_Het_EUR_unrel", |
| 489 | + "variant_AC_Het_SAS", |
| 490 | + "variant_AC_Het_SAS_unrel", |
| 491 | + "variant_AC_Hom", |
| 492 | + "variant_AC_Hom_AFR", |
| 493 | + "variant_AC_Hom_AFR_unrel", |
| 494 | + "variant_AC_Hom_AMR", |
| 495 | + "variant_AC_Hom_AMR_unrel", |
| 496 | + "variant_AC_Hom_EAS", |
| 497 | + "variant_AC_Hom_EAS_unrel", |
| 498 | + "variant_AC_Hom_EUR", |
| 499 | + "variant_AC_Hom_EUR_unrel", |
| 500 | + "variant_AC_Hom_SAS", |
| 501 | + "variant_AC_Hom_SAS_unrel", |
| 502 | + "variant_AC_SAS", |
| 503 | + "variant_AC_SAS_unrel", |
| 504 | + "variant_AF", |
| 505 | + "variant_AF_AFR", |
| 506 | + "variant_AF_AFR_unrel", |
| 507 | + "variant_AF_AMR", |
| 508 | + "variant_AF_AMR_unrel", |
| 509 | + "variant_AF_EAS", |
| 510 | + "variant_AF_EAS_unrel", |
| 511 | + "variant_AF_EUR", |
| 512 | + "variant_AF_EUR_unrel", |
| 513 | + "variant_AF_SAS", |
| 514 | + "variant_AF_SAS_unrel", |
| 515 | + "variant_AN", |
| 516 | + "variant_ANN", |
| 517 | + "variant_AN_AFR", |
| 518 | + "variant_AN_AFR_unrel", |
| 519 | + "variant_AN_AMR", |
| 520 | + "variant_AN_AMR_unrel", |
| 521 | + "variant_AN_EAS", |
| 522 | + "variant_AN_EAS_unrel", |
| 523 | + "variant_AN_EUR", |
| 524 | + "variant_AN_EUR_unrel", |
| 525 | + "variant_AN_SAS", |
| 526 | + "variant_AN_SAS_unrel", |
| 527 | + "variant_AR_GENE", |
| 528 | + "variant_BaseQRankSum", |
| 529 | + "variant_CADD_phred", |
| 530 | + "variant_CLNDBN", |
| 531 | + "variant_CLNDSDB", |
| 532 | + "variant_CLNDSDBID", |
| 533 | + "variant_CLNSIG", |
| 534 | + "variant_COSMIC_CNT", |
| 535 | + "variant_ClippingRankSum", |
| 536 | + "variant_DP", |
| 537 | + "variant_DS", |
| 538 | + "variant_END", |
| 539 | + "variant_Entrez_gene_id", |
| 540 | + "variant_Essential_gene", |
| 541 | + "variant_ExAC_AF", |
| 542 | + "variant_ExcHet", |
| 543 | + "variant_ExcHet_AFR", |
| 544 | + "variant_ExcHet_AMR", |
| 545 | + "variant_ExcHet_EAS", |
| 546 | + "variant_ExcHet_EUR", |
| 547 | + "variant_ExcHet_SAS", |
| 548 | + "variant_FS", |
| 549 | + "variant_GDI", |
| 550 | + "variant_GDI-Phred", |
| 551 | + "variant_GERP++_NR", |
| 552 | + "variant_GERP++_RS", |
| 553 | + "variant_HWE", |
| 554 | + "variant_HWE_AFR", |
| 555 | + "variant_HWE_AFR_unrel", |
| 556 | + "variant_HWE_AMR", |
| 557 | + "variant_HWE_AMR_unrel", |
| 558 | + "variant_HWE_EAS", |
| 559 | + "variant_HWE_EAS_unrel", |
| 560 | + "variant_HWE_EUR", |
| 561 | + "variant_HWE_EUR_unrel", |
| 562 | + "variant_HWE_SAS", |
| 563 | + "variant_HWE_SAS_unrel", |
| 564 | + "variant_HaplotypeScore", |
| 565 | + "variant_InbreedingCoeff", |
| 566 | + "variant_LOF", |
| 567 | + "variant_LoFtool_score", |
| 568 | + "variant_ME", |
| 569 | + "variant_MGI_mouse_gene", |
| 570 | + "variant_MLEAC", |
| 571 | + "variant_MLEAF", |
| 572 | + "variant_MQ", |
| 573 | + "variant_MQ0", |
| 574 | + "variant_MQRankSum", |
| 575 | + "variant_MutationAssessor_pred", |
| 576 | + "variant_MutationTaster_pred", |
| 577 | + "variant_NEGATIVE_TRAIN_SITE", |
| 578 | + "variant_NMD", |
| 579 | + "variant_POSITIVE_TRAIN_SITE", |
| 580 | + "variant_Pathway(BioCarta)_short", |
| 581 | + "variant_Polyphen2_HDIV_pred", |
| 582 | + "variant_Polyphen2_HVAR_pred", |
| 583 | + "variant_QD", |
| 584 | + "variant_RAW_MQ", |
| 585 | + "variant_RVIS", |
| 586 | + "variant_RVIS_percentile", |
| 587 | + "variant_ReadPosRankSum", |
| 588 | + "variant_Regulome_dbSNP141", |
| 589 | + "variant_SIFT_pred", |
| 590 | + "variant_SOR", |
| 591 | + "variant_Uniprot_aapos_Polyphen2", |
| 592 | + "variant_Uniprot_id_Polyphen2", |
| 593 | + "variant_VQSLOD", |
| 594 | + "variant_VariantType", |
| 595 | + "variant_ZFIN_zebrafish_gene", |
| 596 | + "variant_ZFIN_zebrafish_phenotype_tag", |
| 597 | + "variant_culprit", |
| 598 | + "variant_dbSNPBuildID", |
| 599 | + "variant_phastCons20way_mammalian", |
| 600 | + "variant_phyloP20way_mammalian", |
| 601 | + "variant_repeats", |
| 602 | + ] |
| 603 | + # Verified with bcftools view -H | grep INFO |
| 604 | + assert len(info_vars) == 140 |
| 605 | + standard_vars = [ |
| 606 | + "variant_filter", |
| 607 | + "variant_contig", |
| 608 | + "variant_position", |
| 609 | + "variant_allele", |
| 610 | + "variant_id", |
| 611 | + "variant_id_mask", |
| 612 | + "variant_quality", |
| 613 | + "contig_id", |
| 614 | + "contig_length", |
| 615 | + "filter_id", |
| 616 | + "sample_id", |
| 617 | + ] |
| 618 | + assert sorted(list(ds)) == sorted(info_vars + standard_vars) |
| 619 | + |
| 620 | + def test_variant_ANN(self, ds): |
| 621 | + print(repr(ds.variant_ANN.values)) |
| 622 | + variant_ANN = [ |
| 623 | + "A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60070G>A||||||", |
| 624 | + "C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60083T>C||||||", |
| 625 | + "C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60114T>C||||||", |
| 626 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60116A>G||||||", |
| 627 | + "C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60137T>C||||||", |
| 628 | + "A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60138T>A||||||", |
| 629 | + "T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60149C>T||||||", |
| 630 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60181A>G||||||", |
| 631 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60183A>G||||||", |
| 632 | + "A|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60254C>A||||||", |
| 633 | + "T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60281_60285delTTCCA||||||", |
| 634 | + "TTTCCA|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60280_60281insTTCCA||||||", |
| 635 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60286T>G||||||", |
| 636 | + "T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60287_60291delTCCAG||||||", |
| 637 | + "T|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291G>T||||||", |
| 638 | + "GTCCAT|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291_60292insTCCAT||||||", |
| 639 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60292_60301delTCCATTCCAT||||||", |
| 640 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60292_60296delTCCAT||||||", |
| 641 | + "GTCCATTCCAT|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60291_60292insTCCATTCCAT||||||", |
| 642 | + "G|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60329C>G||||||", |
| 643 | + "C|intergenic_region|MODIFIER|DEFB125|ENSG00000178591|intergenic_region|ENSG00000178591|||n.60331T>C||||||", |
| 644 | + ] |
| 645 | + nt.assert_array_equal(ds.variant_ANN.values, variant_ANN) |
| 646 | + |
| 647 | + |
| 648 | +class TestGeneratedFieldsExample: |
| 649 | + data_path = "tests/data/vcf/field_type_combos.vcf.gz" |
| 650 | + |
| 651 | + @pytest.fixture(scope="class") |
| 652 | + def ds(self, tmp_path_factory): |
| 653 | + out = tmp_path_factory.mktemp("data") / "vcf.zarr" |
| 654 | + vcf.convert_vcf([self.data_path], out) |
| 655 | + return sg.load_dataset(out) |
| 656 | + |
| 657 | + |
415 | 658 | @pytest.mark.parametrize(
|
416 | 659 | "name",
|
417 | 660 | [
|
418 | 661 | "sample.vcf.gz",
|
419 | 662 | "sample_no_genotypes.vcf.gz",
|
420 |
| - "info_field_type_combos.vcf.gz", |
421 | 663 | "1kg_2020_chrM.vcf.gz",
|
| 664 | + "field_type_combos.vcf.gz", |
422 | 665 | ],
|
423 | 666 | )
|
424 | 667 | def test_by_validating(name, tmp_path):
|
|
0 commit comments