Pad out tests

jeromekelleher · jeromekelleher · commit 8c80f540e196 · 2024-02-28T15:54:23.000Z
diff --git a/bio2zarr/vcf_utils.py b/bio2zarr/vcf_utils.py
@@ -365,9 +365,11 @@ def read_tabix(
         )
 
 
-class IndexedVcf:
+class IndexedVcf(contextlib.AbstractContextManager):
     def __init__(self, vcf_path, index_path=None):
+        self.vcf = None
         vcf_path = pathlib.Path(vcf_path)
+        # TODO use constants here instead of strings
         if index_path is None:
             index_path = vcf_path.with_suffix(vcf_path.suffix + ".tbi")
             if not index_path.exists():
@@ -379,6 +381,7 @@ def __init__(self, vcf_path, index_path=None):
 
         self.vcf_path = vcf_path
         self.index_path = index_path
+        # TODO use Enums for these
         self.file_type = None
         self.index_type = None
         if index_path.suffix == ".csi":
@@ -387,7 +390,9 @@ def __init__(self, vcf_path, index_path=None):
             self.index_type = "tabix"
             self.file_type = "vcf"
         else:
-            raise ValueError("TODO")
+            raise ValueError("Only .tbi or .csi indexes are supported.")
+        self.vcf = cyvcf2.VCF(vcf_path)
+        self.vcf.set_index(str(self.index_path))
         self.sequence_names = None
         if self.index_type == "csi":
             # Determine the file-type based on the "aux" field.
@@ -403,12 +408,28 @@ def __init__(self, vcf_path, index_path=None):
             self.index = read_tabix(self.index_path)
             self.sequence_names = self.index.sequence_names
 
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.vcf is not None:
+            self.vcf.close()
+            self.vcf = None
+        return False
+
     def contig_record_counts(self):
         d = dict(zip(self.sequence_names, self.index.record_counts))
         if self.file_type == "bcf":
             d = {k: v for k, v in d.items() if v > 0}
         return d
 
+    def count_variants(self, region):
+        return sum(1 for _ in self.variants(region))
+
+    def variants(self, region):
+        # Queries also return indels that start before the region; drop those.
+        start = 1 if region.start is None else region.start
+        for var in self.vcf(str(region)):
+            if var.POS >= start:
+                yield var
+
     def partition_into_regions(
         self,
         num_parts: Optional[int] = None,
diff --git a/tests/test_vcf_utils.py b/tests/test_vcf_utils.py
@@ -11,121 +11,138 @@
 data_path = pathlib.Path("tests/data/vcf/")
 
 
-# values computed using bcftools index -s
-@pytest.mark.parametrize(
-    ["index_file", "expected"],
-    [
-        ("sample.vcf.gz.tbi", {"19": 2, "20": 6, "X": 1}),
-        ("sample.bcf.csi", {"19": 2, "20": 6, "X": 1}),
-        ("sample_no_genotypes.vcf.gz.csi", {"19": 2, "20": 6, "X": 1}),
-        ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", {"20": 3450, "21": 16460}),
-        ("CEUTrio.20.21.gatk3.4.g.bcf.csi", {"20": 3450, "21": 16460}),
-        ("1kg_2020_chrM.vcf.gz.tbi", {"chrM": 23}),
-        ("1kg_2020_chrM.vcf.gz.csi", {"chrM": 23}),
-        ("1kg_2020_chrM.bcf.csi", {"chrM": 23}),
-        ("1kg_2020_chr20_annotations.bcf.csi", {"chr20": 21}),
-        ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", {"20": 301778}),
-        ("multi_contig.vcf.gz.tbi", {str(j): 933 for j in range(5)}),
-    ],
-)
-def test_index_record_count(index_file, expected):
-    vcf_path = data_path / (".".join(list(index_file.split("."))[:-1]))
-    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
-    assert indexed_vcf.contig_record_counts() == expected
-
-
-@pytest.mark.parametrize(
-    ["index_file", "expected"],
-    [
-        ("sample.vcf.gz.tbi", ["19:1-", "20", "X"]),
-        ("sample.bcf.csi", ["19:1-", "20", "X"]),
-        ("sample_no_genotypes.vcf.gz.csi", ["19:1-", "20", "X"]),
-        ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", ["20:1-", "21"]),
-        ("CEUTrio.20.21.gatk3.4.g.bcf.csi", ["20:1-", "21"]),
-        ("1kg_2020_chrM.vcf.gz.tbi", ["chrM:1-"]),
-        ("1kg_2020_chrM.vcf.gz.csi", ["chrM:1-"]),
-        ("1kg_2020_chrM.bcf.csi", ["chrM:1-"]),
-        ("1kg_2020_chr20_annotations.bcf.csi", ["chr20:49153-"]),
-        ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", ["20:1-"]),
-        ("multi_contig.vcf.gz.tbi", ["0:1-"] + [str(j) for j in range(1, 5)]),
-    ],
-)
-def test_partition_into_one_part(index_file, expected):
-    vcf_path = data_path / (".".join(list(index_file.split("."))[:-1]))
-    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
-    regions = indexed_vcf.partition_into_regions(num_parts=1)
-    assert all(isinstance(r, vcf_utils.Region) for r in regions)
-    assert [str(r) for r in regions] == expected
-
-
-def test_tabix_multi_chrom_bug():
-    index_file = "multi_contig.vcf.gz.tbi"
-    vcf_path = data_path / (".".join(list(index_file.split("."))[:-1]))
-    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
-    regions = indexed_vcf.partition_into_regions(num_parts=10)
-    # An earlier version of the code returned this, i.e. with a duplicate
-    # for 4 with end coord of 0
-    # ["0:1-", "1", "2", "3", "4:1-0", "4:1-"]
-    expected = ["0:1-", "1", "2", "3", "4:1-"]
-    assert [str(r) for r in regions] == expected
-
-
-@pytest.mark.skip("TODO")
-class TestCsiIndex:
+def assert_part_counts_non_zero(part_counts, index_file):
+    # Tabix indexes may yield one trailing partition with a zero count.
+    # We should probably get rid of it, but it is likely harmless. See
+    # https://github.com/jeromekelleher/bio2zarr/issues/45
+    if index_file.endswith(".tbi"):
+        assert np.all(part_counts[:-1] > 0)
+    else:
+        assert np.all(part_counts > 0)
+
+
+class TestIndexedVcf:
+    def get_instance(self, index_file):
+        vcf_path = data_path / (".".join(list(index_file.split("."))[:-1]))
+        return vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
+
+    def test_context_manager_success(self):
+        # Nominal case
+        with vcf_utils.IndexedVcf(data_path / "sample.bcf") as iv:
+            assert iv.vcf is not None
+        assert iv.vcf is None
+
+    def test_context_manager_error(self):
+        with pytest.raises(ValueError, match="Cannot find"):
+            with vcf_utils.IndexedVcf(data_path / "no-such-file.bcf"):
+                pass
+
+    # values computed using bcftools index -s
     @pytest.mark.parametrize(
-        "filename",
-        ["CEUTrio.20.21.gatk3.4.g.vcf.bgz", "CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi"],
+        ["index_file", "expected"],
+        [
+            ("sample.vcf.gz.tbi", {"19": 2, "20": 6, "X": 1}),
+            ("sample.bcf.csi", {"19": 2, "20": 6, "X": 1}),
+            ("sample_no_genotypes.vcf.gz.csi", {"19": 2, "20": 6, "X": 1}),
+            ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", {"20": 3450, "21": 16460}),
+            ("CEUTrio.20.21.gatk3.4.g.bcf.csi", {"20": 3450, "21": 16460}),
+            ("1kg_2020_chrM.vcf.gz.tbi", {"chrM": 23}),
+            ("1kg_2020_chrM.vcf.gz.csi", {"chrM": 23}),
+            ("1kg_2020_chrM.bcf.csi", {"chrM": 23}),
+            ("1kg_2020_chr20_annotations.bcf.csi", {"chr20": 21}),
+            ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", {"20": 301778}),
+            ("multi_contig.vcf.gz.tbi", {str(j): 933 for j in range(5)}),
+        ],
     )
-    def test_invalid_csi(self, filename):
-        with pytest.raises(ValueError, match=r"File not in CSI format."):
-            read_csi(data_path / filename)
+    def test_contig_record_counts(self, index_file, expected):
+        indexed_vcf = self.get_instance(index_file)
+        assert indexed_vcf.contig_record_counts() == expected
 
-
-@pytest.mark.skip("TODO")
-class TestTabixIndex:
     @pytest.mark.parametrize(
-        "filename",
+        ["index_file", "expected"],
         [
-            "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
-            "CEUTrio.20.21.gatk3.4.g.bcf.csi",
+            ("sample.vcf.gz.tbi", ["19:1-", "20", "X"]),
+            ("sample.bcf.csi", ["19:1-", "20", "X"]),
+            ("sample_no_genotypes.vcf.gz.csi", ["19:1-", "20", "X"]),
+            ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", ["20:1-", "21"]),
+            ("CEUTrio.20.21.gatk3.4.g.bcf.csi", ["20:1-", "21"]),
+            ("1kg_2020_chrM.vcf.gz.tbi", ["chrM:1-"]),
+            ("1kg_2020_chrM.vcf.gz.csi", ["chrM:1-"]),
+            ("1kg_2020_chrM.bcf.csi", ["chrM:1-"]),
+            ("1kg_2020_chr20_annotations.bcf.csi", ["chr20:49153-"]),
+            ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", ["20:1-"]),
+            ("multi_contig.vcf.gz.tbi", ["0:1-"] + [str(j) for j in range(1, 5)]),
         ],
     )
-    def test_invalid_tbi(self, filename):
-        with pytest.raises(ValueError, match=r"File not in Tabix format."):
-            read_tabix(data_path / filename)
+    def test_partition_into_one_part(self, index_file, expected):
+        indexed_vcf = self.get_instance(index_file)
+        regions = indexed_vcf.partition_into_regions(num_parts=1)
+        assert all(isinstance(r, vcf_utils.Region) for r in regions)
+        assert [str(r) for r in regions] == expected
 
-
-@pytest.mark.skip("TODO")
-class TestPartitionIntoRegions:
     @pytest.mark.parametrize(
-        "vcf_file",
+        ["index_file", "num_expected", "total_records"],
         [
-            "CEUTrio.20.21.gatk3.4.g.bcf",
-            "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
-            "NA12878.prod.chr20snippet.g.vcf.gz",
+            ("sample.vcf.gz.tbi", 3, 9),
+            ("sample.bcf.csi", 3, 9),
+            ("sample_no_genotypes.vcf.gz.csi", 3, 9),
+            ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", 18, 19910),
+            ("CEUTrio.20.21.gatk3.4.g.bcf.csi", 3, 19910),
+            ("1kg_2020_chrM.vcf.gz.tbi", 1, 23),
+            ("1kg_2020_chrM.vcf.gz.csi", 1, 23),
+            ("1kg_2020_chrM.bcf.csi", 1, 23),
+            ("1kg_2020_chr20_annotations.bcf.csi", 1, 21),
+            ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", 59, 301778),
+            ("multi_contig.vcf.gz.tbi", 5, 5 * 933),
         ],
     )
-    def test_num_parts(self, vcf_file):
-        vcf_path = data_path / vcf_file
-        regions = partition_into_regions(vcf_path, num_parts=4)
-
-        assert regions is not None
-        part_variant_counts = [count_variants(vcf_path, region) for region in regions]
-        total_variants = count_variants(vcf_path)
-
-        assert sum(part_variant_counts) == total_variants
-
-    def test_num_parts_large(self):
-        vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
-
-        regions = partition_into_regions(vcf_path, num_parts=100)
-        assert regions is not None
-        assert len(regions) == 18
-
-        part_variant_counts = [count_variants(vcf_path, region) for region in regions]
-        total_variants = count_variants(vcf_path)
+    def test_partition_into_max_parts(self, index_file, num_expected, total_records):
+        indexed_vcf = self.get_instance(index_file)
+        regions = indexed_vcf.partition_into_regions(num_parts=1000)
+        assert all(isinstance(r, vcf_utils.Region) for r in regions)
+        # print(regions)
+        assert len(regions) == num_expected
+        part_variant_counts = np.array(
+            [indexed_vcf.count_variants(region) for region in regions]
+        )
+        assert np.sum(part_variant_counts) == total_records
+        assert_part_counts_non_zero(part_variant_counts, index_file)
 
-        assert sum(part_variant_counts) == total_variants
+    @pytest.mark.parametrize(
+        ["index_file", "total_records"],
+        [
+            ("sample.vcf.gz.tbi", 9),
+            ("sample.bcf.csi", 9),
+            ("sample_no_genotypes.vcf.gz.csi", 9),
+            ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", 19910),
+            ("CEUTrio.20.21.gatk3.4.g.bcf.csi", 19910),
+            ("1kg_2020_chrM.vcf.gz.tbi", 23),
+            ("1kg_2020_chrM.vcf.gz.csi", 23),
+            ("1kg_2020_chrM.bcf.csi", 23),
+            ("1kg_2020_chr20_annotations.bcf.csi", 21),
+            ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", 301778),
+            ("multi_contig.vcf.gz.tbi", 5 * 933),
+        ],
+    )
+    @pytest.mark.parametrize("num_parts", [2, 3, 4, 5, 16, 33])
+    def test_partition_into_n_parts(self, index_file, total_records, num_parts):
+        indexed_vcf = self.get_instance(index_file)
+        regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
+        assert all(isinstance(r, vcf_utils.Region) for r in regions)
+        part_variant_counts = np.array(
+            [indexed_vcf.count_variants(region) for region in regions]
+        )
+        assert np.sum(part_variant_counts) == total_records
+        assert_part_counts_non_zero(part_variant_counts, index_file)
+
+    def test_tabix_multi_chrom_bug(self):
+        indexed_vcf = self.get_instance("multi_contig.vcf.gz.tbi")
+        regions = indexed_vcf.partition_into_regions(num_parts=10)
+        # An earlier version of the code returned this, i.e. with a duplicate
+        # for 4 with end coord of 0
+        # ["0:1-", "1", "2", "3", "4:1-0", "4:1-"]
+        expected = ["0:1-", "1", "2", "3", "4:1-"]
+        assert [str(r) for r in regions] == expected
 
     @pytest.mark.parametrize(
         "target_part_size",
@@ -136,48 +153,60 @@ def test_num_parts_large(self):
         ],
     )
     def test_target_part_size(self, target_part_size):
-        vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
-
-        regions = partition_into_regions(vcf_path, target_part_size=target_part_size)
-        assert regions is not None
+        indexed_vcf = self.get_instance("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi")
+        regions = indexed_vcf.partition_into_regions(target_part_size=target_part_size)
         assert len(regions) == 5
-
-        part_variant_counts = [count_variants(vcf_path, region) for region in regions]
+        part_variant_counts = [indexed_vcf.count_variants(region) for region in regions]
         assert part_variant_counts == [3450, 3869, 4525, 7041, 1025]
-        total_variants = count_variants(vcf_path)
+        assert sum(part_variant_counts) == 19910
 
-        assert sum(part_variant_counts) == total_variants
-
-    def test_invalid_arguments(self):
-        vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
+    def test_partition_invalid_arguments(self):
+        indexed_vcf = self.get_instance("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi")
 
         with pytest.raises(
             ValueError, match=r"One of num_parts or target_part_size must be specified"
         ):
-            partition_into_regions(vcf_path)
+            indexed_vcf.partition_into_regions()
 
         with pytest.raises(
             ValueError,
             match=r"Only one of num_parts or target_part_size may be specified",
         ):
-            partition_into_regions(vcf_path, num_parts=4, target_part_size=100_000)
+            indexed_vcf.partition_into_regions(num_parts=4, target_part_size=100_000)
 
         with pytest.raises(ValueError, match=r"num_parts must be positive"):
-            partition_into_regions(vcf_path, num_parts=0)
+            indexed_vcf.partition_into_regions(num_parts=0)
 
         with pytest.raises(ValueError, match=r"target_part_size must be positive"):
-            partition_into_regions(vcf_path, target_part_size=0)
-
-    @pytest.mark.skip("TODO")
-    def test_missing_index(self, temp_path):
-        vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
-        with pytest.raises(ValueError, match=r"Cannot find .tbi or .csi file."):
-            partition_into_regions(vcf_path, num_parts=2)
+            indexed_vcf.partition_into_regions(target_part_size=0)
 
-        bogus_index_path = path_for_test(
-            shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz.index", True
-        )
+    def test_bad_index(self):
         with pytest.raises(
             ValueError, match=r"Only .tbi or .csi indexes are supported."
         ):
-            partition_into_regions(vcf_path, index_path=bogus_index_path, num_parts=2)
+            # We don't actually go out to the filesystem before checking, so
+            # the paths can be anything
+            vcf_utils.IndexedVcf("x", "y")
+
+
+class TestCsiIndex:
+    @pytest.mark.parametrize(
+        "filename",
+        ["CEUTrio.20.21.gatk3.4.g.vcf.bgz", "CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi"],
+    )
+    def test_invalid_csi(self, filename):
+        with pytest.raises(ValueError, match=r"File not in CSI format."):
+            vcf_utils.read_csi(data_path / filename)
+
+
+class TestTabixIndex:
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
+            "CEUTrio.20.21.gatk3.4.g.bcf.csi",
+        ],
+    )
+    def test_invalid_tbi(self, filename):
+        with pytest.raises(ValueError, match=r"File not in Tabix format."):
+            vcf_utils.read_tabix(data_path / filename)
diff --git a/tests/utils.py b/tests/utils.py
@@ -52,6 +52,12 @@ def get_region_start(region: str) -> int:
 def count_variants(path: PathType, region: Optional[str] = None) -> int:
-    """Count the number of variants in a VCF file."""
+    """Count the number of variants in a VCF file.
+
+    If ``region`` is None the whole file is iterated without a region query;
+    otherwise ``region`` is converted with ``str()`` and used to query and
+    filter the records.
+    """
     with open_vcf(path) as vcf:
-        if region is not None:
-            vcf = vcf(region)
-        return sum(1 for _ in region_filter(vcf, region))
+        if region is None:
+            # str(None) is "None", which would be passed on as a bogus region.
+            return sum(1 for _ in region_filter(vcf, None))
+        region = str(region)
+        return sum(1 for _ in region_filter(vcf(region), region))