Remove index creation as it is now done by bio2zarr

tomwhite · jeromekelleher · commit 97578d1f8d03 · 2024-11-28T15:44:46.000Z
diff --git a/tests/test_stats.py b/tests/test_stats.py
@@ -2,6 +2,7 @@
 from io import StringIO
 
 import pytest
+import zarr
 from bio2zarr import vcf2zarr
 
 from vcztools.stats import nrecords, stats
@@ -40,5 +41,9 @@ def test_stats__no_index(tmp_path):
     vcz = tmp_path.joinpath("intermediate.vcz")
     vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)
 
+    # delete the index created by vcf2zarr
+    root = zarr.open(vcz, mode="a")
+    del root["region_index"]
+
     with pytest.raises(ValueError, match="Could not load 'region_index' variable."):
         stats(vcz, StringIO())
diff --git a/tests/utils.py b/tests/utils.py
@@ -7,8 +7,6 @@
 import numpy as np
 from bio2zarr import vcf2zarr
 
-from vcztools.regions import create_index
-
 
 @contextmanager
 def open_vcf(path) -> Iterator[cyvcf2.VCF]:
@@ -152,5 +150,4 @@ def vcz_path_cache(vcf_path):
             vcf2zarr.convert(
                 [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
             )
-        create_index(cached_vcz_path)
     return cached_vcz_path
diff --git a/vcztools/cli.py b/vcztools/cli.py
@@ -3,8 +3,8 @@
 import click
 
 from . import query as query_module
-from . import regions, vcf_writer
 from . import stats as stats_module
+from . import vcf_writer
 
 include = click.option(
     "-i", "--include", type=str, help="Filter expression to include variant sites."
@@ -42,8 +42,6 @@ def index(path, nrecords, stats):
         stats_module.nrecords(path, sys.stdout)
     elif stats:
         stats_module.stats(path, sys.stdout)
-    else:
-        regions.create_index(path)
 
 
 @click.command
diff --git a/vcztools/regions.py b/vcztools/regions.py
@@ -1,57 +1,11 @@
 import re
 from typing import Any
 
-import numcodecs
 import numpy as np
 import pandas as pd
-import zarr
 from pyranges import PyRanges
 
 
-def create_index(vcz) -> None:
-    """Create an index to support efficient region queries."""
-
-    root = zarr.open(vcz, mode="r+")
-
-    contig = root["variant_contig"]
-    pos = root["variant_position"]
-    length = root["variant_length"]
-
-    assert contig.cdata_shape == pos.cdata_shape
-
-    index = []
-
-    for v_chunk in range(pos.cdata_shape[0]):
-        c = contig.blocks[v_chunk]
-        p = pos.blocks[v_chunk]
-        e = p + length.blocks[v_chunk] - 1
-
-        # create a row for each contig in the chunk
-        d = np.diff(c, append=-1)
-        c_start_idx = 0
-        for c_end_idx in np.nonzero(d)[0]:
-            assert c[c_start_idx] == c[c_end_idx]
-            index.append(
-                (
-                    v_chunk,  # chunk index
-                    c[c_start_idx],  # contig ID
-                    p[c_start_idx],  # start
-                    p[c_end_idx],  # end
-                    np.max(e[c_start_idx : c_end_idx + 1]),  # max end
-                    c_end_idx - c_start_idx + 1,  # num records
-                )
-            )
-            c_start_idx = c_end_idx + 1
-
-    index = np.array(index, dtype=np.int32)
-    root.array(
-        "region_index",
-        data=index,
-        compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-        overwrite=True,
-    )
-
-
 def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
     """Return the contig, start position and end position from a region string."""
     if re.search(r":\d+-\d*$", region):
diff --git a/vcztools/stats.py b/vcztools/stats.py
@@ -18,7 +18,7 @@ def stats(vcz, output):
     if "region_index" not in root:
         raise ValueError(
             "Could not load 'region_index' variable. "
-            "Use 'vcztools index' to create an index."
+            "Use 'vcz2zarr' to create an index."
         )
 
     with open_file_like(output) as output:

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ def stats(vcz, output):`
`18`	`18`	`if "region_index" not in root:`
`19`	`19`	`raise ValueError(`
`20`	`20`	`"Could not load 'region_index' variable. "`
`21`		`- "Use 'vcztools index' to create an index."`
	`21`	`+ "Use 'vcz2zarr' to create an index."`
`22`	`22`	`)`
`23`	`23`
`24`	`24`	`with open_file_like(output) as output:`