sgkit-dev · jeromekelleher · May 23, 2025 · May 23, 2025 · May 23, 2025 · May 23, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -69,6 +69,12 @@ jobs:
           python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
           python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
           python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
+      - name: Run tskit2zarr example
+        run: |
+          python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
+      - name: Run plink2zarr example
+        run: |
+          python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
       - name: Run tests
         run: |
           pytest --cov=bio2zarr
@@ -137,6 +143,14 @@ jobs:
         run: |
           vcfpartition --help
           python -m bio2zarr vcfpartition --help
+      - name: Check tskit2zarr CLI
+        run: |
+          tskit2zarr --help
+          python -m bio2zarr tskit2zarr --help
+      - name: Check plink2zarr CLI
+        run: |
+          plink2zarr --help
+          python -m bio2zarr plink2zarr --help
 
   test-numpy-version:
     name: Test numpy versions

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,13 +1,21 @@
-# 0.1.6 2025-0X-XX
+# 0.1.6 2025-05-23
+
+- Initial Python API support for VCF and tskit one-shot conversion. Format
+conversion is done using the functions ``bio2zarr.vcf.convert``
+and ``bio2zarr.tskit.convert``.
 
 - Initial version of supported plink2zarr (#390, #344, #382)
 
+- Initial version of tskit2zarr (#232)
+
 - Make format-specific dependencies optional (#385)
 
+- Remove bed_reader dependency (#397, #400)
+
 - Change default number of worker processes to zero (#404) to simplify
   debugging
 
-Breaking changes
+*Breaking changes*
 
 - Remove explicit sample, contig and filter lists from the schema.
   Existing ICFs will need to be recreated. (#343)

diff --git a/bio2zarr/__main__.py b/bio2zarr/__main__.py
@@ -15,9 +15,9 @@ def bio2zarr():
 # is handy for development and for those whose PATHs aren't set
 # up in the right way.
 bio2zarr.add_command(cli.vcf2zarr_main)
-bio2zarr.add_command(cli.plink2zarr)
+bio2zarr.add_command(cli.plink2zarr_main)
+bio2zarr.add_command(cli.tskit2zarr_main)
 bio2zarr.add_command(cli.vcfpartition)
-bio2zarr.add_command(cli.tskit2zarr)
 
 if __name__ == "__main__":
     bio2zarr()
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -531,6 +531,7 @@ def vcf2zarr_main():
     Convert VCF file(s) to VCF Zarr format.
 
     See the online documentation at https://sgkit-dev.github.io/bio2zarr/
+
     for more information.
     """
 
@@ -551,6 +552,7 @@ def vcf2zarr_main():
 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
+@force
 @worker_processes
 @progress
 @verbose
@@ -559,6 +561,7 @@ def vcf2zarr_main():
 def convert_plink(
     in_path,
     zarr_path,
+    force,
     verbose,
     worker_processes,
     progress,
@@ -571,6 +574,7 @@ def convert_plink(
     then running `vcf2zarr convert tmp.vcf zarr_path`
     """
     setup_logging(verbose)
+    check_overwrite_dir(zarr_path, force)
     plink.convert(
         in_path,
         zarr_path,
@@ -582,15 +586,15 @@ def convert_plink(
 
 
 @version
-@click.group()
-def plink2zarr():
+@click.group(name="plink2zarr")
+def plink2zarr_main():
     """
     Convert plink fileset(s) to VCF Zarr format
     """
     pass
 
 
-plink2zarr.add_command(convert_plink)
+plink2zarr_main.add_command(convert_plink)
 
 
 @click.command
@@ -684,12 +688,12 @@ def convert_tskit(
 
 
 @version
-@click.group()
-def tskit2zarr():
+@click.group(name="tskit2zarr")
+def tskit2zarr_main():
     """
     Convert tskit tree sequence(s) to VCF Zarr format
     """
     pass
 
 
-tskit2zarr.add_command(convert_tskit)
+tskit2zarr_main.add_command(convert_tskit)
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -33,17 +33,19 @@
 BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
 
 
-def read_fam(path, sep=None):
+# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
+# on the parameters to Pandas here.
+def read_fam(path):
     # See: https://www.cog-genomics.org/plink/1.9/formats#fam
     names = [f[0] for f in FAM_FIELDS]
-    df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
     return df
 
 
-def read_bim(path, sep=None):
+def read_bim(path):
     # See: https://www.cog-genomics.org/plink/1.9/formats#bim
     names = [f[0] for f in BIM_FIELDS]
-    df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
     return df
 
 
@@ -94,6 +96,18 @@ def __init__(self, path, num_variants, num_samples):
 
         self.byte_lookup = lookup
 
+    def iter_decode(self, start, stop, buffer_size=None):
+        """
+        Iterate of over the variants in the specified window
+        with the specified approximate buffer size in bytes (default=10MiB).
+        """
+        if buffer_size is None:
+            buffer_size = 10 * 1024 * 1024
+        variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
+        for off in range(start, stop, variants_per_read):
+            genotypes = self.decode(off, min(off + variants_per_read, stop))
+            yield from genotypes
+
     def decode(self, start, stop):
         chunk_size = stop - start
 
@@ -102,6 +116,11 @@ def decode(self, start, stop):
         start_offset = 3 + (start * self.bytes_per_variant)
         bytes_to_read = chunk_size * self.bytes_per_variant
 
+        logger.debug(
+            f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
+            f"from {self.path}"
+        )
+
         # TODO make it possible to read sequentially from the same file handle,
         # seeking only when necessary.
         with open(self.path, "rb") as f:
@@ -175,19 +194,16 @@ def iter_id(self, start, stop):
         yield from self.bim.variant_id[start:stop]
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        alt_field = self.bim.allele_1.values
-        ref_field = self.bim.allele_2.values
-        gt = self.bed_reader.decode(start, stop)
-        phased = np.zeros(gt.shape[:2], dtype=bool)
-        for i, (ref, alt) in enumerate(
-            zip(ref_field[start:stop], alt_field[start:stop])
-        ):
+        alt_iter = self.bim.allele_1.values[start:stop]
+        ref_iter = self.bim.allele_2.values[start:stop]
+        gt_iter = self.bed_reader.iter_decode(start, stop)
+        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
             alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
-
+            phased = np.zeros(gt.shape[0], dtype=bool)
             # rlen is the length of the REF in PLINK as there's no END annotations
-            yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i])
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
 
     def generate_schema(
         self,

diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1608,7 +1608,7 @@ def mkschema(
 
 def convert(
     vcfs,
-    out_path,
+    vcz_path,
     *,
     variants_chunk_size=None,
     samples_chunk_size=None,
@@ -1617,6 +1617,12 @@ def convert(
     show_progress=False,
     icf_path=None,
 ):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
     if icf_path is None:
         cm = temp_icf_path(prefix="vcf2zarr")
     else:
@@ -1631,7 +1637,7 @@ def convert(
         )
         encode(
             icf_path,
-            out_path,
+            vcz_path,
             variants_chunk_size=variants_chunk_size,
             samples_chunk_size=samples_chunk_size,
             worker_processes=worker_processes,

diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -842,6 +842,7 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
                 partition_index, "call_genotype_phased"
             )
             shape = gt.buff.shape[1:]
+
         for variant_data in self.source.iter_alleles_and_genotypes(
             partition.start, partition.stop, shape, alleles.array.shape[1]
         ):

diff --git a/docs/_toc.yml b/docs/_toc.yml
@@ -6,6 +6,7 @@ chapters:
   sections:
   - file: vcf2zarr/tutorial
   - file: vcf2zarr/cli_ref
+  - file: vcf2zarr/python_api
 - file: plink2zarr/overview
   sections:
   - file: plink2zarr/cli_ref

diff --git a/docs/intro.md b/docs/intro.md
@@ -11,6 +11,9 @@
 - {ref}`sec-plink2zarr` converts PLINK 1.0 data to
   [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.
 
+- {ref}`sec-tskit2zarr` converts [tskit](https://tskit.dev)
+  data into [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format.
+
 - {ref}`sec-vcfpartition` is a utility to split an input
   VCF into a given number of partitions. This is useful for
   parallel processing of VCF data.
@@ -25,13 +28,11 @@ support for other formats such as BGEN (or an interested in helping with impleme
 please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues)
 to discuss!
 
+## Python APIs
 
-The package is currently focused on command line interfaces, but a
-Python API is also planned.
+There is access to some limited functionality via Python APIs (documented
+along with the respective tools). These are in beta, and should be fully
+documented and stabilised in the coming releases. General APIs to enable
+efficient and straightforward encoding of data to VCZ are planned
+(see [issue #412](https://github.com/sgkit-dev/bio2zarr/issues/412)).
 
-:::{warning}
-Although it is possible to import the bio2zarr Python package
-the APIs are purely internal for the moment and will change
-in arbitrary ways. Please don't use them (or open issues about
-them on GitHub).
-:::
diff --git a/docs/vcf2zarr/python_api.md b/docs/vcf2zarr/python_api.md
@@ -0,0 +1,17 @@
+(sec-vcf2zarr-python-api)=
+# Python API
+
+Basic usage:
+```python
+import bio2zarr.vcf as v2z
+
+v2z.convert([vcf_path], vcz_path)
+```
+
+## API reference
+
+```{eval-rst}
+
+.. autofunction:: bio2zarr.vcf.convert
+
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -52,6 +52,7 @@ documentation = "https://sgkit-dev.github.io/bio2zarr/"
 vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
 vcfpartition = "bio2zarr.cli:vcfpartition"
 tskit2zarr = "bio2zarr.cli:tskit2zarr_main"
+plink2zarr = "bio2zarr.cli:plink2zarr_main"
 
 [project.optional-dependencies]
 dev = [