diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46002159..af15cdec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,6 +69,12 @@ jobs: python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1 python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2 python -m bio2zarr vcf2zarr dencode-finalise sample.vcz + - name: Run tskit2zarr example + run: | + python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f + - name: Run plink2zarr example + run: | + python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f - name: Run tests run: | pytest --cov=bio2zarr @@ -137,6 +143,14 @@ jobs: run: | vcfpartition --help python -m bio2zarr vcfpartition --help + - name: Check tskit2zarr CLI + run: | + tskit2zarr --help + python -m bio2zarr tskit2zarr --help + - name: Check plink2zarr CLI + run: | + plink2zarr --help + python -m bio2zarr plink2zarr --help test-numpy-version: name: Test numpy versions diff --git a/CHANGELOG.md b/CHANGELOG.md index 0909fd68..b8cb77f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,21 @@ -# 0.1.6 2025-0X-XX +# 0.1.6 2025-05-23 + +- Initial Python API support for VCF and tskit one-shot conversion. Format +conversion is done using the functions ``bio2zarr.vcf.convert`` +and ``bio2zarr.tskit.convert``. - Initial version of supported plink2zarr (#390, #344, #382) +- Initial version of tskit2zarr (#232) + - Make format-specific dependencies optional (#385) +- Remove bed_reader dependency (#397, #400) + - Change default number of worker processes to zero (#404) to simplify debugging -Breaking changes +*Breaking changes* - Remove explicit sample, contig and filter lists from the schema. Existing ICFs will need to be recreated. (#343) diff --git a/bio2zarr/__main__.py b/bio2zarr/__main__.py index 13728659..c19d5334 100644 --- a/bio2zarr/__main__.py +++ b/bio2zarr/__main__.py @@ -15,9 +15,9 @@ def bio2zarr(): # is handy for development and for those whose PATHs aren't set # up in the right way. bio2zarr.add_command(cli.vcf2zarr_main) -bio2zarr.add_command(cli.plink2zarr) +bio2zarr.add_command(cli.plink2zarr_main) +bio2zarr.add_command(cli.tskit2zarr_main) bio2zarr.add_command(cli.vcfpartition) -bio2zarr.add_command(cli.tskit2zarr) if __name__ == "__main__": bio2zarr() diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index e8dade34..45553f23 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -531,6 +531,7 @@ def vcf2zarr_main(): Convert VCF file(s) to VCF Zarr format. See the online documentation at https://sgkit-dev.github.io/bio2zarr/ + for more information. 
""" @@ -551,6 +552,7 @@ def vcf2zarr_main(): @click.command(name="convert") @click.argument("in_path", type=click.Path()) @click.argument("zarr_path", type=click.Path()) +@force @worker_processes @progress @verbose @@ -559,6 +561,7 @@ def vcf2zarr_main(): def convert_plink( in_path, zarr_path, + force, verbose, worker_processes, progress, @@ -571,6 +574,7 @@ def convert_plink( then running `vcf2zarr convert tmp.vcf zarr_path` """ setup_logging(verbose) + check_overwrite_dir(zarr_path, force) plink.convert( in_path, zarr_path, @@ -582,15 +586,15 @@ def convert_plink( @version -@click.group() -def plink2zarr(): +@click.group(name="plink2zarr") +def plink2zarr_main(): """ Convert plink fileset(s) to VCF Zarr format """ pass -plink2zarr.add_command(convert_plink) +plink2zarr_main.add_command(convert_plink) @click.command @@ -684,12 +688,12 @@ def convert_tskit( @version -@click.group() -def tskit2zarr(): +@click.group(name="tskit2zarr") +def tskit2zarr_main(): """ Convert tskit tree sequence(s) to VCF Zarr format """ pass -tskit2zarr.add_command(convert_tskit) +tskit2zarr_main.add_command(convert_tskit) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 500bd422..6f59adf8 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -33,17 +33,19 @@ BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS]) -def read_fam(path, sep=None): +# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion +# on the parameters to Pandas here. +def read_fam(path): # See: https://www.cog-genomics.org/plink/1.9/formats#fam names = [f[0] for f in FAM_FIELDS] - df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE) + df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python") return df -def read_bim(path, sep=None): +def read_bim(path): # See: https://www.cog-genomics.org/plink/1.9/formats#bim names = [f[0] for f in BIM_FIELDS] - df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE) + df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python") return df @@ -94,6 +96,18 @@ def __init__(self, path, num_variants, num_samples): self.byte_lookup = lookup + def iter_decode(self, start, stop, buffer_size=None): + """ + Iterate of over the variants in the specified window + with the specified approximate buffer size in bytes (default=10MiB). + """ + if buffer_size is None: + buffer_size = 10 * 1024 * 1024 + variants_per_read = max(1, int(buffer_size / self.bytes_per_variant)) + for off in range(start, stop, variants_per_read): + genotypes = self.decode(off, min(off + variants_per_read, stop)) + yield from genotypes + def decode(self, start, stop): chunk_size = stop - start @@ -102,6 +116,11 @@ def decode(self, start, stop): start_offset = 3 + (start * self.bytes_per_variant) bytes_to_read = chunk_size * self.bytes_per_variant + logger.debug( + f"Reading {chunk_size} variants ({bytes_to_read} bytes) " + f"from {self.path}" + ) + # TODO make it possible to read sequentially from the same file handle, # seeking only when necessary. 
with open(self.path, "rb") as f: @@ -175,19 +194,16 @@ def iter_id(self, start, stop): yield from self.bim.variant_id[start:stop] def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): - alt_field = self.bim.allele_1.values - ref_field = self.bim.allele_2.values - gt = self.bed_reader.decode(start, stop) - phased = np.zeros(gt.shape[:2], dtype=bool) - for i, (ref, alt) in enumerate( - zip(ref_field[start:stop], alt_field[start:stop]) - ): + alt_iter = self.bim.allele_1.values[start:stop] + ref_iter = self.bim.allele_2.values[start:stop] + gt_iter = self.bed_reader.iter_decode(start, stop) + for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter): alleles = np.full(num_alleles, constants.STR_FILL, dtype="O") alleles[0] = ref alleles[1 : 1 + len(alt)] = alt - + phased = np.zeros(gt.shape[0], dtype=bool) # rlen is the length of the REF in PLINK as there's no END annotations - yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i]) + yield vcz.VariantData(len(alleles[0]), alleles, gt, phased) def generate_schema( self, diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 29654c73..31b17bca 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1608,7 +1608,7 @@ def mkschema( def convert( vcfs, - out_path, + vcz_path, *, variants_chunk_size=None, samples_chunk_size=None, @@ -1617,6 +1617,12 @@ def convert( show_progress=False, icf_path=None, ): + """ + Convert the VCF data at the specified list of paths + to VCF Zarr format stored at the specified path. + + .. todo:: Document parameters + """ if icf_path is None: cm = temp_icf_path(prefix="vcf2zarr") else: @@ -1631,7 +1637,7 @@ def convert( ) encode( icf_path, - out_path, + vcz_path, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, worker_processes=worker_processes, diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 28a60741..45864ac3 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -842,6 +842,7 @@ def encode_alleles_and_genotypes_partition(self, partition_index): partition_index, "call_genotype_phased" ) shape = gt.buff.shape[1:] + for variant_data in self.source.iter_alleles_and_genotypes( partition.start, partition.stop, shape, alleles.array.shape[1] ): diff --git a/docs/_toc.yml b/docs/_toc.yml index 718b3512..d88093b7 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -6,6 +6,7 @@ chapters: sections: - file: vcf2zarr/tutorial - file: vcf2zarr/cli_ref + - file: vcf2zarr/python_api - file: plink2zarr/overview sections: - file: plink2zarr/cli_ref diff --git a/docs/intro.md b/docs/intro.md index 8b563fae..71de0b5a 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -11,6 +11,9 @@ - {ref}`sec-plink2zarr` converts PLINK 1.0 data to [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format. +- {ref}`sec-tskit2zarr` converts [tskit](https://tskit.dev) + data into [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) format. + - {ref}`sec-vcfpartition` is a utility to split an input VCF into a given number of partitions. This is useful for parallel processing of VCF data. @@ -25,13 +28,11 @@ support for other formats such as BGEN (or an interested in helping with impleme please open an [issue on Github](https://github.com/sgkit-dev/bio2zarr/issues) to discuss! +## Python APIs -The package is currently focused on command line interfaces, but a -Python API is also planned. +There is access to some limited functionality via Python APIs (documented +along with the respective tools). These are in beta, and should be fully +documented and stabilised in the coming releases. 
General APIs to enable +efficient and straightforward encoding of data to VCZ are planned +(see [issue #412](https://github.com/sgkit-dev/bio2zarr/issues/412)). -:::{warning} -Although it is possible to import the bio2zarr Python package -the APIs are purely internal for the moment and will change -in arbitrary ways. Please don't use them (or open issues about -them on GitHub). -::: diff --git a/docs/vcf2zarr/python_api.md b/docs/vcf2zarr/python_api.md new file mode 100644 index 00000000..47c0a546 --- /dev/null +++ b/docs/vcf2zarr/python_api.md @@ -0,0 +1,17 @@ +(sec-vcf2zarr-python-api)= +# Python API + +Basic usage: +```python +import bio2zarr.vcf as v2z + +v2z.convert([vcf_path], vcz_path) +``` + +## API reference + +```{eval-rst} + +.. autofunction:: bio2zarr.vcf.convert + +``` diff --git a/pyproject.toml b/pyproject.toml index 0ba72320..50e52f22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ documentation = "https://sgkit-dev.github.io/bio2zarr/" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" tskit2zarr = "bio2zarr.cli:tskit2zarr_main" +plink2zarr = "bio2zarr.cli:plink2zarr_main" [project.optional-dependencies] dev = [ diff --git a/tests/test_cli.py b/tests/test_cli.py index 5113f356..f8c628f9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -616,7 +616,7 @@ def test_vcf_convert_overwrite_zarr_confirm_no(self, mocked, tmp_path, response) def test_convert_plink(self, mocked, progress, flag): runner = ct.CliRunner() result = runner.invoke( - cli.plink2zarr, ["convert", "in", "out", flag], catch_exceptions=False + cli.plink2zarr_main, ["convert", "in", "out", flag], catch_exceptions=False ) assert result.exit_code == 0 assert len(result.stdout) == 0 @@ -651,7 +651,7 @@ def test_convert_tskit(self, mocked, tmp_path, progress, flag): zarr_path = tmp_path / "zarr" runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path} {flag}", catch_exceptions=False, ) @@ -674,7 +674,7 @@ def test_tskit_convert_overwrite_zarr_confirm_yes(self, mocked, tmp_path, respon zarr_path.mkdir() runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path}", catch_exceptions=False, input=response, @@ -696,7 +696,7 @@ def test_tskit_convert_overwrite_zarr_confirm_no(self, mocked, tmp_path, respons zarr_path.mkdir() runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path}", catch_exceptions=False, input=response, @@ -713,7 +713,7 @@ def test_tskit_convert_overwrite_zarr_force(self, mocked, tmp_path, force_arg): zarr_path.mkdir() runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path} {force_arg}", catch_exceptions=False, ) @@ -732,7 +732,7 @@ def test_tskit_convert_with_options(self, mocked, tmp_path): zarr_path = tmp_path / "zarr" runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path} --contig-id chr1 " "--isolated-as-missing -l 100 -w 50 -p 4", catch_exceptions=False, @@ -754,6 +754,66 @@ def test_tskit_convert_with_options(self, mocked, tmp_path): **expected_args, ) + @pytest.mark.parametrize("response", ["y", "Y", "yes"]) + @mock.patch("bio2zarr.plink.convert") + def test_plink_convert_overwrite_zarr_confirm_yes(self, mocked, tmp_path, response): + prefix = "tests/data/plink/example" + zarr_path = tmp_path / "zarr" + zarr_path.mkdir() + 
runner = ct.CliRunner() + result = runner.invoke( + cli.plink2zarr_main, + f"convert {prefix} {zarr_path}", + catch_exceptions=False, + input=response, + ) + assert result.exit_code == 0 + assert f"Do you want to overwrite {zarr_path}" in result.stdout + assert len(result.stderr) == 0 + mocked.assert_called_once_with( + prefix, + str(zarr_path), + **DEFAULT_PLINK_CONVERT_ARGS, + ) + + @pytest.mark.parametrize("response", ["n", "N", "No"]) + @mock.patch("bio2zarr.plink.convert") + def test_plink_convert_overwrite_zarr_confirm_no(self, mocked, tmp_path, response): + prefix = "tests/data/plink/example" + zarr_path = tmp_path / "zarr" + zarr_path.mkdir() + runner = ct.CliRunner() + result = runner.invoke( + cli.plink2zarr_main, + f"convert {prefix} {zarr_path}", + catch_exceptions=False, + input=response, + ) + assert result.exit_code == 1 + assert "Aborted" in result.stderr + mocked.assert_not_called() + + @pytest.mark.parametrize("force_arg", ["-f", "--force"]) + @mock.patch("bio2zarr.plink.convert") + def test_plink_convert_overwrite_zarr_force(self, mocked, tmp_path, force_arg): + prefix = "tests/data/plink/example" + zarr_path = tmp_path / "zarr" + zarr_path.mkdir() + runner = ct.CliRunner() + result = runner.invoke( + cli.plink2zarr_main, + f"convert {prefix} {zarr_path} {force_arg}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert len(result.stdout) == 0 + assert len(result.stderr) == 0 + mocked.assert_called_once_with( + prefix, + str(zarr_path), + **DEFAULT_PLINK_CONVERT_ARGS, + ) + class TestVcfEndToEnd: vcf_path = "tests/data/vcf/sample.vcf.gz" @@ -1032,7 +1092,7 @@ def test_convert(self, tmp_path): zarr_path = tmp_path / "zarr" runner = ct.CliRunner() result = runner.invoke( - cli.tskit2zarr, + cli.tskit2zarr_main, f"convert {ts_path} {zarr_path}", catch_exceptions=False, ) @@ -1051,7 +1111,7 @@ def test_convert(self, tmp_path): zarr_path = tmp_path / "zarr" runner = ct.CliRunner() result = runner.invoke( - cli.plink2zarr, + cli.plink2zarr_main, f"convert {ts_path} {zarr_path}", catch_exceptions=False, ) @@ -1069,9 +1129,9 @@ def test_convert(self, tmp_path): [ main.bio2zarr, cli.vcf2zarr_main, - cli.plink2zarr, + cli.plink2zarr_main, cli.vcfpartition, - cli.tskit2zarr, + cli.tskit2zarr_main, ], ) def test_version(cmd): diff --git a/tests/test_plink.py b/tests/test_plink.py index 0f863232..469d288c 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -93,7 +93,6 @@ def test_generated_bed_files(self, tmp_path, num_variants, num_samples): data = np.arange(num_variants * num_samples, dtype=int) % 4 data[data == 3] = -127 data = data.reshape((num_variants, num_samples)) - bed_reader.to_bed(bed_file, data.T, num_threads=1) bytes_per_variant = (num_samples + 3) // 4 @@ -108,6 +107,31 @@ def test_generated_bed_files(self, tmp_path, num_variants, num_samples): for k in range(num_samples): assert br_map[data[j, k]] == tuple(g[j, k]) + @pytest.mark.parametrize( + ("num_variants", "num_samples"), + [ + (1, 1), + (30, 3), + (300, 1000), + ], + ) + @pytest.mark.parametrize("buffer_size", [0, 1, 3, 100, 100_000, None]) + def test_iter_decode(self, tmp_path, buffer_size, num_variants, num_samples): + bed_file = tmp_path / "a_file.bed" + # Generate a regular pattern of all possible values + data = np.arange(num_variants * num_samples, dtype=int) % 4 + data[data == 3] = -127 + data = data.reshape((num_variants, num_samples)) + bed_reader.to_bed(bed_file, data.T, num_threads=1) + + reader = plink.BedReader(bed_file, num_variants, num_samples) + G = reader.decode(0, 
num_variants) + assert G.shape == (num_variants, num_samples, 2) + + reader = plink.BedReader(bed_file, num_variants, num_samples) + G2 = list(reader.iter_decode(0, num_variants, buffer_size=buffer_size)) + nt.assert_array_equal(G, G2) + class TestSmallExample: @pytest.fixture(scope="class") @@ -356,7 +380,13 @@ def test_chunk_size( ds = ds.drop_vars("region_index") ds2 = ds2.drop_vars("region_index") xt.assert_equal(ds, ds2) - # TODO check array chunks + root = zarr.open(out, mode="r") + v = variants_chunk_size + s = samples_chunk_size + assert root["call_genotype"].chunks == (v, s, 2) + assert root["call_genotype_phased"].chunks == (v, s) + assert root["variant_position"].chunks == (v,) + assert root["variant_contig"].chunks == (v,) def validate(bed_path, zarr_path):
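
Example usage of the one-shot Python conversion entry points referenced in the CHANGELOG and in the new `docs/vcf2zarr/python_api.md` page above. This is a minimal sketch with placeholder input/output paths; it assumes `bio2zarr.plink.convert` and `bio2zarr.tskit.convert` take the same positional `(in_path, zarr_path)` arguments that the CLI wrappers and tests in this diff pass through, and only keyword arguments visible in the `bio2zarr.vcf.convert` signature are shown.

```python
# Minimal sketch of the one-shot Python API (beta). Paths are placeholders.
import bio2zarr.plink as plink
import bio2zarr.tskit as tskit
import bio2zarr.vcf as vcf

# VCF -> VCF Zarr, as documented in docs/vcf2zarr/python_api.md.
vcf.convert(["sample.vcf.gz"], "sample.vcz", worker_processes=4, show_progress=True)

# PLINK fileset prefix -> VCF Zarr (Python equivalent of `plink2zarr convert`).
plink.convert("tests/data/plink/example", "plink_example.vcz")

# tskit tree sequence -> VCF Zarr (Python equivalent of `tskit2zarr convert`).
tskit.convert("tests/data/tskit/example.trees", "ts_example.vcz")
```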
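
The new `BedReader.iter_decode` method reads genotypes in windows sized from an approximate byte budget (default 10 MiB). The helper below is a hypothetical, standalone restatement of that windowing arithmetic, not part of bio2zarr, using the same `(num_samples + 3) // 4` bytes-per-variant packing that appears in the tests.

```python
# Hypothetical helper illustrating how BedReader.iter_decode chooses its read
# windows: each .bed variant record occupies ceil(num_samples / 4) bytes, and
# the buffer size caps how many variants are decoded per file read.
def read_windows(num_samples, start, stop, buffer_size=10 * 1024 * 1024):
    bytes_per_variant = (num_samples + 3) // 4
    variants_per_read = max(1, int(buffer_size / bytes_per_variant))
    for off in range(start, stop, variants_per_read):
        yield off, min(off + variants_per_read, stop)

# With 1000 samples (250 bytes per variant) and the default 10 MiB buffer,
# each window spans int(10 * 1024 * 1024 / 250) = 41943 variants.
assert next(read_windows(1000, 0, 100_000)) == (0, 41943)
# A buffer smaller than one variant record still decodes one variant per read,
# matching the max(1, ...) clamp (and the buffer_size=0/1 cases in the tests).
assert list(read_windows(3, 0, 3, buffer_size=1)) == [(0, 1), (1, 2), (2, 3)]
```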
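
The extended `test_chunk_size` assertions check the chunk layout of the encoded store directly with `zarr`. The same inspection works on any converted store; `example.vcz` below is a placeholder for output produced by one of the convert calls above.

```python
# Inspect the chunk shapes of a converted VCF Zarr store (placeholder path).
import zarr

root = zarr.open("example.vcz", mode="r")
# Genotype arrays are chunked as (variants_chunk_size, samples_chunk_size[, ploidy]).
print(root["call_genotype"].chunks)
print(root["call_genotype_phased"].chunks)
# 1-D variant fields are chunked along the variants dimension only.
print(root["variant_position"].chunks)
print(root["variant_contig"].chunks)
```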