# Summary: make the format-specific readers optional dependencies of bio2zarr.
#  - pyproject.toml: tskit, bed_reader and cyvcf2 move out of the core
#    dependencies into the "tskit", "plink", "vcf" and "all" extras (and "dev").
#  - bio2zarr/core.py: new requires_optional_dependency decorator that raises an
#    ImportError carrying a "pip install bio2zarr[<extra>]" hint.
#  - plink.py, tskit.py, vcf_utils.py, vcz_verification.py: the optional modules
#    are imported inside the decorated functions instead of at module import.
#  - CI: a new job checks that each converter fails with the hint when its extra
#    is absent and succeeds once it is installed; the docs build uses '.[all]'.
#  - tests: cover the decorator and the three missing-dependency error paths.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b18448fc..dcd71845 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -81,6 +81,46 @@ jobs:
           # https://github.com/coverallsapp/github-action
           fail-on-error: false
 
+  optional_dependencies:
+    name: Optional dependencies
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Test optional dependencies  # per extra: fail without it, pass with it
+        run: |
+          python -m venv env-tskit
+          source env-tskit/bin/activate
+          python -m pip install .
+          python -m bio2zarr tskit2zarr convert tests/data/ts/example.trees ts.vcz > ts.txt 2>&1 || echo $? > ts_exit.txt
+          test "$(cat ts_exit.txt)" = "1"
+          grep -q "This process requires the optional tskit module. Install it with: pip install bio2zarr\[tskit\]" ts.txt
+          python -m pip install '.[tskit]'
+          python -m bio2zarr tskit2zarr convert tests/data/ts/example.trees ts.vcz
+          deactivate
+
+          python -m venv env-plink
+          source env-plink/bin/activate
+          python -m pip install .
+          python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
+          test "$(cat plink_exit.txt)" = "1"
+          grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt
+          python -m pip install '.[plink]'
+          python -m bio2zarr plink2zarr convert tests/data/plink/example.bed plink.vcz
+          deactivate
+
+          python -m venv env-vcf
+          source env-vcf/bin/activate
+          python -m pip install .
+          python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz > vcf.txt 2>&1 || echo $? > vcf_exit.txt
+          test "$(cat vcf_exit.txt)" = "1"
+          grep -q "This process requires the optional cyvcf2 module. Install it with: pip install bio2zarr\[vcf\]" vcf.txt
+          python -m pip install '.[vcf]'
+          python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz
+          deactivate
+
   packaging:
     name: Packaging
     runs-on: ubuntu-latest
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index c09be0d8..fcda2cd7 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -38,7 +38,7 @@ jobs:
 
       - name: Install package
         run: |
-          python3 -m pip install .
+          python3 -m pip install '.[all]'
 
       - name: Build Docs
         run: |
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7f602e2..11c3c261 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 # 0.1.6 2025-0X-XX
 
+- Make format-specific dependencies optional (#385)
+
 - Add contigs to plink output (#344)
 
 Breaking changes
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
index f526dc9b..723dd2b7 100644
--- a/bio2zarr/core.py
+++ b/bio2zarr/core.py
@@ -1,6 +1,8 @@
 import concurrent.futures as cf
 import contextlib
 import dataclasses
+import functools
+import importlib
 import json
 import logging
 import math
@@ -21,6 +23,26 @@
 numcodecs.blosc.use_threads = False
 
 
+def requires_optional_dependency(module_name, extras_name):
+    """Decorator raising a helpful ImportError when module_name cannot be imported"""
+
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                importlib.import_module(module_name)
+            except ImportError:
+                raise ImportError(
+                    f"This process requires the optional {module_name} module. "
+                    f"Install it with: pip install bio2zarr[{extras_name}]"
+                ) from None  # suppress the original import traceback
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
 def display_number(x):
     ret = "n/a"
     if math.isfinite(x):
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index f864f3a5..76cad279 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -1,7 +1,6 @@
 import logging
 import pathlib
 
-import bed_reader
 import numpy as np
 import zarr
 
@@ -11,7 +10,10 @@
 
 
 class PlinkFormat(vcz.Source):
+    @core.requires_optional_dependency("bed_reader", "plink")
     def __init__(self, path):
+        import bed_reader
+
         self._path = pathlib.Path(path)
         self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
 
@@ -175,7 +177,10 @@ def convert(
 
 # FIXME do this more efficiently - currently reading the whole thing
 # in for convenience, and also comparing call-by-call
+@core.requires_optional_dependency("bed_reader", "plink")
 def validate(bed_path, zarr_path):
+    import bed_reader
+
     root = zarr.open(store=zarr_path, mode="r")
     call_genotype = root["call_genotype"][:]
 
diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py
index 4dccedaa..112e901a 100644
--- a/bio2zarr/tskit.py
+++ b/bio2zarr/tskit.py
@@ -2,7 +2,6 @@
 import pathlib
 
 import numpy as np
-import tskit
 
 from bio2zarr import constants, core, vcz
 
@@ -10,6 +9,7 @@
 
 
 class TskitFormat(vcz.Source):
+    @core.requires_optional_dependency("tskit", "tskit")
     def __init__(
         self,
         ts_path,
@@ -18,6 +18,8 @@ def __init__(
         contig_id=None,
         isolated_as_missing=False,
     ):
+        import tskit
+
         self._path = ts_path
         self.ts = tskit.load(ts_path)
         self.contig_id = contig_id if contig_id is not None else "1"
diff --git a/bio2zarr/vcf_utils.py b/bio2zarr/vcf_utils.py
index a8b3b551..b51e8ebc 100644
--- a/bio2zarr/vcf_utils.py
+++ b/bio2zarr/vcf_utils.py
@@ -9,10 +9,10 @@
 from enum import Enum
 from typing import IO, Any
 
-import cyvcf2
 import humanfriendly
 import numpy as np
 
+from bio2zarr import core
 from bio2zarr.typing import PathType
 
 logger = logging.getLogger(__name__)
@@ -395,7 +395,10 @@ class VcfIndexType(Enum):
 
 
 class VcfFile(contextlib.AbstractContextManager):
+    @core.requires_optional_dependency("cyvcf2", "vcf")
     def __init__(self, vcf_path, index_path=None):
+        import cyvcf2
+
         self.vcf = None
         self.file_type = None
         self.index_type = None
diff --git a/bio2zarr/vcz_verification.py b/bio2zarr/vcz_verification.py
index 6faf356d..2af708c3 100644
--- a/bio2zarr/vcz_verification.py
+++ b/bio2zarr/vcz_verification.py
@@ -1,9 +1,9 @@
-import cyvcf2
 import numpy as np
 import numpy.testing as nt
 import tqdm
 import zarr
 
+from bio2zarr import core
 from bio2zarr.zarr_utils import first_dim_iter
 
 from . import constants
@@ -146,7 +146,10 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
         nt.assert_equal(vcf_val, zarr_val)
 
 
+@core.requires_optional_dependency("cyvcf2", "vcf")
 def verify(vcf_path, zarr_path, show_progress=False):
+    import cyvcf2
+
     root = zarr.open(store=zarr_path, mode="r")
     pos = root["variant_position"][:]
     allele = root["variant_allele"][:]
diff --git a/pyproject.toml b/pyproject.toml
index f838e084..17898968 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,16 +17,12 @@ dependencies = [
   "zarr >= 2.17,< 3",
   # Pinning numcodecs due to https://github.com/zarr-developers/zarr-python/issues/2963
   "numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16",
-  "click",
   "tabulate",
   "tqdm",
   "humanfriendly",
-  # cyvcf2 also pulls in coloredlogs and click",
-  # colouredlogs pulls in humanfriendly",
-  "cyvcf2",
-  "bed_reader",
-  # TODO Using dev version of tskit for CI, FIXME before release
-  "tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python",
+  # cyvcf2 (now optional) used to pull in coloredlogs and click; list them explicitly
+  "coloredlogs",
+  "click",
 ]
 requires-python = ">=3.10"
 classifiers = [
@@ -65,8 +61,21 @@ dev = [
   "pytest-coverage",
   "pytest-xdist",
   "sgkit>=0.8.0",
-  "tqdm"
+  "tqdm",
+  "tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python",
+  "bed_reader",
+  "cyvcf2"
 ]
+# TODO Using dev version of tskit for CI, FIXME before release
+tskit = ["tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python"]
+plink = ["bed_reader"]
+vcf = ["cyvcf2"]
+all = [
+  "tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python",
+  "bed_reader",
+  "cyvcf2"
+  ]
+
 
 [tool.setuptools]
 packages = ["bio2zarr"]
diff --git a/tests/test_core.py b/tests/test_core.py
index 5619fd3a..de596d29 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -244,3 +244,14 @@ def test_examples(self, chunk_size, size, start, stop):
 )
 def test_du(path, expected):
     assert core.du(path) == expected
+
+
+def test_decorator_missing_dependency():
+    @core.requires_optional_dependency("non_existent_module", "extras")
+    def test_function():
+        return "success"
+
+    with pytest.raises(ImportError) as exc_info:
+        test_function()
+
+    assert "pip install bio2zarr[extras]" in str(exc_info.value)
diff --git a/tests/test_plink.py b/tests/test_plink.py
index ed62eaf7..21ae557a 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -1,3 +1,5 @@
+from unittest import mock
+
 import bed_reader
 import numpy as np
 import numpy.testing as nt
@@ -55,6 +57,21 @@ def test_genotypes(self, ds):
             ],
         )
 
+    def test_missing_dependency(self):
+        with mock.patch(
+            "importlib.import_module",
+            side_effect=ImportError("No module named 'bed_reader'"),
+        ):
+            with pytest.raises(ImportError) as exc_info:
+                plink.convert(
+                    "UNUSED_PATH",
+                    "UNUSED_PATH",
+                )
+            assert (
+                "This process requires the optional bed_reader module. "
+                "Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
+            )
+
 
 class TestEqualSgkit:
     def test_simulated_example(self, tmp_path):
diff --git a/tests/test_ts.py b/tests/test_ts.py
index 96be0f3a..80fbe8d5 100644
--- a/tests/test_ts.py
+++ b/tests/test_ts.py
@@ -1,5 +1,6 @@
 import os
 import tempfile
+from unittest import mock
 
 import numpy as np
 import pytest
@@ -93,6 +94,21 @@ def test_simple_tree_sequence(self, tmp_path):
             "sample_id",
         }
 
+    def test_missing_dependency(self):
+        with mock.patch(
+            "importlib.import_module",
+            side_effect=ImportError("No module named 'tskit'"),
+        ):
+            with pytest.raises(ImportError) as exc_info:
+                ts.convert(
+                    "UNUSED_PATH",
+                    "UNUSED_PATH",
+                )
+            assert (
+                "This process requires the optional tskit module. Install "
+                "it with: pip install bio2zarr[tskit]" in str(exc_info.value)
+            )
+
 
 class TestTskitFormat:
     """Unit tests for TskitFormat without using full conversion."""
diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
index 2e7b93b6..7aaed7a3 100644
--- a/tests/test_vcf_examples.py
+++ b/tests/test_vcf_examples.py
@@ -1,6 +1,7 @@
 import collections
 import pathlib
 import re
+from unittest import mock
 
 import cyvcf2
 import numpy as np
@@ -481,6 +482,22 @@ def test_small_example_all_missing_gts(self, ds, tmp_path_factory):
         p1[1] = True
         nt.assert_array_equal(p1, ds2["call_genotype_phased"].values)
 
+    def test_missing_dependency(self, tmp_path):
+        with mock.patch(
+            "importlib.import_module",
+            side_effect=ImportError("No module named 'cyvcf2'"),
+        ):
+            with pytest.raises(ImportError) as exc_info:
+                vcf_mod.convert(
+                    ["tests/data/vcf/sample.vcf.gz"],
+                    tmp_path / "example.vcf.zarr",
+                    worker_processes=0,  # Synchronous mode so the mock works
+                )
+            assert (
+                "This process requires the optional cyvcf2 module. Install "
+                "it with: pip install bio2zarr[vcf]" in str(exc_info.value)
+            )
+
 
 class TestSmallExampleLocalAlleles:
     data_path = "tests/data/vcf/sample.vcf.gz"