diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index e95e5f69..cfe64580 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,8 +8,8 @@ import numcodecs import tabulate -from . import icf as icf_mod from . import plink, provenance, vcf_utils +from . import vcf as vcf_mod logger = logging.getLogger(__name__) @@ -197,7 +197,7 @@ def check_partitions(num_partitions): def get_compressor(cname): if cname is None: return None - config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config() + config = vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config() config["cname"] = cname return numcodecs.get_codec(config) @@ -236,7 +236,7 @@ def explode( """ setup_logging(verbose) check_overwrite_dir(icf_path, force) - icf_mod.explode( + vcf_mod.explode( icf_path, vcfs, worker_processes=worker_processes, @@ -276,7 +276,7 @@ def dexplode_init( setup_logging(verbose) check_overwrite_dir(icf_path, force) check_partitions(num_partitions) - work_summary = icf_mod.explode_init( + work_summary = vcf_mod.explode_init( icf_path, vcfs, target_num_partitions=num_partitions, @@ -304,7 +304,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based): setup_logging(verbose) if one_based: partition -= 1 - icf_mod.explode_partition(icf_path, partition) + vcf_mod.explode_partition(icf_path, partition) @click.command @@ -315,7 +315,7 @@ def dexplode_finalise(icf_path, verbose): Final step for distributed conversion of VCF(s) to intermediate columnar format. """ setup_logging(verbose) - icf_mod.explode_finalise(icf_path) + vcf_mod.explode_finalise(icf_path) @click.command @@ -326,7 +326,7 @@ def inspect(path, verbose): Inspect an intermediate columnar format or Zarr path. """ setup_logging(verbose) - data = icf_mod.inspect(path) + data = vcf_mod.inspect(path) click.echo(tabulate.tabulate(data, headers="keys")) @@ -345,7 +345,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles): err=True, ) stream = click.get_text_stream("stdout") - icf_mod.mkschema( + vcf_mod.mkschema( icf_path, stream, variants_chunk_size=variants_chunk_size, @@ -384,7 +384,7 @@ def encode( """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) - icf_mod.encode( + vcf_mod.encode( icf_path, zarr_path, schema_path=schema, @@ -438,7 +438,7 @@ def dencode_init( setup_logging(verbose) check_overwrite_dir(zarr_path, force) check_partitions(num_partitions) - work_summary = icf_mod.encode_init( + work_summary = vcf_mod.encode_init( icf_path, zarr_path, target_num_partitions=num_partitions, @@ -466,7 +466,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based): setup_logging(verbose) if one_based: partition -= 1 - icf_mod.encode_partition(zarr_path, partition) + vcf_mod.encode_partition(zarr_path, partition) @click.command @@ -478,7 +478,7 @@ def dencode_finalise(zarr_path, verbose, progress): Final step for distributed conversion of ICF to VCF Zarr. """ setup_logging(verbose) - icf_mod.encode_finalise(zarr_path, show_progress=progress) + vcf_mod.encode_finalise(zarr_path, show_progress=progress) @click.command(name="convert") @@ -507,7 +507,7 @@ def convert_vcf( """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) - icf_mod.convert( + vcf_mod.convert( vcfs, zarr_path, variants_chunk_size=variants_chunk_size, diff --git a/bio2zarr/icf.py b/bio2zarr/vcf.py similarity index 100% rename from bio2zarr/icf.py rename to bio2zarr/vcf.py diff --git a/tests/test_cli.py b/tests/test_cli.py index 79a18ce5..219f812e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -84,7 +84,7 @@ class TestWithMocks: vcf_path = "tests/data/vcf/sample.vcf.gz" @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode(self, mocked, tmp_path, progress, flag): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -101,7 +101,7 @@ def test_vcf_explode(self, mocked, tmp_path, progress, flag): mocked.assert_called_once_with(str(icf_path), (self.vcf_path,), **args) @pytest.mark.parametrize("compressor", ["lz4", "zstd"]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_compressor(self, mocked, tmp_path, compressor): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -124,7 +124,7 @@ def test_vcf_explode_compressor(self, mocked, tmp_path, compressor): ) @pytest.mark.parametrize("compressor", ["lz4", "zstd"]) - @mock.patch("bio2zarr.icf.explode_init") + @mock.patch("bio2zarr.vcf.explode_init") def test_vcf_dexplode_init_compressor(self, mocked, tmp_path, compressor): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -148,7 +148,7 @@ def test_vcf_dexplode_init_compressor(self, mocked, tmp_path, compressor): ) @pytest.mark.parametrize("compressor", ["LZ4", "asdf"]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_bad_compressor(self, mocked, tmp_path, compressor): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -161,7 +161,7 @@ def test_vcf_explode_bad_compressor(self, mocked, tmp_path, compressor): assert "Invalid value for '-C'" in result.stderr mocked.assert_not_called() - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_multiple_vcfs(self, mocked, tmp_path): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -178,7 +178,7 @@ def test_vcf_explode_multiple_vcfs(self, mocked, tmp_path): ) @pytest.mark.parametrize("response", ["y", "Y", "yes"]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_overwrite_icf_confirm_yes(self, mocked, tmp_path, response): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -197,7 +197,7 @@ def test_vcf_explode_overwrite_icf_confirm_yes(self, mocked, tmp_path, response) ) @pytest.mark.parametrize("response", ["y", "Y", "yes"]) - @mock.patch("bio2zarr.icf.encode") + @mock.patch("bio2zarr.vcf.encode") def test_vcf_encode_overwrite_zarr_confirm_yes(self, mocked, tmp_path, response): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -218,7 +218,7 @@ def test_vcf_encode_overwrite_zarr_confirm_yes(self, mocked, tmp_path, response) ) @pytest.mark.parametrize("force_arg", ["-f", "--force"]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_overwrite_icf_force(self, mocked, tmp_path, force_arg): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -236,7 +236,7 @@ def test_vcf_explode_overwrite_icf_force(self, mocked, tmp_path, force_arg): ) @pytest.mark.parametrize("force_arg", ["-f", "--force"]) - @mock.patch("bio2zarr.icf.encode") + @mock.patch("bio2zarr.vcf.encode") def test_vcf_encode_overwrite_icf_force(self, mocked, tmp_path, force_arg): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -257,7 +257,7 @@ def test_vcf_encode_overwrite_icf_force(self, mocked, tmp_path, force_arg): **DEFAULT_ENCODE_ARGS, ) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_missing_vcf(self, mocked, tmp_path): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -272,7 +272,7 @@ def test_vcf_explode_missing_vcf(self, mocked, tmp_path): mocked.assert_not_called() @pytest.mark.parametrize("response", ["n", "N", "No"]) - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_overwrite_icf_confirm_no(self, mocked, tmp_path, response): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -287,7 +287,7 @@ def test_vcf_explode_overwrite_icf_confirm_no(self, mocked, tmp_path, response): assert "Aborted" in result.stderr mocked.assert_not_called() - @mock.patch("bio2zarr.icf.explode") + @mock.patch("bio2zarr.vcf.explode") def test_vcf_explode_missing_and_existing_vcf(self, mocked, tmp_path): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) @@ -302,7 +302,7 @@ def test_vcf_explode_missing_and_existing_vcf(self, mocked, tmp_path): mocked.assert_not_called() @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.explode_init", return_value=FakeWorkSummary(5)) + @mock.patch("bio2zarr.vcf.explode_init", return_value=FakeWorkSummary(5)) def test_vcf_dexplode_init(self, mocked, tmp_path, progress, flag): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -324,7 +324,7 @@ def test_vcf_dexplode_init(self, mocked, tmp_path, progress, flag): ) @pytest.mark.parametrize("num_partitions", ["-1", "0", "asdf", "1.112"]) - @mock.patch("bio2zarr.icf.explode_init", return_value=5) + @mock.patch("bio2zarr.vcf.explode_init", return_value=5) def test_vcf_dexplode_init_bad_num_partitions( self, mocked, tmp_path, num_partitions ): @@ -339,7 +339,7 @@ def test_vcf_dexplode_init_bad_num_partitions( assert "Invalid value for '-n'" in result.stderr mocked.assert_not_called() - @mock.patch("bio2zarr.icf.explode_init", return_value=5) + @mock.patch("bio2zarr.vcf.explode_init", return_value=5) def test_vcf_dexplode_init_no_partitions(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -352,7 +352,7 @@ def test_vcf_dexplode_init_no_partitions(self, mocked, tmp_path): assert "-n/--num-partitions must currently be specified" in result.stderr mocked.assert_not_called() - @mock.patch("bio2zarr.icf.explode_partition") + @mock.patch("bio2zarr.vcf.explode_partition") def test_vcf_dexplode_partition(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -369,7 +369,7 @@ def test_vcf_dexplode_partition(self, mocked, tmp_path): str(icf_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS ) - @mock.patch("bio2zarr.icf.explode_partition") + @mock.patch("bio2zarr.vcf.explode_partition") def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -386,7 +386,7 @@ def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path): str(icf_path), 0, **DEFAULT_DEXPLODE_PARTITION_ARGS ) - @mock.patch("bio2zarr.icf.explode_partition") + @mock.patch("bio2zarr.vcf.explode_partition") def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -401,7 +401,7 @@ def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path): mocked.assert_not_called() @pytest.mark.parametrize("partition", ["-- -1", "asdf", "1.112"]) - @mock.patch("bio2zarr.icf.explode_partition") + @mock.patch("bio2zarr.vcf.explode_partition") def test_vcf_dexplode_partition_bad_partition(self, mocked, tmp_path, partition): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -416,7 +416,7 @@ def test_vcf_dexplode_partition_bad_partition(self, mocked, tmp_path, partition) assert len(result.stdout) == 0 mocked.assert_not_called() - @mock.patch("bio2zarr.icf.explode_finalise") + @mock.patch("bio2zarr.vcf.explode_finalise") def test_vcf_dexplode_finalise(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -427,7 +427,7 @@ def test_vcf_dexplode_finalise(self, mocked, tmp_path): assert len(result.stderr) == 0 mocked.assert_called_once_with(str(tmp_path)) - @mock.patch("bio2zarr.icf.inspect") + @mock.patch("bio2zarr.vcf.inspect") def test_inspect(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -438,7 +438,7 @@ def test_inspect(self, mocked, tmp_path): assert len(result.stderr) == 0 mocked.assert_called_once_with(str(tmp_path)) - @mock.patch("bio2zarr.icf.mkschema") + @mock.patch("bio2zarr.vcf.mkschema") def test_mkschema(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -455,7 +455,7 @@ def test_mkschema(self, mocked, tmp_path): mocked.assert_called_once() @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.encode") + @mock.patch("bio2zarr.vcf.encode") def test_encode(self, mocked, tmp_path, progress, flag): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -478,7 +478,7 @@ def test_encode(self, mocked, tmp_path, progress, flag): ) @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.encode_init", return_value=FakeWorkSummary(10)) + @mock.patch("bio2zarr.vcf.encode_init", return_value=FakeWorkSummary(10)) def test_dencode_init(self, mocked, tmp_path, progress, flag): icf_path = tmp_path / "icf" icf_path.mkdir() @@ -501,7 +501,7 @@ def test_dencode_init(self, mocked, tmp_path, progress, flag): **args, ) - @mock.patch("bio2zarr.icf.encode_init", return_value=5) + @mock.patch("bio2zarr.vcf.encode_init", return_value=5) def test_vcf_dencode_init_no_partitions(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) icf_path = tmp_path / "icf" @@ -516,7 +516,7 @@ def test_vcf_dencode_init_no_partitions(self, mocked, tmp_path): assert "-n/--num-partitions must currently be specified" in result.stderr mocked.assert_not_called() - @mock.patch("bio2zarr.icf.encode_partition") + @mock.patch("bio2zarr.vcf.encode_partition") def test_vcf_dencode_partition(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) zarr_path = tmp_path / "zarr" @@ -533,7 +533,7 @@ def test_vcf_dencode_partition(self, mocked, tmp_path): str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS ) - @mock.patch("bio2zarr.icf.encode_partition") + @mock.patch("bio2zarr.vcf.encode_partition") def test_vcf_dencode_partition_one_based(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) zarr_path = tmp_path / "zarr" @@ -551,7 +551,7 @@ def test_vcf_dencode_partition_one_based(self, mocked, tmp_path): ) @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.encode_finalise") + @mock.patch("bio2zarr.vcf.encode_finalise") def test_vcf_dencode_finalise(self, mocked, tmp_path, progress, flag): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -567,7 +567,7 @@ def test_vcf_dencode_finalise(self, mocked, tmp_path, progress, flag): mocked.assert_called_once_with(str(tmp_path), **args) @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) - @mock.patch("bio2zarr.icf.convert") + @mock.patch("bio2zarr.vcf.convert") def test_convert_vcf(self, mocked, progress, flag): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -587,7 +587,7 @@ def test_convert_vcf(self, mocked, progress, flag): ) @pytest.mark.parametrize("response", ["n", "N", "No"]) - @mock.patch("bio2zarr.icf.convert") + @mock.patch("bio2zarr.vcf.convert") def test_vcf_convert_overwrite_zarr_confirm_no(self, mocked, tmp_path, response): zarr_path = tmp_path / "zarr" zarr_path.mkdir() @@ -617,7 +617,7 @@ def test_convert_plink(self, mocked, progress, flag): mocked.assert_called_once_with("in", "out", **args) @pytest.mark.parametrize("response", ["y", "Y", "yes"]) - @mock.patch("bio2zarr.icf.convert") + @mock.patch("bio2zarr.vcf.convert") def test_vcf_convert_overwrite_zarr_confirm_yes(self, mocked, tmp_path, response): zarr_path = tmp_path / "zarr" zarr_path.mkdir() diff --git a/tests/test_icf.py b/tests/test_icf.py index 479f1ab1..a08064a1 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -6,8 +6,8 @@ import numpy.testing as nt import pytest -from bio2zarr import icf as icf_mod from bio2zarr import provenance, vcf_utils, vcz +from bio2zarr import vcf as vcf_mod class TestSmallExample: @@ -25,10 +25,10 @@ class TestSmallExample: @pytest.fixture(scope="class") def icf(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - return icf_mod.explode(out, [self.data_path]) + return vcf_mod.explode(out, [self.data_path]) def test_format_version(self, icf): - assert icf.metadata.format_version == icf_mod.ICF_METADATA_FORMAT_VERSION + assert icf.metadata.format_version == vcf_mod.ICF_METADATA_FORMAT_VERSION def test_provenance(self, icf): assert icf.metadata.provenance == { @@ -38,7 +38,7 @@ def test_provenance(self, icf): def test_mkschema(self, tmp_path, icf): schema_file = tmp_path / "schema.json" with open(schema_file, "w") as f: - icf_mod.mkschema(icf.path, f) + vcf_mod.mkschema(icf.path, f) with open(schema_file) as f: schema1 = vcz.VcfZarrSchema.fromjson(f.read()) schema2 = icf.generate_schema() @@ -50,7 +50,7 @@ def test_summary_table(self, icf): assert tuple(sorted(fields)) == self.fields def test_inspect(self, icf): - assert icf.summary_table() == icf_mod.inspect(icf.path) + assert icf.summary_table() == vcf_mod.inspect(icf.path) def test_num_partitions(self, icf): assert icf.num_partitions == 3 @@ -92,7 +92,7 @@ class TestWithGtHeaderNoGenotypes: @pytest.fixture(scope="class") def icf(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - return icf_mod.explode(out, [self.data_path]) + return vcf_mod.explode(out, [self.data_path]) def test_gts(self, icf): values = icf.fields["FORMAT/GT"].values @@ -114,7 +114,7 @@ class TestIcfWriterExample: def test_init_paths(self, tmp_path): icf_path = tmp_path / "x.icf" assert not icf_path.exists() - summary = icf_mod.explode_init(icf_path, [self.data_path]) + summary = vcf_mod.explode_init(icf_path, [self.data_path]) assert summary.num_partitions == 3 assert icf_path.exists() wip_path = icf_path / "wip" @@ -127,50 +127,50 @@ def test_init_paths(self, tmp_path): def test_finalise_paths(self, tmp_path): icf_path = tmp_path / "x.icf" wip_path = icf_path / "wip" - summary = icf_mod.explode_init(icf_path, [self.data_path]) + summary = vcf_mod.explode_init(icf_path, [self.data_path]) assert icf_path.exists() for j in range(summary.num_partitions): - icf_mod.explode_partition(icf_path, j) + vcf_mod.explode_partition(icf_path, j) assert wip_path.exists() - icf_mod.explode_finalise(icf_path) + vcf_mod.explode_finalise(icf_path) assert icf_path.exists() assert not wip_path.exists() def test_finalise_no_partitions_fails(self, tmp_path): icf_path = tmp_path / "x.icf" - icf_mod.explode_init(icf_path, [self.data_path]) + vcf_mod.explode_init(icf_path, [self.data_path]) with pytest.raises(FileNotFoundError, match="3 partitions: \\[0, 1, 2\\]"): - icf_mod.explode_finalise(icf_path) + vcf_mod.explode_finalise(icf_path) @pytest.mark.parametrize("partition", [0, 1, 2]) def test_finalise_missing_partition_fails(self, tmp_path, partition): icf_path = tmp_path / "x.icf" - icf_mod.explode_init(icf_path, [self.data_path]) + vcf_mod.explode_init(icf_path, [self.data_path]) for j in range(3): if j != partition: - icf_mod.explode_partition(icf_path, j) + vcf_mod.explode_partition(icf_path, j) with pytest.raises(FileNotFoundError, match=f"1 partitions: \\[{partition}\\]"): - icf_mod.explode_finalise(icf_path) + vcf_mod.explode_finalise(icf_path) @pytest.mark.parametrize("partition", [0, 1, 2]) def test_explode_partition(self, tmp_path, partition): icf_path = tmp_path / "x.icf" - icf_mod.explode_init(icf_path, [self.data_path]) + vcf_mod.explode_init(icf_path, [self.data_path]) summary_file = icf_path / "wip" / f"p{partition}.json" assert not summary_file.exists() - icf_mod.explode_partition(icf_path, partition) + vcf_mod.explode_partition(icf_path, partition) assert summary_file.exists() def test_double_explode_partition(self, tmp_path): partition = 1 icf_path = tmp_path / "x.icf" - icf_mod.explode_init(icf_path, [self.data_path]) + vcf_mod.explode_init(icf_path, [self.data_path]) summary_file = icf_path / "wip" / f"p{partition}.json" assert not summary_file.exists() - icf_mod.explode_partition(icf_path, partition) + vcf_mod.explode_partition(icf_path, partition) with open(summary_file) as f: s1 = f.read() - icf_mod.explode_partition(icf_path, partition) + vcf_mod.explode_partition(icf_path, partition) with open(summary_file) as f: s2 = f.read() assert s1 == s2 @@ -178,19 +178,19 @@ def test_double_explode_partition(self, tmp_path): @pytest.mark.parametrize("partition", [-1, 3, 100]) def test_explode_partition_out_of_range(self, tmp_path, partition): icf_path = tmp_path / "x.icf" - icf_mod.explode_init(icf_path, [self.data_path]) + vcf_mod.explode_init(icf_path, [self.data_path]) with pytest.raises(ValueError, match="Partition index not in the valid range"): - icf_mod.explode_partition(icf_path, partition) + vcf_mod.explode_partition(icf_path, partition) def test_explode_same_file_twice(self, tmp_path): icf_path = tmp_path / "x.icf" with pytest.raises(ValueError, match="Duplicate path provided"): - icf_mod.explode(icf_path, [self.data_path, self.data_path]) + vcf_mod.explode(icf_path, [self.data_path, self.data_path]) def test_explode_same_data_twice(self, tmp_path): icf_path = tmp_path / "x.icf" with pytest.raises(ValueError, match="Overlapping VCF regions"): - icf_mod.explode(icf_path, [self.data_path, "tests/data/vcf/sample.bcf"]) + vcf_mod.explode(icf_path, [self.data_path, "tests/data/vcf/sample.bcf"]) class TestGeneratedFieldsExample: @@ -206,7 +206,7 @@ def icf(self, tmp_path_factory): # df = sgkit.load_dataset("tmp/fields.vcf.sg") # print(df["variant_IC2"]) # print(df["variant_IC2"].values) - return icf_mod.explode(out, [self.data_path]) + return vcf_mod.explode(out, [self.data_path]) @pytest.fixture(scope="class") def schema(self, icf): @@ -274,16 +274,16 @@ class TestInitProperties: def run_explode(self, tmp_path, **kwargs): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path], **kwargs) - return icf_mod.IntermediateColumnarFormat(icf_path) + vcf_mod.explode(icf_path, [self.data_path], **kwargs) + return vcf_mod.IntermediateColumnarFormat(icf_path) def run_dexplode(self, tmp_path, **kwargs): icf_path = tmp_path / "icf" - summary = icf_mod.explode_init(icf_path, [self.data_path], **kwargs) + summary = vcf_mod.explode_init(icf_path, [self.data_path], **kwargs) for j in range(summary.num_partitions): - icf_mod.explode_partition(icf_path, j) - icf_mod.explode_finalise(icf_path) - return icf_mod.IntermediateColumnarFormat(icf_path) + vcf_mod.explode_partition(icf_path, j) + vcf_mod.explode_finalise(icf_path) + return vcf_mod.IntermediateColumnarFormat(icf_path) @pytest.mark.parametrize( "compressor", @@ -299,12 +299,12 @@ def test_compressor_explode(self, tmp_path, compressor): def test_default_compressor_explode(self, tmp_path): icf = self.run_explode(tmp_path) - assert icf.metadata.compressor == icf_mod.ICF_DEFAULT_COMPRESSOR.get_config() + assert icf.metadata.compressor == vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config() assert icf.metadata.compressor["cname"] == "zstd" def test_default_compressor_dexplode(self, tmp_path): icf = self.run_dexplode(tmp_path) - assert icf.metadata.compressor == icf_mod.ICF_DEFAULT_COMPRESSOR.get_config() + assert icf.metadata.compressor == vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config() assert icf.metadata.compressor["cname"] == "zstd" @pytest.mark.parametrize( @@ -335,40 +335,40 @@ class TestCorruptionDetection: def test_missing_field(self, tmp_path): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) shutil.rmtree(icf_path / "POS") - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) with pytest.raises(FileNotFoundError): icf.fields["POS"].values # noqa B018 def test_missing_chunk_index(self, tmp_path): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) chunk_index_path = icf_path / "POS" / "p0" / "chunk_index" assert chunk_index_path.exists() chunk_index_path.unlink() - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) with pytest.raises(FileNotFoundError): icf.fields["POS"].values # noqa B018 def test_missing_chunk_file(self, tmp_path): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) chunk_file = icf_path / "POS" / "p0" / "2" assert chunk_file.exists() chunk_file.unlink() - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) with pytest.raises(FileNotFoundError): icf.fields["POS"].values # noqa B018 def test_empty_chunk_file(self, tmp_path): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) chunk_file = icf_path / "POS" / "p0" / "2" assert chunk_file.exists() with open(chunk_file, "w") as _: pass - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) with pytest.raises(RuntimeError, match="blosc"): icf.fields["POS"].values # noqa B018 @@ -376,21 +376,21 @@ def test_empty_chunk_file(self, tmp_path): @pytest.mark.parametrize("length", [10, 100, 185]) def test_truncated_chunk_file(self, tmp_path, length): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) chunk_file = icf_path / "POS" / "p0" / "2" with open(chunk_file, "rb") as f: buff = f.read(length) assert len(buff) == length with open(chunk_file, "wb") as f: f.write(buff) - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) # Either Blosc or pickling errors happen here with pytest.raises((RuntimeError, pickle.UnpicklingError)): icf.fields["POS"].values # noqa B018 def test_chunk_incorrect_length(self, tmp_path): icf_path = tmp_path / "icf" - icf_mod.explode(icf_path, [self.data_path]) + vcf_mod.explode(icf_path, [self.data_path]) chunk_file = icf_path / "POS" / "p0" / "2" compressor = numcodecs.Blosc(cname="zstd") with open(chunk_file, "rb") as f: @@ -401,7 +401,7 @@ def test_chunk_incorrect_length(self, tmp_path): pkl = pickle.dumps(x[0]) with open(chunk_file, "wb") as f: f.write(compressor.encode(pkl)) - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) with pytest.raises(ValueError, match="Corruption detected"): icf.fields["POS"].values # noqa B018 with pytest.raises(ValueError, match="Corruption detected"): @@ -414,7 +414,7 @@ class TestSlicing: @pytest.fixture(scope="class") def icf(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - return icf_mod.explode( + return vcf_mod.explode( out, [self.data_path], column_chunk_size=0.0125, worker_processes=0 ) @@ -496,8 +496,8 @@ def test_slice(self, icf, start, stop): ) def test_check_overlap(regions): partitions = [ - icf_mod.VcfPartition("", region=vcf_utils.Region(contig, start, end)) + vcf_mod.VcfPartition("", region=vcf_utils.Region(contig, start, end)) for contig, start, end in regions ] with pytest.raises(ValueError, match="Overlapping VCF regions"): - icf_mod.check_overlapping_partitions(partitions) + vcf_mod.check_overlapping_partitions(partitions) diff --git a/tests/test_simulated_data.py b/tests/test_simulated_data.py index 0188a799..ad8386f8 100644 --- a/tests/test_simulated_data.py +++ b/tests/test_simulated_data.py @@ -5,7 +5,7 @@ import pytest import sgkit as sg -from bio2zarr import icf +from bio2zarr import vcf as vcf_mod def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000): @@ -60,7 +60,7 @@ def test_ploidy(self, ploidy, tmp_path): ts = run_simulation(ploidy=ploidy) vcf_path = write_vcf(ts, tmp_path / "sim.vcf") out = tmp_path / "example.vcf.zarr" - icf.convert([vcf_path], out) + vcf_mod.convert([vcf_path], out) ds = sg.load_dataset(out) assert_ts_ds_equal(ts, ds, ploidy) @@ -81,7 +81,7 @@ def test_multi_contig(self, contig_ids, tmp_path): def validate_tss_vcf_list(self, contig_ids, tss, vcfs, tmp_path): out = tmp_path / "example.vcf.zarr" - icf.convert(vcfs, out) + vcf_mod.convert(vcfs, out) ds = sg.load_dataset(out).set_index( variants=("variant_contig", "variant_position") ) @@ -103,7 +103,7 @@ def test_indexed(self, indexed, tmp_path): ts = run_simulation(num_samples=12, seed=34) vcf_path = write_vcf(ts, tmp_path / "sim.vcf", indexed=indexed) out = tmp_path / "example.vcf.zarr" - icf.convert([vcf_path], out) + vcf_mod.convert([vcf_path], out) ds = sg.load_dataset(out) assert_ts_ds_equal(ts, ds) @@ -138,4 +138,4 @@ def test_different_lengths(self, tmp_path): vcfs.append(vcf_path) out = tmp_path / "example.vcf.zarr" with pytest.raises(ValueError, match="Incompatible contig definitions"): - icf.convert(vcfs, out) + vcf_mod.convert(vcfs, out) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index fde4c2af..2e7b93b6 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -9,7 +9,8 @@ import sgkit as sg import xarray.testing as xt -from bio2zarr import constants, icf, provenance, vcz_verification +from bio2zarr import constants, provenance, vcz_verification +from bio2zarr import vcf as vcf_mod def assert_dataset_equal(ds1, ds2, drop_vars=None): @@ -25,7 +26,7 @@ class TestSmallExample: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out) + vcf_mod.convert([self.data_path], out) return sg.load_dataset(out) def test_filters(self, ds): @@ -254,7 +255,7 @@ def test_call_HQ(self, ds): def test_no_genotypes(self, ds, tmp_path): path = "tests/data/vcf/sample_no_genotypes.vcf.gz" out = tmp_path / "example.vcf.zarr" - icf.convert([path], out) + vcf_mod.convert([path], out) ds2 = sg.load_dataset(out) assert len(ds2["sample_id"]) == 0 for field_name in ds: @@ -274,7 +275,7 @@ def test_chunk_size( self, ds, tmp_path, variants_chunk_size, samples_chunk_size, y_chunks, x_chunks ): out = tmp_path / "example.vcf.zarr" - icf.convert( + vcf_mod.convert( [self.data_path], out, variants_chunk_size=variants_chunk_size, @@ -316,23 +317,23 @@ def test_split(self, ds, tmp_path, worker_processes, rotate): # Rotate the list to check we are OK with different orderings files.rotate(rotate) assert len(files) == 3 - icf.convert(files, out, worker_processes=worker_processes) + vcf_mod.convert(files, out, worker_processes=worker_processes) ds2 = sg.load_dataset(out) xt.assert_equal(ds, ds2) @pytest.mark.parametrize("worker_processes", [0, 1, 2]) def test_full_pipeline(self, ds, tmp_path, worker_processes): exploded = tmp_path / "example.exploded" - icf.explode( + vcf_mod.explode( exploded, [self.data_path], worker_processes=worker_processes, ) schema = tmp_path / "schema.json" with open(schema, "w") as f: - icf.mkschema(exploded, f) + vcf_mod.mkschema(exploded, f) out = tmp_path / "example.zarr" - icf.encode(exploded, out, schema, worker_processes=worker_processes) + vcf_mod.encode(exploded, out, schema, worker_processes=worker_processes) ds2 = sg.load_dataset(out) xt.assert_equal(ds, ds2) @@ -342,9 +343,9 @@ def test_max_variant_chunks( self, ds, tmp_path, max_variant_chunks, variants_chunk_size ): exploded = tmp_path / "example.exploded" - icf.explode(exploded, [self.data_path]) + vcf_mod.explode(exploded, [self.data_path]) out = tmp_path / "example.zarr" - icf.encode( + vcf_mod.encode( exploded, out, variants_chunk_size=variants_chunk_size, @@ -360,7 +361,7 @@ def test_max_variant_chunks( @pytest.mark.parametrize("worker_processes", [0, 1, 2]) def test_worker_processes(self, ds, tmp_path, worker_processes): out = tmp_path / "example.vcf.zarr" - icf.convert( + vcf_mod.convert( [self.data_path], out, variants_chunk_size=3, @@ -372,12 +373,12 @@ def test_worker_processes(self, ds, tmp_path, worker_processes): def test_inspect(self, tmp_path): # TODO pretty weak test, we should be doing this better somewhere else out = tmp_path / "example.vcf.zarr" - icf.convert( + vcf_mod.convert( [self.data_path], out, variants_chunk_size=3, ) - data = icf.inspect(out) + data = vcf_mod.inspect(out) assert len(data) > 0 for row in data: assert "name" in row @@ -395,7 +396,7 @@ def test_missing_contig_vcf(self, ds, tmp_path, path): # but the ordering of contigs has been permuted. This seems to be the # sample across VCF and BCF with tabix and VSI indexes zarr_path = tmp_path / "zarr" - icf.convert([path], zarr_path) + vcf_mod.convert([path], zarr_path) ds2 = sg.load_dataset(zarr_path) contig_id_2 = ["19", "X", "20"] assert list(ds2["contig_id"].values) == contig_id_2 @@ -460,7 +461,7 @@ def test_region_index(self, ds): def test_small_example_all_missing_gts(self, ds, tmp_path_factory): data_path = "tests/data/vcf/sample_all_missing_gts.vcf.gz" out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([data_path], out, worker_processes=0) + vcf_mod.convert([data_path], out, worker_processes=0) ds2 = sg.load_dataset(out) assert_dataset_equal( @@ -487,7 +488,7 @@ class TestSmallExampleLocalAlleles: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out, local_alleles=True) + vcf_mod.convert([self.data_path], out, local_alleles=True) return sg.load_dataset(out) def test_call_LA(self, ds): @@ -534,7 +535,7 @@ class TestTriploidExample: def ds(self, tmp_path_factory, request): data_path = f"tests/data/vcf/{request.param}.vcf.gz" out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([data_path], out, local_alleles=False) + vcf_mod.convert([data_path], out, local_alleles=False) return sg.load_dataset(out) @pytest.mark.parametrize("name", ["triploid", "triploid2", "triploid3"]) @@ -544,7 +545,7 @@ def test_error_with_local_alleles(self, tmp_path_factory, name): with pytest.raises( ValueError, match=re.escape("Local alleles only supported on diploid") ): - icf.convert([data_path], out, local_alleles=True) + vcf_mod.convert([data_path], out, local_alleles=True) def test_ok_without_local_alleles(self, ds): nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]]) @@ -556,7 +557,7 @@ class TestWithGtHeaderNoGenotypes: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out, worker_processes=0) + vcf_mod.convert([self.data_path], out, worker_processes=0) return sg.load_dataset(out) def test_gts(self, ds): @@ -569,7 +570,7 @@ class TestChr22Example: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out, worker_processes=0) + vcf_mod.convert([self.data_path], out, worker_processes=0) return sg.load_dataset(out) def test_call_SB(self, ds): @@ -584,7 +585,7 @@ class Test1000G2020Example: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out, worker_processes=0) + vcf_mod.convert([self.data_path], out, worker_processes=0) return sg.load_dataset(out) def test_position(self, ds): @@ -695,7 +696,7 @@ class Test1000G2020ExampleLocalAlleles: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - icf.convert([self.data_path], out, worker_processes=0, local_alleles=True) + vcf_mod.convert([self.data_path], out, worker_processes=0, local_alleles=True) return sg.load_dataset(out) def test_position(self, ds): @@ -784,7 +785,7 @@ class Test1000G2020AnnotationsExample: def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.zarr" # TODO capture warnings from htslib here - icf.convert([self.data_path], out, worker_processes=0) + vcf_mod.convert([self.data_path], out, worker_processes=0) return sg.load_dataset(out) def test_position(self, ds): @@ -1024,7 +1025,7 @@ class TestGeneratedFieldsExample: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "vcf.zarr" - icf.convert([self.data_path], out) + vcf_mod.convert([self.data_path], out) return sg.load_dataset(out) def test_info_string1(self, ds): @@ -1066,14 +1067,14 @@ class TestSplitFileErrors: def test_entirely_incompatible(self, tmp_path): path = "tests/data/vcf/" with pytest.raises(ValueError, match="Incompatible"): - icf.explode_init( + vcf_mod.explode_init( tmp_path / "if", [path + "sample.vcf.gz", path + "1kg_2020_chrM.bcf"] ) def test_duplicate_paths(self, tmp_path): path = "tests/data/vcf/" with pytest.raises(ValueError, match="Duplicate"): - icf.explode_init(tmp_path / "if", [path + "sample.vcf.gz"] * 2) + vcf_mod.explode_init(tmp_path / "if", [path + "sample.vcf.gz"] * 2) @pytest.mark.parametrize( @@ -1093,7 +1094,7 @@ def test_duplicate_paths(self, tmp_path): def test_by_validating(name, tmp_path): path = f"tests/data/vcf/{name}" out = tmp_path / "test.zarr" - icf.convert([path], out, worker_processes=0) + vcf_mod.convert([path], out, worker_processes=0) vcz_verification.verify(path, out) @@ -1111,7 +1112,7 @@ def test_by_validating_split(source, suffix, files, tmp_path): source_path = f"tests/data/vcf/{source}" split_files = [f"{source_path}.{suffix}/{f}" for f in files] out = tmp_path / "test.zarr" - icf.convert(split_files, out, worker_processes=0) + vcf_mod.convert(split_files, out, worker_processes=0) vcz_verification.verify(source_path, out) @@ -1122,16 +1123,16 @@ def test_split_explode(tmp_path): "tests/data/vcf/sample.vcf.gz.3.split/X.vcf.gz", ] out = tmp_path / "test.explode" - work_summary = icf.explode_init(out, paths, target_num_partitions=15) + work_summary = vcf_mod.explode_init(out, paths, target_num_partitions=15) assert work_summary.num_partitions == 3 with pytest.raises(FileNotFoundError): - pcvcf = icf.IntermediateColumnarFormat(out) + pcvcf = vcf_mod.IntermediateColumnarFormat(out) for j in range(work_summary.num_partitions): - icf.explode_partition(out, j) - icf.explode_finalise(out) - pcvcf = icf.IntermediateColumnarFormat(out) + vcf_mod.explode_partition(out, j) + vcf_mod.explode_finalise(out) + pcvcf = vcf_mod.IntermediateColumnarFormat(out) summary_d = pcvcf.fields["POS"].vcf_field.summary.asdict() # The compressed size can vary with different numcodecs versions assert summary_d["compressed_size"] in [571, 573, 587] @@ -1143,7 +1144,7 @@ def test_split_explode(tmp_path): "max_value": 1235237, "min_value": 10, } - icf.encode(out, tmp_path / "test.zarr") + vcf_mod.encode(out, tmp_path / "test.zarr") vcz_verification.verify("tests/data/vcf/sample.vcf.gz", tmp_path / "test.zarr") @@ -1151,7 +1152,7 @@ def test_missing_filter(tmp_path): path = "tests/data/vcf/sample_missing_filter.vcf.gz" zarr_path = tmp_path / "zarr" with pytest.raises(ValueError, match="Filter 'q10' was not defined in the header"): - icf.convert([path], zarr_path) + vcf_mod.convert([path], zarr_path) class TestOutOfOrderFields: @@ -1162,7 +1163,7 @@ class TestOutOfOrderFields: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "ooo_example.vcf.zarr" - icf.convert([self.data_path1, self.data_path2], out) + vcf_mod.convert([self.data_path1, self.data_path2], out) return sg.load_dataset(out) def test_filters(self, ds): diff --git a/tests/test_vcf_hypothesis.py b/tests/test_vcf_hypothesis.py index 61d663ef..305a6c2e 100644 --- a/tests/test_vcf_hypothesis.py +++ b/tests/test_vcf_hypothesis.py @@ -4,7 +4,7 @@ from hypothesis import HealthCheck, given, note, settings from hypothesis_vcf import vcf -from bio2zarr import icf +from bio2zarr import vcf as vcf_mod # Make sure POS starts at 1, since CSI indexing doesn't seem to support zero-based @@ -31,4 +31,9 @@ def test_hypothesis_generated_vcf(tmp_path, vcf_string): pysam.tabix_index(str(path), preset="vcf", force=True, csi=True) # test that we can convert VCFs to Zarr without error - icf.convert([str(path) + ".gz"], zarr_path, icf_path=icf_path, worker_processes=0) + vcf_mod.convert( + [str(path) + ".gz"], + zarr_path, + icf_path=icf_path, + worker_processes=0, + ) diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 8522d0d3..fd33b1a8 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -9,7 +9,7 @@ import zarr from bio2zarr import core, vcz -from bio2zarr import icf as icf_mod +from bio2zarr import vcf as vcf_mod from bio2zarr.zarr_utils import zarr_v3 @@ -21,7 +21,7 @@ def vcf_file(): @pytest.fixture(scope="module") def icf_path(vcf_file, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - icf_mod.explode(out, [vcf_file]) + vcf_mod.explode(out, [vcf_file]) return out @@ -29,7 +29,7 @@ def icf_path(vcf_file, tmp_path_factory): def schema_path(icf_path, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.schema.json" with open(out, "w") as f: - icf_mod.mkschema(icf_path, f) + vcf_mod.mkschema(icf_path, f) return out @@ -46,7 +46,7 @@ def local_alleles_schema(icf_path, tmp_path_factory): # be much easier. out = tmp_path_factory.mktemp("data") / "example.schema.json" with open(out, "w") as f: - icf_mod.mkschema(icf_path, f, local_alleles=True) + vcf_mod.mkschema(icf_path, f, local_alleles=True) with open(out) as f: return vcz.VcfZarrSchema.fromjson(f.read()) @@ -54,7 +54,7 @@ def local_alleles_schema(icf_path, tmp_path_factory): @pytest.fixture(scope="module") def zarr_path(icf_path, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.zarr" - icf_mod.encode(icf_path, out) + vcf_mod.encode(icf_path, out) return out @@ -79,7 +79,7 @@ def test_parser(self, arg, expected): def test_not_enough_memory(self, tmp_path, icf_path, max_memory): zarr_path = tmp_path / "zarr" with pytest.raises(ValueError, match="Insufficient memory"): - icf_mod.encode(icf_path, zarr_path, max_memory=max_memory) + vcf_mod.encode(icf_path, zarr_path, max_memory=max_memory) @pytest.mark.parametrize("max_memory", ["315KiB", "500KiB"]) def test_not_enough_memory_for_two( @@ -87,7 +87,7 @@ def test_not_enough_memory_for_two( ): other_zarr_path = tmp_path / "zarr" with caplog.at_level("WARNING"): - icf_mod.encode( + vcf_mod.encode( icf_path, other_zarr_path, max_memory=max_memory, @@ -118,12 +118,12 @@ def test_exploded_metadata_mismatch(self, tmpdir, icf_path, version): with pytest.raises( ValueError, match="Intermediate columnar metadata format version mismatch" ): - icf_mod.IcfMetadata.fromdict(d) + vcf_mod.IcfMetadata.fromdict(d) @pytest.mark.parametrize("version", ["0.0", "1.0", "xxxxx", 0.1]) def test_encode_metadata_mismatch(self, tmpdir, icf_path, version): zarr_path = tmpdir / "zarr" - icf_mod.encode_init(icf_path, zarr_path, 1) + vcf_mod.encode_init(icf_path, zarr_path, 1) with open(zarr_path / "wip" / "metadata.json") as f: d = json.load(f) d["format_version"] = version @@ -138,14 +138,14 @@ class TestEncodeDimensionSeparator: @pytest.mark.parametrize("dimension_separator", [None, "/"]) def test_directories(self, tmp_path, icf_path, dimension_separator): zarr_path = tmp_path / "zarr" - icf_mod.encode(icf_path, zarr_path, dimension_separator=dimension_separator) + vcf_mod.encode(icf_path, zarr_path, dimension_separator=dimension_separator) # print(zarr_path) chunk_file = zarr_path / "call_genotype" / "0" / "0" / "0" assert chunk_file.exists() def test_files(self, tmp_path, icf_path): zarr_path = tmp_path / "zarr" - icf_mod.encode(icf_path, zarr_path, dimension_separator=".") + vcf_mod.encode(icf_path, zarr_path, dimension_separator=".") chunk_file = zarr_path / "call_genotype" / "0.0.0" assert chunk_file.exists() @@ -153,7 +153,7 @@ def test_files(self, tmp_path, icf_path): def test_bad_value(self, tmp_path, icf_path, dimension_separator): zarr_path = tmp_path / "zarr" with pytest.raises(ValueError, match="dimension_separator must be either"): - icf_mod.encode(icf_path, zarr_path, dimension_separator=dimension_separator) + vcf_mod.encode(icf_path, zarr_path, dimension_separator=dimension_separator) class TestSchemaChunkSize: @@ -166,7 +166,7 @@ class TestSchemaChunkSize: ], ) def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema( variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, @@ -184,7 +184,7 @@ def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size): assert found > 0 def test_default_chunk_size(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() assert schema.dimensions["samples"].chunk_size == 10_000 assert schema.dimensions["variants"].chunk_size == 1000 @@ -196,23 +196,23 @@ def assert_json_round_trip(self, schema): assert schema == schema2 def test_generated_no_changes(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) self.assert_json_round_trip(icf.generate_schema()) def test_generated_no_fields(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() schema.fields.clear() self.assert_json_round_trip(schema) def test_generated_change_dtype(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() schema.field_map()["variant_position"].dtype = "i8" self.assert_json_round_trip(schema) def test_generated_change_compressor(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() schema.field_map()["variant_position"].compressor = {"cname": "FAKE"} self.assert_json_round_trip(schema) @@ -224,7 +224,7 @@ class TestSchemaEncode: ) def test_codec(self, tmp_path, icf_path, cname, clevel, shuffle): zarr_path = tmp_path / "zarr" - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() field_changed = False for array_spec in schema.fields: @@ -237,7 +237,7 @@ def test_codec(self, tmp_path, icf_path, cname, clevel, shuffle): schema_path = tmp_path / "schema" with open(schema_path, "w") as f: f.write(schema.asjson()) - icf_mod.encode(icf_path, zarr_path, schema_path=schema_path) + vcf_mod.encode(icf_path, zarr_path, schema_path=schema_path) root = zarr.open(zarr_path) for array_spec in schema.fields: a = root[array_spec.name] @@ -249,26 +249,26 @@ def test_codec(self, tmp_path, icf_path, cname, clevel, shuffle): @pytest.mark.parametrize("dtype", ["i4", "i8"]) def test_genotype_dtype(self, tmp_path, icf_path, dtype): zarr_path = tmp_path / "zarr" - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() schema.field_map()["call_genotype"].dtype = dtype schema_path = tmp_path / "schema" with open(schema_path, "w") as f: f.write(schema.asjson()) - icf_mod.encode(icf_path, zarr_path, schema_path=schema_path) + vcf_mod.encode(icf_path, zarr_path, schema_path=schema_path) root = zarr.open(zarr_path) assert root["call_genotype"].dtype == dtype @pytest.mark.parametrize("dtype", ["i4", "i8"]) def test_region_index_dtype(self, tmp_path, icf_path, dtype): zarr_path = tmp_path / "zarr" - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() schema.field_map()["variant_position"].dtype = dtype schema_path = tmp_path / "schema" with open(schema_path, "w") as f: f.write(schema.asjson()) - icf_mod.encode(icf_path, zarr_path, schema_path=schema_path) + vcf_mod.encode(icf_path, zarr_path, schema_path=schema_path) root = zarr.open(zarr_path) assert root["variant_position"].dtype == dtype assert root["region_index"].dtype == dtype @@ -301,7 +301,7 @@ def test_example_schema(self, schema, field, value): assert field.get_chunk_nbytes(schema) == value def test_chunk_size(self, icf_path, tmp_path): - store = icf_mod.IntermediateColumnarFormat(icf_path) + store = vcf_mod.IntermediateColumnarFormat(icf_path) schema = store.generate_schema(samples_chunk_size=2, variants_chunk_size=3) fields = schema.field_map() assert fields["call_genotype"].get_chunk_nbytes(schema) == 3 * 2 * 2 @@ -489,7 +489,7 @@ class TestVcfZarrWriterExample: def test_init_paths(self, icf_path, tmp_path): zarr_path = tmp_path / "x.zarr" assert not zarr_path.exists() - summary = icf_mod.encode_init(icf_path, zarr_path, 7, variants_chunk_size=3) + summary = vcf_mod.encode_init(icf_path, zarr_path, 7, variants_chunk_size=3) assert summary.num_partitions == 3 assert zarr_path.exists() wip_path = zarr_path / "wip" @@ -509,57 +509,57 @@ def test_init_paths(self, icf_path, tmp_path): def test_finalise_paths(self, icf_path, tmp_path): zarr_path = tmp_path / "x.zarr" assert not zarr_path.exists() - summary = icf_mod.encode_init(icf_path, zarr_path, 7, variants_chunk_size=3) + summary = vcf_mod.encode_init(icf_path, zarr_path, 7, variants_chunk_size=3) wip_path = zarr_path / "wip" assert wip_path.exists() for j in range(summary.num_partitions): - icf_mod.encode_partition(zarr_path, j) + vcf_mod.encode_partition(zarr_path, j) assert (wip_path / "partitions" / f"p{j}").exists() - icf_mod.encode_finalise(zarr_path) + vcf_mod.encode_finalise(zarr_path) assert zarr_path.exists() assert not wip_path.exists() def test_finalise_no_partitions_fails(self, icf_path, tmp_path): zarr_path = tmp_path / "x.zarr" - icf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) + vcf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) with pytest.raises( FileNotFoundError, match="Partitions not encoded: \\[0, 1, 2\\]" ): - icf_mod.encode_finalise(zarr_path) + vcf_mod.encode_finalise(zarr_path) @pytest.mark.parametrize("partition", [0, 1, 2]) def test_finalise_missing_partition_fails(self, icf_path, tmp_path, partition): zarr_path = tmp_path / "x.zarr" - icf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) + vcf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) for j in range(3): if j != partition: - icf_mod.encode_partition(zarr_path, j) + vcf_mod.encode_partition(zarr_path, j) with pytest.raises( FileNotFoundError, match=f"Partitions not encoded: \\[{partition}\\]" ): - icf_mod.encode_finalise(zarr_path) + vcf_mod.encode_finalise(zarr_path) @pytest.mark.parametrize("partition", [0, 1, 2]) def test_encode_partition(self, icf_path, tmp_path, partition): zarr_path = tmp_path / "x.zarr" - icf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) + vcf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) partition_path = zarr_path / "wip" / "partitions" / f"p{partition}" assert not partition_path.exists() - icf_mod.encode_partition(zarr_path, partition) + vcf_mod.encode_partition(zarr_path, partition) assert partition_path.exists() def test_double_encode_partition(self, icf_path, tmp_path, caplog): partition = 1 zarr_path = tmp_path / "x.zarr" - icf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) + vcf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) partition_path = zarr_path / "wip" / "partitions" / f"p{partition}" assert not partition_path.exists() - icf_mod.encode_partition(zarr_path, partition) + vcf_mod.encode_partition(zarr_path, partition) assert partition_path.exists() size = core.du(partition_path) assert size > 0 with caplog.at_level("WARNING"): - icf_mod.encode_partition(zarr_path, partition) + vcf_mod.encode_partition(zarr_path, partition) assert "Removing existing partition at" in caplog.text assert partition_path.exists() assert core.du(partition_path) == size @@ -567,9 +567,9 @@ def test_double_encode_partition(self, icf_path, tmp_path, caplog): @pytest.mark.parametrize("partition", [-1, 3, 100]) def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition): zarr_path = tmp_path / "x.zarr" - icf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) + vcf_mod.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) with pytest.raises(ValueError, match="Partition index not in the valid range"): - icf_mod.encode_partition(zarr_path, partition) + vcf_mod.encode_partition(zarr_path, partition) class TestClobberFixedFields: @@ -616,7 +616,7 @@ def test_variant_fields(self, tmp_path, field): vcf_file = tmp_path / "test.vcf" self.generate_vcf(vcf_file, info_field=field) with pytest.raises(ValueError, match=f"INFO field name.*{field}"): - icf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"]) + vcf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"]) @pytest.mark.parametrize( "field", @@ -630,12 +630,12 @@ def test_call_fields(self, tmp_path, field): vcf_file = tmp_path / "test.vcf" self.generate_vcf(vcf_file, format_field=field) with pytest.raises(ValueError, match=f"FORMAT field name.*{field}"): - icf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"]) + vcf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"]) class TestInspect: def test_icf(self, icf_path): - df = pd.DataFrame(icf_mod.inspect(icf_path)) + df = pd.DataFrame(vcf_mod.inspect(icf_path)) assert sorted(list(df)) == sorted( [ "name", @@ -677,7 +677,7 @@ def test_icf(self, icf_path): ) def test_vcz(self, zarr_path): - df = pd.DataFrame(icf_mod.inspect(zarr_path)) + df = pd.DataFrame(vcf_mod.inspect(zarr_path)) cols = [ "name", "dtype", @@ -727,12 +727,12 @@ def test_vcz(self, zarr_path): @pytest.mark.parametrize("bad_path", ["/NO_WAY", "TTTTTT"]) def test_no_such_path(self, bad_path): with pytest.raises(ValueError, match=f"Path not found: {bad_path}"): - icf_mod.inspect(bad_path) + vcf_mod.inspect(bad_path) @pytest.mark.parametrize("path", ["./", "tests/data/vcf/sample.vcf.gz"]) def test_unknown_format(self, path): with pytest.raises(ValueError, match="not in ICF or VCF Zarr format"): - icf_mod.inspect(path) + vcf_mod.inspect(path) class TestSchemaDefaults: @@ -785,7 +785,7 @@ def test_defaults_with_encode(self, icf_path, tmp_path): zarr_path = tmp_path / "zarr" # Create schema with custom defaults - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema() # Set custom defaults @@ -800,7 +800,7 @@ def test_defaults_with_encode(self, icf_path, tmp_path): f.write(schema.asjson()) # Encode using the schema with custom defaults - icf_mod.encode(icf_path, zarr_path, schema_path=schema_path) + vcf_mod.encode(icf_path, zarr_path, schema_path=schema_path) # Check that arrays use the default compressor when not overridden root = zarr.open(zarr_path) @@ -845,7 +845,7 @@ def test_fromdict(self): assert dim2.chunk_size == 25 def test_json_serialization(self, icf_path): - icf = icf_mod.IntermediateColumnarFormat(icf_path) + icf = vcf_mod.IntermediateColumnarFormat(icf_path) schema = icf.generate_schema(variants_chunk_size=42, samples_chunk_size=24) schema_json = schema.asjson() @@ -865,7 +865,7 @@ class TestDimensionSizes: @pytest.fixture(scope="class") def icf(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - return icf_mod.explode(out, [self.data_path]) + return vcf_mod.explode(out, [self.data_path]) @pytest.fixture(scope="class") def schema(self, icf):