diff --git a/CHANGELOG.md b/CHANGELOG.md index 947e8f9a..bb74fac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,14 @@ # 0.1.2 2024-XX-XX +- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300) + +- Add chunksize options to mkschema (issue:294) + Breaking changes - ICF metadata format version bumped to ensure long-term compatility between numpy 1.26.x and numpy >= 2. Existing ICFs will need to be recreated. -- Add chunksize options to mkschema (issue:294) # 0.1.1 2024-06-19 diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 3094b233..0cf25ca5 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -235,11 +235,10 @@ def fromjson(s): def generate(icf, variants_chunk_size=None, samples_chunk_size=None): m = icf.num_records n = icf.num_samples - # FIXME if samples_chunk_size is None: - samples_chunk_size = 1000 + samples_chunk_size = 10_000 if variants_chunk_size is None: - variants_chunk_size = 10_000 + variants_chunk_size = 1000 logger.info( f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}" ) diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 6787461f..789fd59a 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -74,7 +74,12 @@ def test_not_enough_memory_for_two( other_zarr_path = tmp_path / "zarr" with caplog.at_level("WARNING"): vcf2zarr.encode( - icf_path, other_zarr_path, max_memory=max_memory, worker_processes=2 + icf_path, + other_zarr_path, + max_memory=max_memory, + worker_processes=2, + samples_chunk_size=1000, + variants_chunk_size=10_000, ) assert "Limiting number of workers to 1 to keep within" in caplog.text ds1 = sg.load_dataset(zarr_path) @@ -164,6 +169,12 @@ def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size): found += 1 assert found > 0 + def test_default_chunk_size(self, icf_path): + icf = vcf2zarr.IntermediateColumnarFormat(icf_path) + schema = vcf2zarr.VcfZarrSchema.generate(icf) + assert schema.samples_chunk_size == 10_000 + assert schema.variants_chunk_size == 1000 + class TestSchemaJsonRoundTrip: def assert_json_round_trip(self, schema): @@ -297,8 +308,8 @@ def test_format_version(self, schema): assert schema.format_version == vcz_mod.ZARR_SCHEMA_FORMAT_VERSION def test_chunk_size(self, schema): - assert schema.samples_chunk_size == 1000 - assert schema.variants_chunk_size == 10000 + assert schema.samples_chunk_size == 10000 + assert schema.variants_chunk_size == 1000 def test_samples(self, schema): assert schema.asdict()["samples"] == [ @@ -322,7 +333,7 @@ def test_variant_contig(self, schema): "name": "variant_contig", "dtype": "i1", "shape": (9,), - "chunks": (10000,), + "chunks": (1000,), "dimensions": ("variants",), "description": "An identifier from the reference genome or an " "angle-bracketed ID string pointing to a contig in the assembly file", @@ -342,7 +353,7 @@ def test_call_genotype(self, schema): "name": "call_genotype", "dtype": "i1", "shape": (9, 3, 2), - "chunks": (10000, 1000, 2), + "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", "vcf_field": None, @@ -361,7 +372,7 @@ def test_call_genotype_mask(self, schema): "name": "call_genotype_mask", "dtype": "bool", "shape": (9, 3, 2), - "chunks": (10000, 1000, 2), + "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", "vcf_field": None, @@ -380,7 +391,7 @@ def test_call_genotype_phased(self, schema): "name": "call_genotype_mask", "dtype": "bool", "shape": (9, 3, 2), - "chunks": (10000, 1000, 2), + "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", "vcf_field": None, @@ -399,7 +410,7 @@ def test_call_GQ(self, schema): "name": "call_GQ", "dtype": "i1", "shape": (9, 3), - "chunks": (10000, 1000), + "chunks": (1000, 10000), "dimensions": ("variants", "samples"), "description": "Genotype Quality", "vcf_field": "FORMAT/GQ",