Skip to content

Commit b9590aa

Browse files
Transpose default chunk sizes
Closes #300
1 parent 883a37e commit b9590aa

File tree

3 files changed

+25
-12
lines changed

3 files changed

+25
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# 0.1.2 2024-XX-XX
22

3+
- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)
4+
5+
- Add chunksize options to mkschema (issue:294)
6+
37
Breaking changes
48

59
- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
610
and numpy >= 2. Existing ICFs will need to be recreated.
711

8-
- Add chunksize options to mkschema (issue:294)
912

1013
# 0.1.1 2024-06-19
1114

bio2zarr/vcf2zarr/vcz.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,11 +235,10 @@ def fromjson(s):
235235
def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
236236
m = icf.num_records
237237
n = icf.num_samples
238-
# FIXME
239238
if samples_chunk_size is None:
240-
samples_chunk_size = 1000
239+
samples_chunk_size = 10_000
241240
if variants_chunk_size is None:
242-
variants_chunk_size = 10_000
241+
variants_chunk_size = 1000
243242
logger.info(
244243
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
245244
)

tests/test_vcz.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,12 @@ def test_not_enough_memory_for_two(
7474
other_zarr_path = tmp_path / "zarr"
7575
with caplog.at_level("WARNING"):
7676
vcf2zarr.encode(
77-
icf_path, other_zarr_path, max_memory=max_memory, worker_processes=2
77+
icf_path,
78+
other_zarr_path,
79+
max_memory=max_memory,
80+
worker_processes=2,
81+
samples_chunk_size=1000,
82+
variants_chunk_size=10_000,
7883
)
7984
assert "Limiting number of workers to 1 to keep within" in caplog.text
8085
ds1 = sg.load_dataset(zarr_path)
@@ -164,6 +169,12 @@ def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size):
164169
found += 1
165170
assert found > 0
166171

172+
def test_default_chunk_size(self, icf_path):
173+
icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
174+
schema = vcf2zarr.VcfZarrSchema.generate(icf)
175+
assert schema.samples_chunk_size == 10_000
176+
assert schema.variants_chunk_size == 1000
177+
167178

168179
class TestSchemaJsonRoundTrip:
169180
def assert_json_round_trip(self, schema):
@@ -297,8 +308,8 @@ def test_format_version(self, schema):
297308
assert schema.format_version == vcz_mod.ZARR_SCHEMA_FORMAT_VERSION
298309

299310
def test_chunk_size(self, schema):
300-
assert schema.samples_chunk_size == 1000
301-
assert schema.variants_chunk_size == 10000
311+
assert schema.samples_chunk_size == 10000
312+
assert schema.variants_chunk_size == 1000
302313

303314
def test_samples(self, schema):
304315
assert schema.asdict()["samples"] == [
@@ -322,7 +333,7 @@ def test_variant_contig(self, schema):
322333
"name": "variant_contig",
323334
"dtype": "i1",
324335
"shape": (9,),
325-
"chunks": (10000,),
336+
"chunks": (1000,),
326337
"dimensions": ("variants",),
327338
"description": "An identifier from the reference genome or an "
328339
"angle-bracketed ID string pointing to a contig in the assembly file",
@@ -342,7 +353,7 @@ def test_call_genotype(self, schema):
342353
"name": "call_genotype",
343354
"dtype": "i1",
344355
"shape": (9, 3, 2),
345-
"chunks": (10000, 1000, 2),
356+
"chunks": (1000, 10000, 2),
346357
"dimensions": ("variants", "samples", "ploidy"),
347358
"description": "",
348359
"vcf_field": None,
@@ -361,7 +372,7 @@ def test_call_genotype_mask(self, schema):
361372
"name": "call_genotype_mask",
362373
"dtype": "bool",
363374
"shape": (9, 3, 2),
364-
"chunks": (10000, 1000, 2),
375+
"chunks": (1000, 10000, 2),
365376
"dimensions": ("variants", "samples", "ploidy"),
366377
"description": "",
367378
"vcf_field": None,
@@ -380,7 +391,7 @@ def test_call_genotype_phased(self, schema):
380391
"name": "call_genotype_mask",
381392
"dtype": "bool",
382393
"shape": (9, 3, 2),
383-
"chunks": (10000, 1000, 2),
394+
"chunks": (1000, 10000, 2),
384395
"dimensions": ("variants", "samples", "ploidy"),
385396
"description": "",
386397
"vcf_field": None,
@@ -399,7 +410,7 @@ def test_call_GQ(self, schema):
399410
"name": "call_GQ",
400411
"dtype": "i1",
401412
"shape": (9, 3),
402-
"chunks": (10000, 1000),
413+
"chunks": (1000, 10000),
403414
"dimensions": ("variants", "samples"),
404415
"description": "Genotype Quality",
405416
"vcf_field": "FORMAT/GQ",

0 commit comments

Comments
 (0)