Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# 0.1.2 2024-XX-XX

- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)

- Add chunksize options to mkschema (issue:294)

Breaking changes

- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
and numpy >= 2. Existing ICFs will need to be recreated.

- Add chunksize options to mkschema (issue:294)

# 0.1.1 2024-06-19

Expand Down
5 changes: 2 additions & 3 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,10 @@ def fromjson(s):
def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
m = icf.num_records
n = icf.num_samples
# FIXME
if samples_chunk_size is None:
samples_chunk_size = 1000
samples_chunk_size = 10_000
if variants_chunk_size is None:
variants_chunk_size = 10_000
variants_chunk_size = 1000
logger.info(
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
)
Expand Down
27 changes: 19 additions & 8 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ def test_not_enough_memory_for_two(
other_zarr_path = tmp_path / "zarr"
with caplog.at_level("WARNING"):
vcf2zarr.encode(
icf_path, other_zarr_path, max_memory=max_memory, worker_processes=2
icf_path,
other_zarr_path,
max_memory=max_memory,
worker_processes=2,
samples_chunk_size=1000,
variants_chunk_size=10_000,
)
assert "Limiting number of workers to 1 to keep within" in caplog.text
ds1 = sg.load_dataset(zarr_path)
Expand Down Expand Up @@ -164,6 +169,12 @@ def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size):
found += 1
assert found > 0

def test_default_chunk_size(self, icf_path):
    """Check the chunk sizes of a schema generated with no overrides.

    Generates a schema from the ICF at ``icf_path`` without passing any
    chunk-size arguments and asserts the resulting values: 10,000 for the
    samples chunk size and 1000 for the variants chunk size.
    """
    default_schema = vcf2zarr.VcfZarrSchema.generate(
        vcf2zarr.IntermediateColumnarFormat(icf_path)
    )
    assert default_schema.samples_chunk_size == 10_000
    assert default_schema.variants_chunk_size == 1000


class TestSchemaJsonRoundTrip:
def assert_json_round_trip(self, schema):
Expand Down Expand Up @@ -297,8 +308,8 @@ def test_format_version(self, schema):
assert schema.format_version == vcz_mod.ZARR_SCHEMA_FORMAT_VERSION

def test_chunk_size(self, schema):
assert schema.samples_chunk_size == 1000
assert schema.variants_chunk_size == 10000
assert schema.samples_chunk_size == 10000
assert schema.variants_chunk_size == 1000

def test_samples(self, schema):
assert schema.asdict()["samples"] == [
Expand All @@ -322,7 +333,7 @@ def test_variant_contig(self, schema):
"name": "variant_contig",
"dtype": "i1",
"shape": (9,),
"chunks": (10000,),
"chunks": (1000,),
"dimensions": ("variants",),
"description": "An identifier from the reference genome or an "
"angle-bracketed ID string pointing to a contig in the assembly file",
Expand All @@ -342,7 +353,7 @@ def test_call_genotype(self, schema):
"name": "call_genotype",
"dtype": "i1",
"shape": (9, 3, 2),
"chunks": (10000, 1000, 2),
"chunks": (1000, 10000, 2),
"dimensions": ("variants", "samples", "ploidy"),
"description": "",
"vcf_field": None,
Expand All @@ -361,7 +372,7 @@ def test_call_genotype_mask(self, schema):
"name": "call_genotype_mask",
"dtype": "bool",
"shape": (9, 3, 2),
"chunks": (10000, 1000, 2),
"chunks": (1000, 10000, 2),
"dimensions": ("variants", "samples", "ploidy"),
"description": "",
"vcf_field": None,
Expand All @@ -380,7 +391,7 @@ def test_call_genotype_phased(self, schema):
"name": "call_genotype_mask",
"dtype": "bool",
"shape": (9, 3, 2),
"chunks": (10000, 1000, 2),
"chunks": (1000, 10000, 2),
"dimensions": ("variants", "samples", "ploidy"),
"description": "",
"vcf_field": None,
Expand All @@ -399,7 +410,7 @@ def test_call_GQ(self, schema):
"name": "call_GQ",
"dtype": "i1",
"shape": (9, 3),
"chunks": (10000, 1000),
"chunks": (1000, 10000),
"dimensions": ("variants", "samples"),
"description": "Genotype Quality",
"vcf_field": "FORMAT/GQ",
Expand Down
Loading