Skip to content

Commit b338e64

Browse files
Add standard_dimensions for VCZ and fix chunk size
Sets default chunk size to min(size, default) for large dimensions. Closes #368
1 parent 21d0224 commit b338e64

File tree

3 files changed

+69
-25
lines changed

3 files changed

+69
-25
lines changed

bio2zarr/vcf.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,40 +1057,41 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
10571057
def generate_schema(
10581058
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
10591059
):
1060-
m = self.num_records
1061-
n = self.num_samples
10621060
if local_alleles is None:
10631061
local_alleles = False
10641062

10651063
max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
1066-
dimensions = {
1067-
"variants": vcz.VcfZarrDimension(
1068-
size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE
1069-
),
1070-
"samples": vcz.VcfZarrDimension(
1071-
size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE
1072-
),
1073-
# ploidy and genotypes added conditionally below
1074-
"alleles": vcz.VcfZarrDimension(size=max_alleles),
1075-
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
1076-
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
1077-
}
10781064

10791065
# Add ploidy and genotypes dimensions only when needed
10801066
max_genotypes = 0
10811067
for field in self.metadata.format_fields:
10821068
if field.vcf_number == "G":
10831069
max_genotypes = max(max_genotypes, field.summary.max_number)
1070+
1071+
ploidy = None
1072+
genotypes_size = None
10841073
if self.gt_field is not None:
10851074
ploidy = max(self.gt_field.summary.max_number - 1, 1)
1086-
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1087-
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
1088-
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1075+
# NOTE: it's not clear why we're computing this, when we must have had
1076+
# at least one number=G field to require it anyway?
1077+
genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
1078+
# assert max_genotypes == genotypes_size
10891079
else:
10901080
if max_genotypes > 0:
10911081
# there is no GT field, but there is at least one Number=G field,
10921082
# so need to define genotypes dimension
1093-
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1083+
genotypes_size = max_genotypes
1084+
1085+
dimensions = vcz.standard_dimensions(
1086+
variants_size=self.num_records,
1087+
variants_chunk_size=variants_chunk_size,
1088+
samples_size=self.num_samples,
1089+
samples_chunk_size=samples_chunk_size,
1090+
alleles_size=max_alleles,
1091+
filters_size=self.metadata.num_filters,
1092+
ploidy_size=ploidy,
1093+
genotypes_size=genotypes_size,
1094+
)
10941095

10951096
schema_instance = vcz.VcfZarrSchema(
10961097
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
@@ -1173,7 +1174,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11731174
continue
11741175
array_specs.append(spec_from_field(field))
11751176

1176-
if self.gt_field is not None and n > 0:
1177+
if self.gt_field is not None and self.num_samples > 0:
11771178
array_specs.append(
11781179
vcz.ZarrArraySpec(
11791180
name="call_genotype_phased",

bio2zarr/vcz.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,51 @@ def fromdict(cls, d):
121121
chunk_size=d.get("chunk_size", d["size"]),
122122
)
123123

124+
@classmethod
125+
def unchunked(cls, size):
126+
return cls(size, size)
127+
128+
129+
def standard_dimensions(
130+
*,
131+
variants_size,
132+
samples_size,
133+
variants_chunk_size=None,
134+
samples_chunk_size=None,
135+
alleles_size=None,
136+
filters_size=None,
137+
ploidy_size=None,
138+
genotypes_size=None,
139+
):
140+
"""
141+
Returns a dictionary mapping dimension names to definition for the standard
142+
fields in a VCF.
143+
"""
144+
if variants_chunk_size is None:
145+
variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
146+
if samples_chunk_size is None:
147+
samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
148+
149+
dimensions = {
150+
"variants": VcfZarrDimension(variants_size, variants_chunk_size),
151+
"samples": VcfZarrDimension(samples_size, samples_chunk_size),
152+
}
153+
154+
if alleles_size is not None:
155+
dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
156+
dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
157+
158+
if filters_size is not None:
159+
dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
160+
161+
if ploidy_size is not None:
162+
dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
163+
164+
if genotypes_size is not None:
165+
dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
166+
167+
return dimensions
168+
124169

125170
@dataclasses.dataclass
126171
class ZarrArraySpec:
@@ -1117,7 +1162,6 @@ def __init__(self, path):
11171162
def create_index(self):
11181163
"""Create an index to support efficient region queries."""
11191164
root = zarr.open_group(store=self.path, mode="r+")
1120-
print(list(root.keys()))
11211165
if (
11221166
"variant_contig" not in root
11231167
or "variant_position" not in root

tests/test_vcz.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,8 @@ def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size):
187187
def test_default_chunk_size(self, icf_path):
188188
icf = vcf_mod.IntermediateColumnarFormat(icf_path)
189189
schema = icf.generate_schema()
190-
assert schema.dimensions["samples"].chunk_size == 10_000
191-
assert schema.dimensions["variants"].chunk_size == 1000
190+
assert schema.dimensions["samples"].chunk_size == 3
191+
assert schema.dimensions["variants"].chunk_size == 9
192192

193193

194194
class TestSchemaJsonRoundTrip:
@@ -329,7 +329,6 @@ def test_chunk_not_too_large(self, schema, size):
329329
field = schema.field_map()["variant_H2"]
330330
schema.dimensions[field.dimensions[-1]].size = size
331331
schema.dimensions[field.dimensions[-1]].chunk_size = size
332-
print(schema.dimensions)
333332
schema.validate()
334333

335334

@@ -338,8 +337,8 @@ def test_format_version(self, schema):
338337
assert schema.format_version == vcz.ZARR_SCHEMA_FORMAT_VERSION
339338

340339
def test_chunk_size(self, schema):
341-
assert schema.dimensions["samples"].chunk_size == 10000
342-
assert schema.dimensions["variants"].chunk_size == 1000
340+
assert schema.dimensions["samples"].chunk_size == 3
341+
assert schema.dimensions["variants"].chunk_size == 9
343342

344343
def test_variant_contig(self, schema):
345344
assert get_field_dict(schema, "variant_contig") == {

0 commit comments

Comments
 (0)