Skip to content

Commit a7dce49

Browse files
Add standard_dimensions for VCZ
1 parent 21d0224 commit a7dce49

File tree

3 files changed

+65
-21
lines changed

3 files changed

+65
-21
lines changed

bio2zarr/vcf.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,40 +1057,41 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
10571057
def generate_schema(
10581058
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
10591059
):
1060-
m = self.num_records
1061-
n = self.num_samples
10621060
if local_alleles is None:
10631061
local_alleles = False
10641062

10651063
max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
1066-
dimensions = {
1067-
"variants": vcz.VcfZarrDimension(
1068-
size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE
1069-
),
1070-
"samples": vcz.VcfZarrDimension(
1071-
size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE
1072-
),
1073-
# ploidy and genotypes added conditionally below
1074-
"alleles": vcz.VcfZarrDimension(size=max_alleles),
1075-
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
1076-
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
1077-
}
10781064

10791065
# Add ploidy and genotypes dimensions only when needed
10801066
max_genotypes = 0
10811067
for field in self.metadata.format_fields:
10821068
if field.vcf_number == "G":
10831069
max_genotypes = max(max_genotypes, field.summary.max_number)
1070+
1071+
ploidy = None
1072+
genotypes_size = None
10841073
if self.gt_field is not None:
10851074
ploidy = max(self.gt_field.summary.max_number - 1, 1)
1086-
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1087-
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
1088-
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1075+
# NOTE: it's not clear why we're computing this, when we must have had
1076+
# at least one number=G field to require it anyway?
1077+
genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
1078+
# assert max_genotypes == genotypes_size
10891079
else:
10901080
if max_genotypes > 0:
10911081
# there is no GT field, but there is at least one Number=G field,
10921082
# so need to define genotypes dimension
1093-
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1083+
genotypes_size = max_genotypes
1084+
1085+
dimensions = vcz.standard_dimensions(
1086+
variants_size=self.num_records,
1087+
variants_chunk_size=variants_chunk_size,
1088+
samples_size=self.num_samples,
1089+
samples_chunk_size=samples_chunk_size,
1090+
alleles_size=max_alleles,
1091+
filters_size=self.metadata.num_filters,
1092+
ploidy_size=ploidy,
1093+
genotypes_size=genotypes_size,
1094+
)
10941095

10951096
schema_instance = vcz.VcfZarrSchema(
10961097
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
@@ -1173,7 +1174,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11731174
continue
11741175
array_specs.append(spec_from_field(field))
11751176

1176-
if self.gt_field is not None and n > 0:
1177+
if self.gt_field is not None and self.num_samples > 0:
11771178
array_specs.append(
11781179
vcz.ZarrArraySpec(
11791180
name="call_genotype_phased",

bio2zarr/vcz.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,51 @@ def fromdict(cls, d):
121121
chunk_size=d.get("chunk_size", d["size"]),
122122
)
123123

124+
@classmethod
125+
def unchunked(cls, size):
126+
return cls(size, size)
127+
128+
129+
def standard_dimensions(
130+
*,
131+
variants_size,
132+
samples_size,
133+
variants_chunk_size=None,
134+
samples_chunk_size=None,
135+
alleles_size=None,
136+
filters_size=None,
137+
ploidy_size=None,
138+
genotypes_size=None,
139+
):
140+
"""
141+
Returns a dictionary mapping dimension names to definition for the standard
142+
fields in a VCF.
143+
"""
144+
if variants_chunk_size is None:
145+
variants_chunk_size = max(1, min(variants_size, DEFAULT_VARIANT_CHUNK_SIZE))
146+
if samples_chunk_size is None:
147+
samples_chunk_size = max(1, min(samples_size, DEFAULT_SAMPLE_CHUNK_SIZE))
148+
149+
dimensions = {
150+
"variants": VcfZarrDimension(variants_size, variants_chunk_size),
151+
"samples": VcfZarrDimension(samples_size, samples_chunk_size),
152+
}
153+
154+
if alleles_size is not None:
155+
dimensions["alleles"] = VcfZarrDimension.unchunked(alleles_size)
156+
dimensions["alt_alleles"] = VcfZarrDimension.unchunked(alleles_size - 1)
157+
158+
if filters_size is not None:
159+
dimensions["filters"] = VcfZarrDimension.unchunked(filters_size)
160+
161+
if ploidy_size is not None:
162+
dimensions["ploidy"] = VcfZarrDimension.unchunked(ploidy_size)
163+
164+
if genotypes_size is not None:
165+
dimensions["genotypes"] = VcfZarrDimension.unchunked(genotypes_size)
166+
167+
return dimensions
168+
124169

125170
@dataclasses.dataclass
126171
class ZarrArraySpec:
@@ -1117,7 +1162,6 @@ def __init__(self, path):
11171162
def create_index(self):
11181163
"""Create an index to support efficient region queries."""
11191164
root = zarr.open_group(store=self.path, mode="r+")
1120-
print(list(root.keys()))
11211165
if (
11221166
"variant_contig" not in root
11231167
or "variant_position" not in root

tests/test_vcz.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,6 @@ def test_chunk_not_too_large(self, schema, size):
329329
field = schema.field_map()["variant_H2"]
330330
schema.dimensions[field.dimensions[-1]].size = size
331331
schema.dimensions[field.dimensions[-1]].chunk_size = size
332-
print(schema.dimensions)
333332
schema.validate()
334333

335334

0 commit comments

Comments
 (0)