Skip to content

Commit 8f435e1

Browse files
committed
Add genotypes dimension to schema
1 parent 635cac1 commit 8f435e1

File tree

2 files changed

+29
-10
lines changed

2 files changed

+29
-10
lines changed

bio2zarr/icf.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,12 +1050,31 @@ def generate_schema(
10501050
"samples": vcz.VcfZarrDimension(
10511051
size=n, chunk_size=samples_chunk_size or 10000
10521052
),
1053-
# ploidy added conditionally below
1053+
# ploidy and genotypes added conditionally below
10541054
"alleles": vcz.VcfZarrDimension(size=max_alleles),
10551055
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
10561056
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
10571057
}
10581058

1059+
# Add ploidy and genotypes dimensions only when needed
1060+
gt_field = None
1061+
max_genotypes = 0
1062+
for field in self.metadata.format_fields:
1063+
if field.name == "GT":
1064+
gt_field = field
1065+
elif field.vcf_number == "G":
1066+
max_genotypes = max(max_genotypes, field.summary.max_number)
1067+
if gt_field is not None:
1068+
ploidy = max(gt_field.summary.max_number - 1, 1)
1069+
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1070+
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
1071+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1072+
else:
1073+
if max_genotypes > 0:
1074+
# there is no GT field, but there is at least one Number=G field,
1075+
# so need to define genotypes dimension
1076+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1077+
10591078
schema_instance = vcz.VcfZarrSchema(
10601079
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
10611080
dimensions=dimensions,
@@ -1128,18 +1147,12 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11281147
[spec_from_field(field) for field in self.metadata.info_fields]
11291148
)
11301149

1131-
gt_field = None
11321150
for field in self.metadata.format_fields:
11331151
if field.name == "GT":
1134-
gt_field = field
11351152
continue
11361153
array_specs.append(spec_from_field(field))
11371154

11381155
if gt_field is not None and n > 0:
1139-
ploidy = max(gt_field.summary.max_number - 1, 1)
1140-
# Add ploidy dimension only when needed
1141-
schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1142-
11431156
array_specs.append(
11441157
vcz.ZarrArraySpec(
11451158
name="call_genotype_phased",

bio2zarr/vcz.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,15 @@ def from_field(
190190
)
191191
if max_alt_alleles > 0:
192192
dimensions.append("alt_alleles")
193-
elif max_number > 0 and vcf_field.vcf_number == "G":
194-
# TODO: need max_genotypes
195-
dimensions.append("genotypes")
193+
elif vcf_field.vcf_number == "G":
194+
max_genotypes = schema.dimensions["genotypes"].size
195+
if max_number > max_genotypes:
196+
raise ValueError(
197+
f"Max number of values {max_number} exceeds max genotypes "
198+
f"{max_genotypes} for {vcf_field.full_name}"
199+
)
200+
if max_genotypes > 0:
201+
dimensions.append("genotypes")
196202
elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
197203
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
198204
if dimensions[-1] not in schema.dimensions:

0 commit comments

Comments
 (0)