Skip to content

Commit 6bbce8b

Browse files
Merge pull request #113 from jeromekelleher/array-niceties
Add named dimensions for vcf numbers
2 parents aa03df0 + 5f67d20 commit 6bbce8b

File tree

3 files changed

+49
-19
lines changed

3 files changed

+49
-19
lines changed

bio2zarr/vcf.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1388,7 +1388,16 @@ def fixed_field_spec(
13881388
# TODO make an option to add in the empty extra dimension
13891389
if field.summary.max_number > 1:
13901390
shape.append(field.summary.max_number)
1391-
dimensions.append(field.name)
1391+
# TODO: we should really verify that these named dimensions are actually
1392+
# correct, e.g. that max_number matches the size the shared name implies.
1393+
if field.vcf_number == "R":
1394+
dimensions.append("alleles")
1395+
elif field.vcf_number == "A":
1396+
dimensions.append("alt_alleles")
1397+
elif field.vcf_number == "G":
1398+
dimensions.append("genotypes")
1399+
else:
1400+
dimensions.append(f"{field.category}_{field.name}_dim")
13921401
variable_name = prefix + field.name
13931402
colspec = ZarrColumnSpec(
13941403
vcf_field=field.full_name,

tests/test_icf.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -195,30 +195,34 @@ def schema(self, icf):
195195
return vcf.VcfZarrSchema.generate(icf)
196196

197197
@pytest.mark.parametrize(
198-
("name", "dtype", "shape"),
198+
("name", "dtype", "shape", "dimensions"),
199199
[
200-
("variant_II1", "i1", (208,)),
201-
("variant_II2", "i2", (208, 2)),
202-
("variant_IIA", "i2", (208, 2)),
203-
("variant_IIR", "i2", (208, 3)),
204-
("variant_IID", "i2", (208, 7)),
205-
("variant_IF1", "f4", (208,)),
206-
("variant_IF2", "f4", (208, 2)),
207-
("variant_IFA", "f4", (208, 2)),
208-
("variant_IFR", "f4", (208, 3)),
209-
("variant_IFD", "f4", (208, 9)),
210-
("variant_IC1", "U1", (208,)),
211-
("variant_IC2", "U1", (208, 2)),
212-
("variant_IS1", "O", (208,)),
213-
("variant_IS2", "O", (208, 2)),
214-
("call_FS2", "O", (208, 2, 2)),
215-
("call_FC2", "U1", (208, 2, 2)),
200+
("variant_II1", "i1", (208,), ("variants",)),
201+
("variant_II2", "i2", (208, 2), ("variants", "INFO_II2_dim")),
202+
("variant_IIA", "i2", (208, 2), ("variants", "alt_alleles")),
203+
("variant_IIR", "i2", (208, 3), ("variants", "alleles")),
204+
("variant_IID", "i2", (208, 7), ("variants", "INFO_IID_dim")),
205+
("variant_IF1", "f4", (208,), ("variants",)),
206+
("variant_IF2", "f4", (208, 2), ("variants", "INFO_IF2_dim")),
207+
("variant_IFA", "f4", (208, 2), ("variants", "alt_alleles")),
208+
("variant_IFR", "f4", (208, 3), ("variants", "alleles")),
209+
("variant_IFD", "f4", (208, 9), ("variants", "INFO_IFD_dim")),
210+
("variant_IC1", "U1", (208,), ("variants",)),
211+
("variant_IC2", "U1", (208, 2), ("variants", "INFO_IC2_dim")),
212+
("variant_IS1", "O", (208,), ("variants",)),
213+
("variant_IS2", "O", (208, 2), ("variants", "INFO_IS2_dim")),
214+
("call_FS2", "O", (208, 2, 2), ("variants", "samples", "FORMAT_FS2_dim")),
215+
("call_FC2", "U1", (208, 2, 2), ("variants", "samples", "FORMAT_FC2_dim")),
216+
("call_FIG", "i2", (208, 2, 6), ("variants", "samples", "genotypes")),
217+
("call_FIA", "i2", (208, 2, 2), ("variants", "samples", "alt_alleles")),
218+
("call_FIR", "i2", (208, 2, 3), ("variants", "samples", "alleles")),
216219
],
217220
)
218-
def test_info_schemas(self, schema, name, dtype, shape):
221+
def test_info_schemas(self, schema, name, dtype, shape, dimensions):
219222
v = schema.columns[name]
220223
assert v.dtype == dtype
221224
assert tuple(v.shape) == shape
225+
assert v.dimensions == dimensions
222226

223227
def test_info_string1(self, icf):
224228
non_missing = [v for v in icf["INFO/IS1"].values if v is not None]

tests/test_vcf_examples.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,23 @@ def test_missing_contig_vcf(self, ds, tmp_path, path):
384384
drop_vars = ["contig_id", "variant_contig"]
385385
xt.assert_equal(ds_c1.drop_vars(drop_vars), ds_c2.drop_vars(drop_vars))
386386

387+
def test_vcf_dimensions(self, ds):
388+
assert ds.call_genotype.dims == ("variants", "samples", "ploidy")
389+
assert ds.call_genotype_mask.dims == ("variants", "samples", "ploidy")
390+
assert ds.call_genotype_phased.dims == ("variants", "samples")
391+
assert ds.call_HQ.dims == ("variants", "samples", "FORMAT_HQ_dim")
392+
assert ds.call_DP.dims == ("variants", "samples")
393+
assert ds.call_GQ.dims == ("variants", "samples")
394+
assert ds.variant_AA.dims == ("variants",)
395+
assert ds.variant_NS.dims == ("variants",)
396+
assert ds.variant_AN.dims == ("variants",)
397+
assert ds.variant_AC.dims == ("variants", "INFO_AC_dim")
398+
assert ds.variant_AF.dims == ("variants", "INFO_AF_dim")
399+
assert ds.variant_DP.dims == ("variants", )
400+
assert ds.variant_DB.dims == ("variants", )
401+
assert ds.variant_H2.dims == ("variants", )
402+
assert ds.variant_position.dims == ("variants", )
403+
387404

388405
class Test1000G2020Example:
389406
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

0 commit comments

Comments
 (0)