Skip to content

Commit 8c9c1a2

Browse files
committed
Move convert_local_allele_field_types
1 parent ca394d3 commit 8c9c1a2

File tree

2 files changed

+56
-58
lines changed

2 files changed

+56
-58
lines changed

bio2zarr/vcf2zarr/icf.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,62 @@ def __exit__(self, exc_type, exc_val, exc_tb):
822822
return False
823823

824824

825+
def convert_local_allele_field_types(fields):
826+
"""
827+
Update the specified list of fields to include the LAA field, and to convert
828+
any supported localisable fields to the L* counterpart.
829+
830+
Note that we currently support only two ALT alleles per sample, and so the
831+
dimensions of these fields are fixed by that requirement. Later versions may
832+
use summary data storted in the ICF to make different choices, if information
833+
about subsequent alleles (not in the actual genotype calls) should also be
834+
stored.
835+
"""
836+
fields_by_name = {field.name: field for field in fields}
837+
gt = fields_by_name["call_genotype"]
838+
if gt.shape[-1] != 2:
839+
raise ValueError("Local alleles only supported on diploid data")
840+
841+
# TODO check if LA is already in here
842+
843+
shape = gt.shape[:-1]
844+
chunks = gt.chunks[:-1]
845+
dimensions = gt.dimensions[:-1]
846+
847+
la = schema.ZarrArraySpec.new(
848+
vcf_field=None,
849+
name="call_LA",
850+
dtype="i1",
851+
shape=gt.shape,
852+
chunks=gt.chunks,
853+
dimensions=(*dimensions, "local_alleles"),
854+
description=(
855+
"0-based indices into REF+ALT, indicating which alleles"
856+
" are relevant (local) for the current sample"
857+
),
858+
)
859+
ad = fields_by_name.get("call_AD", None)
860+
if ad is not None:
861+
# TODO check if call_LAD is in the list already
862+
ad.name = "call_LAD"
863+
ad.vcf_field = None
864+
ad.shape = (*shape, 2)
865+
ad.chunks = (*chunks, 2)
866+
ad.dimensions = (*dimensions, "local_alleles")
867+
ad.description += " (local-alleles)"
868+
869+
pl = fields_by_name.get("call_PL", None)
870+
if pl is not None:
871+
# TODO check if call_LPL is in the list already
872+
pl.name = "call_LPL"
873+
pl.vcf_field = None
874+
pl.shape = (*shape, 3)
875+
pl.chunks = (*chunks, 3)
876+
pl.description += " (local-alleles)"
877+
pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
878+
return [*fields, la]
879+
880+
825881
class IntermediateColumnarFormat(collections.abc.Mapping):
826882
def __init__(self, path):
827883
self.path = pathlib.Path(path)
@@ -1110,8 +1166,6 @@ def fixed_field_spec(
11101166
)
11111167

11121168
if local_alleles:
1113-
from bio2zarr.vcf2zarr.vcz import convert_local_allele_field_types
1114-
11151169
array_specs = convert_local_allele_field_types(array_specs)
11161170

11171171
return schema.VcfZarrSchema(

bio2zarr/vcf2zarr/vcz.py

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -24,62 +24,6 @@ def inspect(path):
2424
return obj.summary_table()
2525

2626

27-
def convert_local_allele_field_types(fields):
28-
"""
29-
Update the specified list of fields to include the LAA field, and to convert
30-
any supported localisable fields to the L* counterpart.
31-
32-
Note that we currently support only two ALT alleles per sample, and so the
33-
dimensions of these fields are fixed by that requirement. Later versions may
34-
use summary data storted in the ICF to make different choices, if information
35-
about subsequent alleles (not in the actual genotype calls) should also be
36-
stored.
37-
"""
38-
fields_by_name = {field.name: field for field in fields}
39-
gt = fields_by_name["call_genotype"]
40-
if gt.shape[-1] != 2:
41-
raise ValueError("Local alleles only supported on diploid data")
42-
43-
# TODO check if LA is already in here
44-
45-
shape = gt.shape[:-1]
46-
chunks = gt.chunks[:-1]
47-
dimensions = gt.dimensions[:-1]
48-
49-
la = schema.ZarrArraySpec.new(
50-
vcf_field=None,
51-
name="call_LA",
52-
dtype="i1",
53-
shape=gt.shape,
54-
chunks=gt.chunks,
55-
dimensions=(*dimensions, "local_alleles"),
56-
description=(
57-
"0-based indices into REF+ALT, indicating which alleles"
58-
" are relevant (local) for the current sample"
59-
),
60-
)
61-
ad = fields_by_name.get("call_AD", None)
62-
if ad is not None:
63-
# TODO check if call_LAD is in the list already
64-
ad.name = "call_LAD"
65-
ad.vcf_field = None
66-
ad.shape = (*shape, 2)
67-
ad.chunks = (*chunks, 2)
68-
ad.dimensions = (*dimensions, "local_alleles")
69-
ad.description += " (local-alleles)"
70-
71-
pl = fields_by_name.get("call_PL", None)
72-
if pl is not None:
73-
# TODO check if call_LPL is in the list already
74-
pl.name = "call_LPL"
75-
pl.vcf_field = None
76-
pl.shape = (*shape, 3)
77-
pl.chunks = (*chunks, 3)
78-
pl.description += " (local-alleles)"
79-
pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1])
80-
return [*fields, la]
81-
82-
8327
def mkschema(
8428
if_path,
8529
out,

0 commit comments

Comments
 (0)