Skip to content

Commit 9b5e967

Browse files
Basic initial local-alleles schema implementation
1 parent 883a37e commit 9b5e967

File tree

2 files changed

+61
-3
lines changed

2 files changed

+61
-3
lines changed

bio2zarr/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,8 @@ def inspect(path, verbose):
340340
@icf_path
341341
@variants_chunk_size
342342
@samples_chunk_size
343-
def mkschema(icf_path, variants_chunk_size, samples_chunk_size):
343+
@local_alleles
344+
def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
344345
"""
345346
Generate a schema for zarr encoding
346347
"""
@@ -350,6 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size):
350351
stream,
351352
variants_chunk_size=variants_chunk_size,
352353
samples_chunk_size=samples_chunk_size,
354+
local_alleles=local_alleles,
353355
)
354356

355357

bio2zarr/vcf2zarr/vcz.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,47 @@ def variant_chunk_nbytes(self):
182182
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
183183

184184

185+
def convert_local_allele_field_types(fields):
186+
"""
187+
Update the specified list of fields to include the LAA field, and to convert
188+
any supported localisable fields to the L* counterpart.
189+
190+
Note that we currently support only two ALT alleles per sample, and so the
191+
dimensions of these fields are fixed by that requirement. Later versions may
192+
use summry data storted in the ICF to make different choices, if information
193+
about subsequent alleles (not in the actual genotype calls) should also be
194+
stored.
195+
"""
196+
fields_by_name = {field.name: field for field in fields}
197+
gt = fields_by_name["call_genotype"]
198+
if gt.shape[-1] != 2:
199+
raise ValueError("Local alleles only supported on diploid data")
200+
shape = gt.shape[:-1]
201+
chunks = gt.chunks[:-1]
202+
203+
laa = ZarrArraySpec.new(
204+
vcf_field=None,
205+
name="call_LAA",
206+
dtype="i1",
207+
shape=gt.shape,
208+
chunks=gt.chunks,
209+
dimensions=gt.dimensions, # FIXME
210+
description=(
211+
"1-based indices into ALT, indicating which alleles"
212+
" are relevant (local) for the current sample"
213+
),
214+
)
215+
pl = fields_by_name.get("call_PL", None)
216+
if pl is not None:
217+
pl.name = "call_LPL"
218+
pl.vcf_field = None
219+
pl.shape = (*shape, 3)
220+
pl.chunks = (*chunks, 3)
221+
pl.description += " (local-alleles)"
222+
# TODO fix dimensions
223+
return [*fields, laa]
224+
225+
185226
@dataclasses.dataclass
186227
class VcfZarrSchema(core.JsonDataclass):
187228
format_version: str
@@ -232,14 +273,18 @@ def fromjson(s):
232273
return VcfZarrSchema.fromdict(json.loads(s))
233274

234275
@staticmethod
235-
def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
276+
def generate(
277+
icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
278+
):
236279
m = icf.num_records
237280
n = icf.num_samples
238281
# FIXME
239282
if samples_chunk_size is None:
240283
samples_chunk_size = 1000
241284
if variants_chunk_size is None:
242285
variants_chunk_size = 10_000
286+
if local_alleles is None:
287+
local_alleles = False
243288
logger.info(
244289
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
245290
)
@@ -366,6 +411,9 @@ def fixed_field_spec(
366411
)
367412
)
368413

414+
if local_alleles:
415+
array_specs = convert_local_allele_field_types(array_specs)
416+
369417
return VcfZarrSchema(
370418
format_version=ZARR_SCHEMA_FORMAT_VERSION,
371419
samples_chunk_size=samples_chunk_size,
@@ -1027,12 +1075,20 @@ def encode_all_partitions(
10271075
pwm.submit(self.encode_partition, partition_index)
10281076

10291077

1030-
def mkschema(if_path, out, *, variants_chunk_size=None, samples_chunk_size=None):
1078+
def mkschema(
1079+
if_path,
1080+
out,
1081+
*,
1082+
variants_chunk_size=None,
1083+
samples_chunk_size=None,
1084+
local_alleles=None,
1085+
):
10311086
store = icf.IntermediateColumnarFormat(if_path)
10321087
spec = VcfZarrSchema.generate(
10331088
store,
10341089
variants_chunk_size=variants_chunk_size,
10351090
samples_chunk_size=samples_chunk_size,
1091+
local_alleles=local_alleles,
10361092
)
10371093
out.write(spec.asjson())
10381094

0 commit comments

Comments
 (0)