@@ -182,6 +182,47 @@ def variant_chunk_nbytes(self):
182
182
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
183
183
184
184
185
def convert_local_allele_field_types(fields):
    """
    Return the specified fields extended with the LAA field, converting any
    supported localisable fields to their L* counterpart.

    Note that we currently support only two ALT alleles per sample, and so the
    dimensions of these fields are fixed by that requirement. Later versions may
    use summary data stored in the ICF to make different choices, if information
    about subsequent alleles (not in the actual genotype calls) should also be
    stored.

    The ``call_PL`` spec, if present, is modified in place: it is renamed to
    ``call_LPL``, its ``vcf_field`` is cleared and its last axis is resized
    to 3. The input list itself is left unchanged; a new list containing the
    original specs plus the ``call_LAA`` spec is returned.

    Raises KeyError if there is no ``call_genotype`` field, and ValueError if
    the last axis of the genotype field is not of size 2.
    """
    fields_by_name = {field.name: field for field in fields}
    gt = fields_by_name.get("call_genotype")
    if gt is None:
        # Keep the exception type of the plain dictionary lookup, but say
        # what is actually wrong rather than just echoing the missing key.
        raise KeyError(
            "Local alleles require a call_genotype field, but none was found"
        )
    if gt.shape[-1] != 2:
        raise ValueError("Local alleles only supported on diploid data")
    # Leading axes of the genotype array, i.e. everything except the last axis.
    shape = gt.shape[:-1]
    chunks = gt.chunks[:-1]

    laa = ZarrArraySpec.new(
        vcf_field=None,
        name="call_LAA",
        dtype="i1",
        shape=gt.shape,
        chunks=gt.chunks,
        dimensions=gt.dimensions,  # FIXME
        description=(
            "1-based indices into ALT, indicating which alleles"
            " are relevant (local) for the current sample"
        ),
    )
    pl = fields_by_name.get("call_PL")
    if pl is not None:
        # NOTE: this mutates the spec object that the caller passed in.
        pl.name = "call_LPL"
        pl.vcf_field = None
        pl.shape = (*shape, 3)
        pl.chunks = (*chunks, 3)
        pl.description += " (local-alleles)"
        # TODO fix dimensions
    return [*fields, laa]
224
+
225
+
185
226
@dataclasses .dataclass
186
227
class VcfZarrSchema (core .JsonDataclass ):
187
228
format_version : str
@@ -232,14 +273,18 @@ def fromjson(s):
232
273
return VcfZarrSchema .fromdict (json .loads (s ))
233
274
234
275
@staticmethod
235
- def generate (icf , variants_chunk_size = None , samples_chunk_size = None ):
276
+ def generate (
277
+ icf , variants_chunk_size = None , samples_chunk_size = None , local_alleles = None
278
+ ):
236
279
m = icf .num_records
237
280
n = icf .num_samples
238
281
# FIXME
239
282
if samples_chunk_size is None :
240
283
samples_chunk_size = 1000
241
284
if variants_chunk_size is None :
242
285
variants_chunk_size = 10_000
286
+ if local_alleles is None :
287
+ local_alleles = False
243
288
logger .info (
244
289
f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
245
290
)
@@ -366,6 +411,9 @@ def fixed_field_spec(
366
411
)
367
412
)
368
413
414
+ if local_alleles :
415
+ array_specs = convert_local_allele_field_types (array_specs )
416
+
369
417
return VcfZarrSchema (
370
418
format_version = ZARR_SCHEMA_FORMAT_VERSION ,
371
419
samples_chunk_size = samples_chunk_size ,
@@ -1027,12 +1075,20 @@ def encode_all_partitions(
1027
1075
pwm .submit (self .encode_partition , partition_index )
1028
1076
1029
1077
1030
def mkschema(
    if_path,
    out,
    *,
    variants_chunk_size=None,
    samples_chunk_size=None,
    local_alleles=None,
):
    """
    Generate a schema for the intermediate columnar format at ``if_path``
    and write its JSON representation to ``out``.

    The chunk sizes and the ``local_alleles`` flag are passed through
    unchanged to ``VcfZarrSchema.generate``. ``out`` only needs to provide
    a ``write`` method accepting the value returned by ``asjson()``.
    """
    generate_options = {
        "variants_chunk_size": variants_chunk_size,
        "samples_chunk_size": samples_chunk_size,
        "local_alleles": local_alleles,
    }
    icf_store = icf.IntermediateColumnarFormat(if_path)
    schema = VcfZarrSchema.generate(icf_store, **generate_options)
    out.write(schema.asjson())
1038
1094
0 commit comments