@@ -111,9 +111,6 @@ def full_name(self):
111
111
return self .name
112
112
return f"{ self .category } /{ self .name } "
113
113
114
- # TODO add method here to choose a good set compressor and
115
- # filters default here for this field.
116
-
117
114
def smallest_dtype (self ):
118
115
"""
119
116
Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ def smallest_dtype(self):
123
120
if self .vcf_type == "Float" :
124
121
ret = "f4"
125
122
elif self .vcf_type == "Integer" :
126
- dtype = "i4"
127
- for a_dtype in [ "i1" , "i2" ]:
128
- info = np . iinfo ( a_dtype )
129
- if info . min <= s . min_value and s . max_value <= info . max :
130
- dtype = a_dtype
131
- break
132
- ret = dtype
123
+ if not math . isfinite ( s . max_value ):
124
+ # All missing values; use i1. Note we should have some API to
125
+ # check more explicitly for missingness:
126
+ # https://github.com/sgkit-dev/bio2zarr/issues/131
127
+ ret = "i1"
128
+ else :
129
+ ret = core . min_int_dtype ( s . min_value , s . max_value )
133
130
elif self .vcf_type == "Flag" :
134
131
ret = "bool"
135
132
elif self .vcf_type == "Character" :
@@ -152,6 +149,10 @@ class VcfPartition:
152
149
cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
153
150
)
154
151
152
+ # TODO refactor this to have embedded Contig, Filters and Samples
153
+ # dataclasses, so that more information can be retained and the
154
+ # format stays forward compatible.
155
+
155
156
156
157
@dataclasses .dataclass
157
158
class IcfMetadata :
@@ -183,6 +184,14 @@ def format_fields(self):
183
184
fields .append (field )
184
185
return fields
185
186
187
@property
def num_contigs(self):
    """
    Returns the number of contigs, i.e. the length of ``contig_names``.
    """
    contigs = self.contig_names
    return len(contigs)
190
+
191
@property
def num_filters(self):
    """
    Returns the number of filters, i.e. the length of ``filters``.
    """
    known_filters = self.filters
    return len(known_filters)
194
+
186
195
@property
def num_records(self):
    """
    Returns the total number of records, obtained by adding up the
    per-contig values stored in ``contig_record_counts``.
    """
    total = 0
    for count in self.contig_record_counts.values():
        total = total + count
    return total
@@ -1242,6 +1251,50 @@ def new(**kwargs):
1242
1251
spec ._choose_compressor_settings ()
1243
1252
return spec
1244
1253
1254
@staticmethod
def from_field(
    vcf_field,
    *,
    num_variants,
    num_samples,
    variants_chunk_size,
    samples_chunk_size,
    variable_name=None,
):
    """
    Build the ZarrColumnSpec describing the array for the specified
    VCF field.

    The array always has a leading "variants" dimension of length
    num_variants, chunked by variants_chunk_size. Fields in the FORMAT
    category additionally get a "samples" dimension of length
    num_samples, chunked by samples_chunk_size. If the field summary
    reports max_number > 1, a trailing dimension of that length is
    added, named after the field's vcf_number ("R", "A" or "G") or,
    failing that, after the field's category and name.

    If variable_name is None the name defaults to the field name
    prefixed with "call_" for FORMAT fields and "variant_" otherwise.
    The dtype is taken from vcf_field.smallest_dtype().
    """
    is_format = vcf_field.category == "FORMAT"

    shape = [num_variants]
    chunks = [variants_chunk_size]
    dimensions = ["variants"]
    if is_format:
        shape += [num_samples]
        chunks += [samples_chunk_size]
        dimensions += ["samples"]

    if variable_name is None:
        prefix = "call_" if is_format else "variant_"
        variable_name = prefix + vcf_field.name

    # TODO make an option to add in the empty extra dimension
    max_number = vcf_field.summary.max_number
    if max_number > 1:
        # NOTE(review): no matching entry is added to chunks for this
        # trailing dimension - presumably handled downstream; confirm.
        shape.append(max_number)
        # TODO we should really be checking this to see if the named dimensions
        # are actually correct.
        number_code = vcf_field.vcf_number
        named_dimensions = (
            ("R", "alleles"),
            ("A", "alt_alleles"),
            ("G", "genotypes"),
        )
        for code, label in named_dimensions:
            if number_code == code:
                dimensions.append(label)
                break
        else:
            dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")

    return ZarrColumnSpec.new(
        vcf_field=vcf_field.full_name,
        name=variable_name,
        dtype=vcf_field.smallest_dtype(),
        shape=shape,
        chunks=chunks,
        dimensions=dimensions,
        description=vcf_field.description,
    )
1297
+
1245
1298
def _choose_compressor_settings (self ):
1246
1299
"""
1247
1300
Choose compressor and filter settings based on the size and
@@ -1250,15 +1303,19 @@ def _choose_compressor_settings(self):
1250
1303
1251
1304
See https://github.com/pystatgen/bio2zarr/discussions/74
1252
1305
"""
1253
- dt = np .dtype (self .dtype )
1254
1306
# Default is to not shuffle, because autoshuffle isn't recognised
1255
1307
# by many Zarr implementations, and shuffling can lead to worse
1256
1308
# performance in some cases anyway. Turning on shuffle should be a
1257
1309
# deliberate choice.
1258
1310
shuffle = numcodecs .Blosc .NOSHUFFLE
1259
- if dt .itemsize == 1 :
1260
- # Any 1 byte field gets BITSHUFFLE by default
1311
+ if self .name == "call_genotype" and self .dtype == "i1" :
1312
+ # call_genotype gets BITSHUFFLE by default as it gets
1313
+ # significantly better compression (at a cost of slower
1314
+ # decoding)
1315
+ shuffle = numcodecs .Blosc .BITSHUFFLE
1316
+ elif self .dtype == "bool" :
1261
1317
shuffle = numcodecs .Blosc .BITSHUFFLE
1318
+
1262
1319
self .compressor ["shuffle" ] = shuffle
1263
1320
1264
1321
@@ -1313,6 +1370,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
1313
1370
f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
1314
1371
)
1315
1372
1373
def spec_from_field(field, variable_name=None):
    """
    Build the column spec for the specified field via
    ZarrColumnSpec.from_field, supplying the array and chunk sizes
    (m, n, variants_chunk_size, samples_chunk_size) from the
    enclosing scope.
    """
    sizing = {
        "num_variants": m,
        "num_samples": n,
        "variants_chunk_size": variants_chunk_size,
        "samples_chunk_size": samples_chunk_size,
    }
    return ZarrColumnSpec.from_field(
        field, variable_name=variable_name, **sizing
    )
1382
+
1316
1383
def fixed_field_spec (
1317
1384
name , dtype , vcf_field = None , shape = (m ,), dimensions = ("variants" ,)
1318
1385
):
@@ -1328,95 +1395,56 @@ def fixed_field_spec(
1328
1395
1329
1396
alt_col = icf .columns ["ALT" ]
1330
1397
max_alleles = alt_col .vcf_field .summary .max_number + 1
1331
- num_filters = len (icf .metadata .filters )
1332
1398
1333
- # # FIXME get dtype from lookup table
1334
1399
colspecs = [
1335
1400
fixed_field_spec (
1336
1401
name = "variant_contig" ,
1337
- dtype = "i2" , # FIXME
1402
+ dtype = core . min_int_dtype ( 0 , icf . metadata . num_contigs ),
1338
1403
),
1339
1404
fixed_field_spec (
1340
1405
name = "variant_filter" ,
1341
1406
dtype = "bool" ,
1342
- shape = (m , num_filters ),
1407
+ shape = (m , icf . metadata . num_filters ),
1343
1408
dimensions = ["variants" , "filters" ],
1344
1409
),
1345
1410
fixed_field_spec (
1346
1411
name = "variant_allele" ,
1347
1412
dtype = "str" ,
1348
- shape = [ m , max_alleles ] ,
1413
+ shape = ( m , max_alleles ) ,
1349
1414
dimensions = ["variants" , "alleles" ],
1350
1415
),
1351
1416
fixed_field_spec (
1352
- vcf_field = "POS" ,
1353
- name = "variant_position" ,
1354
- dtype = "i4" ,
1355
- ),
1356
- fixed_field_spec (
1357
- vcf_field = None ,
1358
1417
name = "variant_id" ,
1359
1418
dtype = "str" ,
1360
1419
),
1361
1420
fixed_field_spec (
1362
- vcf_field = None ,
1363
1421
name = "variant_id_mask" ,
1364
1422
dtype = "bool" ,
1365
1423
),
1366
- fixed_field_spec (
1367
- vcf_field = "QUAL" ,
1368
- name = "variant_quality" ,
1369
- dtype = "f4" ,
1370
- ),
1371
1424
]
1425
+ name_map = {field .full_name : field for field in icf .metadata .fields }
1426
+
1427
+ # Only two of the fixed fields have a direct one-to-one mapping.
1428
+ colspecs .extend (
1429
+ [
1430
+ spec_from_field (name_map ["QUAL" ], variable_name = "variant_quality" ),
1431
+ spec_from_field (name_map ["POS" ], variable_name = "variant_position" ),
1432
+ ]
1433
+ )
1434
+ colspecs .extend ([spec_from_field (field ) for field in icf .metadata .info_fields ])
1372
1435
1373
1436
gt_field = None
1374
- for field in icf .metadata .fields :
1375
- if field .category == "fixed" :
1376
- continue
1437
+ for field in icf .metadata .format_fields :
1377
1438
if field .name == "GT" :
1378
1439
gt_field = field
1379
1440
continue
1380
- shape = [m ]
1381
- prefix = "variant_"
1382
- dimensions = ["variants" ]
1383
- chunks = [variants_chunk_size ]
1384
- if field .category == "FORMAT" :
1385
- prefix = "call_"
1386
- shape .append (n )
1387
- chunks .append (samples_chunk_size )
1388
- dimensions .append ("samples" )
1389
- # TODO make an option to add in the empty extra dimension
1390
- if field .summary .max_number > 1 :
1391
- shape .append (field .summary .max_number )
1392
- # TODO we should really be checking this to see if the named dimensions
1393
- # are actually correct.
1394
- if field .vcf_number == "R" :
1395
- dimensions .append ("alleles" )
1396
- elif field .vcf_number == "A" :
1397
- dimensions .append ("alt_alleles" )
1398
- elif field .vcf_number == "G" :
1399
- dimensions .append ("genotypes" )
1400
- else :
1401
- dimensions .append (f"{ field .category } _{ field .name } _dim" )
1402
- variable_name = prefix + field .name
1403
- colspec = ZarrColumnSpec .new (
1404
- vcf_field = field .full_name ,
1405
- name = variable_name ,
1406
- dtype = field .smallest_dtype (),
1407
- shape = shape ,
1408
- chunks = chunks ,
1409
- dimensions = dimensions ,
1410
- description = field .description ,
1411
- )
1412
- colspecs .append (colspec )
1441
+ colspecs .append (spec_from_field (field ))
1413
1442
1414
1443
if gt_field is not None :
1415
1444
ploidy = gt_field .summary .max_number - 1
1416
1445
shape = [m , n ]
1417
1446
chunks = [variants_chunk_size , samples_chunk_size ]
1418
1447
dimensions = ["variants" , "samples" ]
1419
-
1420
1448
colspecs .append (
1421
1449
ZarrColumnSpec .new (
1422
1450
vcf_field = None ,
0 commit comments