@@ -1284,11 +1284,11 @@ def summary_table(self):
         return data


-# TODO refactor this into a VcfZarrWriter class, and get rid of the
-# statis methods.
-class SgvcfZarr:
-    def __init__(self, path):
+class VcfZarrWriter:
+    def __init__(self, path, pcvcf, schema):
         self.path = pathlib.Path(path)
+        self.pcvcf = pcvcf
+        self.schema = schema
         self.root = None

     def create_array(self, variable):
@@ -1306,8 +1306,8 @@ def create_array(self, variable):
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions

-    def encode_column_slice(self, pcvcf, column, start, stop):
-        source_col = pcvcf.columns[column.vcf_field]
+    def encode_column_slice(self, column, start, stop):
+        source_col = self.pcvcf.columns[column.vcf_field]
         array = self.root[column.name]
         ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
@@ -1320,8 +1320,8 @@ def encode_column_slice(self, pcvcf, column, start, stop):
         ba.flush()
         logger.debug(f"Encoded {column.name} slice {start}:{stop}")

-    def encode_genotypes_slice(self, pcvcf, start, stop):
-        source_col = pcvcf.columns["FORMAT/GT"]
+    def encode_genotypes_slice(self, start, stop):
+        source_col = self.pcvcf.columns["FORMAT/GT"]
         gt = core.BufferedArray(self.root["call_genotype"], start)
         gt_mask = core.BufferedArray(self.root["call_genotype_mask"], start)
         gt_phased = core.BufferedArray(self.root["call_genotype_phased"], start)
@@ -1340,9 +1340,9 @@ def encode_genotypes_slice(self, pcvcf, start, stop):
         gt_mask.flush()
         logger.debug(f"Encoded GT slice {start}:{stop}")

-    def encode_alleles_slice(self, pcvcf, start, stop):
-        ref_col = pcvcf.columns["REF"]
-        alt_col = pcvcf.columns["ALT"]
+    def encode_alleles_slice(self, start, stop):
+        ref_col = self.pcvcf.columns["REF"]
+        alt_col = self.pcvcf.columns["ALT"]
         alleles = core.BufferedArray(self.root["variant_allele"], start)

         for ref, alt in zip(
@@ -1355,8 +1355,8 @@ def encode_alleles_slice(self, pcvcf, start, stop):
         alleles.flush()
         logger.debug(f"Encoded alleles slice {start}:{stop}")

-    def encode_id_slice(self, pcvcf, start, stop):
-        col = pcvcf.columns["ID"]
+    def encode_id_slice(self, start, stop):
+        col = self.pcvcf.columns["ID"]
         vid = core.BufferedArray(self.root["variant_id"], start)
         vid_mask = core.BufferedArray(self.root["variant_id_mask"], start)

@@ -1374,8 +1374,8 @@ def encode_id_slice(self, pcvcf, start, stop):
         vid_mask.flush()
         logger.debug(f"Encoded ID slice {start}:{stop}")

-    def encode_filters_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["FILTERS"]
+    def encode_filters_slice(self, lookup, start, stop):
+        col = self.pcvcf.columns["FILTERS"]
         var_filter = core.BufferedArray(self.root["variant_filter"], start)

         for value in col.iter_values(start, stop):
@@ -1389,8 +1389,8 @@ def encode_filters_slice(self, pcvcf, lookup, start, stop):
         var_filter.flush()
         logger.debug(f"Encoded FILTERS slice {start}:{stop}")

-    def encode_contig_slice(self, pcvcf, lookup, start, stop):
-        col = pcvcf.columns["CHROM"]
+    def encode_contig_slice(self, lookup, start, stop):
+        col = self.pcvcf.columns["CHROM"]
         contig = core.BufferedArray(self.root["variant_contig"], start)

         for value in col.iter_values(start, stop):
@@ -1403,162 +1403,144 @@ def encode_contig_slice(self, pcvcf, lookup, start, stop):
         contig.flush()
         logger.debug(f"Encoded CHROM slice {start}:{stop}")

-    def encode_samples(self, pcvcf, sample_id, chunk_width):
-        if not np.array_equal(sample_id, pcvcf.metadata.samples):
+    def encode_samples(self):
+        if not np.array_equal(self.schema.sample_id, self.pcvcf.metadata.samples):
             raise ValueError("Subsetting or reordering samples not supported currently")
         array = self.root.array(
             "sample_id",
-            sample_id,
+            self.schema.sample_id,
             dtype="str",
             compressor=core.default_compressor,
-            chunks=(chunk_width,),
+            chunks=(self.schema.chunk_width,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")

-    def encode_contig_id(self, pcvcf, contig_names, contig_lengths):
+    def encode_contig_id(self):
         array = self.root.array(
             "contig_id",
-            contig_names,
+            self.schema.contig_id,
             dtype="str",
             compressor=core.default_compressor,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if contig_lengths is not None:
+        if self.schema.contig_length is not None:
             array = self.root.array(
                 "contig_length",
-                contig_lengths,
+                self.schema.contig_length,
                 dtype=np.int64,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        return {v: j for j, v in enumerate(contig_names)}
+        return {v: j for j, v in enumerate(self.schema.contig_id)}

-    def encode_filter_id(self, pcvcf, filter_names):
+    def encode_filter_id(self):
         array = self.root.array(
             "filter_id",
-            filter_names,
+            self.schema.filter_id,
             dtype="str",
             compressor=core.default_compressor,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        return {v: j for j, v in enumerate(filter_names)}
+        return {v: j for j, v in enumerate(self.schema.filter_id)}

-    @staticmethod
     def encode(
-        pcvcf,
-        path,
-        conversion_spec,
-        *,
+        self,
         worker_processes=1,
         max_v_chunks=None,
         show_progress=False,
     ):
-        path = pathlib.Path(path)
         # TODO: we should do this as a future to avoid blocking
-        if path.exists():
+        if self.path.exists():
             logger.warning(f"Deleting existing {path}")
-            shutil.rmtree(path)
-        write_path = path.with_suffix(path.suffix + f".{os.getpid()}.build")
+            shutil.rmtree(self.path)
+        write_path = self.path.with_suffix(self.path.suffix + f".{os.getpid()}.build")
         store = zarr.DirectoryStore(write_path)
-        # FIXME, duplicating logic about the store
         logger.info(f"Create zarr at {write_path}")
-        sgvcf = SgvcfZarr(write_path)
-        sgvcf.root = zarr.group(store=store, overwrite=True)
-        for column in conversion_spec.columns.values():
-            sgvcf.create_array(column)
+        self.root = zarr.group(store=store, overwrite=True)
+        for column in self.schema.columns.values():
+            self.create_array(column)

-        sgvcf.root.attrs["vcf_zarr_version"] = "0.2"
-        sgvcf.root.attrs["vcf_header"] = pcvcf.vcf_header
-        sgvcf.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
+        self.root.attrs["vcf_zarr_version"] = "0.2"
+        self.root.attrs["vcf_header"] = self.pcvcf.vcf_header
+        self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"

         num_slices = max(1, worker_processes * 4)
         # Using POS arbitrarily to get the array slices
         slices = core.chunk_aligned_slices(
-            sgvcf.root["variant_position"], num_slices, max_chunks=max_v_chunks
+            self.root["variant_position"], num_slices, max_chunks=max_v_chunks
         )
         truncated = slices[-1][-1]
-        for array in sgvcf.root.values():
+        for array in self.root.values():
             if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
                 shape = list(array.shape)
                 shape[0] = truncated
                 array.resize(shape)

         chunked_1d = [
-            col for col in conversion_spec.columns.values() if len(col.chunks) <= 1
+            col for col in self.schema.columns.values() if len(col.chunks) <= 1
         ]
         progress_config = core.ProgressConfig(
-            total=sum(sgvcf.root[col.name].nchunks for col in chunked_1d),
+            total=sum(self.root[col.name].nchunks for col in chunked_1d),
             title="Encode 1D",
             units="chunks",
             show=show_progress,
         )

         # Do these syncronously for simplicity so we have the mapping
-        filter_id_map = sgvcf.encode_filter_id(pcvcf, conversion_spec.filter_id)
-        contig_id_map = sgvcf.encode_contig_id(
-            pcvcf, conversion_spec.contig_id, conversion_spec.contig_length
-        )
+        filter_id_map = self.encode_filter_id()
+        contig_id_map = self.encode_contig_id()

         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            pwm.submit(
-                sgvcf.encode_samples,
-                pcvcf,
-                conversion_spec.sample_id,
-                conversion_spec.chunk_width,
-            )
+            pwm.submit(self.encode_samples)
             for start, stop in slices:
-                pwm.submit(sgvcf.encode_alleles_slice, pcvcf, start, stop)
-                pwm.submit(sgvcf.encode_id_slice, pcvcf, start, stop)
-                pwm.submit(
-                    sgvcf.encode_filters_slice, pcvcf, filter_id_map, start, stop
-                )
-                pwm.submit(sgvcf.encode_contig_slice, pcvcf, contig_id_map, start, stop)
+                pwm.submit(self.encode_alleles_slice, start, stop)
+                pwm.submit(self.encode_id_slice, start, stop)
+                pwm.submit(self.encode_filters_slice, filter_id_map, start, stop)
+                pwm.submit(self.encode_contig_slice, contig_id_map, start, stop)
                 for col in chunked_1d:
                     if col.vcf_field is not None:
-                        pwm.submit(sgvcf.encode_column_slice, pcvcf, col, start, stop)
+                        pwm.submit(self.encode_column_slice, col, start, stop)

         chunked_2d = [
-            col for col in conversion_spec.columns.values() if len(col.chunks) >= 2
+            col for col in self.schema.columns.values() if len(col.chunks) >= 2
         ]
         if len(chunked_2d) > 0:
             progress_config = core.ProgressConfig(
-                total=sum(sgvcf.root[col.name].nchunks for col in chunked_2d),
+                total=sum(self.root[col.name].nchunks for col in chunked_2d),
                 title="Encode 2D",
                 units="chunks",
                 show=show_progress,
             )
             with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-                if "call_genotype" in conversion_spec.columns:
+                if "call_genotype" in self.schema.columns:
                     arrays = [
-                        sgvcf.root["call_genotype"],
-                        sgvcf.root["call_genotype_phased"],
-                        sgvcf.root["call_genotype_mask"],
+                        self.root["call_genotype"],
+                        self.root["call_genotype_phased"],
+                        self.root["call_genotype_mask"],
                     ]
                     min_mem = sum(array.blocks[0].nbytes for array in arrays)
                     logger.info(
                         f"Submit encode call_genotypes in {len(slices)} slices. "
                         f"Min per-worker mem={display_size(min_mem)}"
                     )
                     for start, stop in slices:
-                        pwm.submit(sgvcf.encode_genotypes_slice, pcvcf, start, stop)
+                        pwm.submit(self.encode_genotypes_slice, start, stop)

                 for col in chunked_2d:
                     if col.vcf_field is not None:
-                        array = sgvcf.root[col.name]
+                        array = self.root[col.name]
                         min_mem = array.blocks[0].nbytes
                         logger.info(
                             f"Submit encode {col.name} in {len(slices)} slices. "
                             f"Min per-worker mem={display_size(min_mem)}"
                         )
                         for start, stop in slices:
-                            pwm.submit(
-                                sgvcf.encode_column_slice, pcvcf, col, start, stop
-                            )
+                            pwm.submit(self.encode_column_slice, col, start, stop)

         zarr.consolidate_metadata(write_path)
         # Atomic swap, now we've completely finished.
-        logger.info(f"Moving to final path {path}")
-        os.rename(write_path, path)
+        logger.info(f"Moving to final path {self.path}")
+        os.rename(write_path, self.path)


 def mkschema(if_path, out):
@@ -1590,11 +1572,8 @@ def encode(
         raise ValueError("Cannot specify schema along with chunk sizes")
     with open(schema_path, "r") as f:
         schema = ZarrConversionSpec.fromjson(f.read())
-
-    SgvcfZarr.encode(
-        pcvcf,
-        zarr_path,
-        conversion_spec=schema,
+    vzw = VcfZarrWriter(zarr_path, pcvcf, schema)
+    vzw.encode(
         max_v_chunks=max_v_chunks,
         worker_processes=worker_processes,
         show_progress=show_progress,
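Taken together, these hunks replace the static-method `SgvcfZarr.encode` with a `VcfZarrWriter` instance that owns its output path, the source columns (`pcvcf`), and the conversion schema, so the per-slice `encode_*` methods no longer need `pcvcf` and schema fields threaded through every call. For reference, a minimal sketch of the resulting call pattern, mirroring the call sites in the final hunk; `pcvcf`, `schema_path`, `zarr_path`, and the keyword values are assumed to come from the module's earlier conversion stages:

```python
# Sketch only: restates the post-refactor usage shown in the diff above.
# Load the conversion schema produced by mkschema (or written by hand).
with open(schema_path, "r") as f:
    schema = ZarrConversionSpec.fromjson(f.read())

# The writer holds path, source columns, and schema as instance state...
vzw = VcfZarrWriter(zarr_path, pcvcf, schema)

# ...so encode() takes only the run-time knobs.
vzw.encode(
    max_v_chunks=max_v_chunks,
    worker_processes=worker_processes,
    show_progress=show_progress,
)
```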