@@ -581,22 +581,7 @@ def chunk_files(self, partition_id, start=0):
    def read_chunk(self, path):
        with open(path, "rb") as f:
            pkl = self.compressor.decode(f.read())
-        return pickle.loads(pkl), len(pkl)
-
-    def iter_values_bytes(self):
-        num_records = 0
-        bytes_read = 0
-        for partition_id in range(self.num_partitions):
-            for chunk_path in self.chunk_files(partition_id):
-                chunk, chunk_bytes = self.read_chunk(chunk_path)
-                bytes_read += chunk_bytes
-                for record in chunk:
-                    yield record, bytes_read
-                    num_records += 1
-        if num_records != self.num_records:
-            raise ValueError(
-                f"Corruption detected: incorrect number of records in {str(self.path)}."
-            )
+        return pickle.loads(pkl)

    def iter_values(self, start=None, stop=None):
        start = 0 if start is None else start
@@ -620,7 +605,7 @@ def iter_values(self, start=None, stop=None):
            )

        for chunk_path in self.chunk_files(start_partition, start_chunk):
-            chunk, _ = self.read_chunk(chunk_path)
+            chunk = self.read_chunk(chunk_path)
            for record in chunk:
                if record_id == stop:
                    return
@@ -630,7 +615,7 @@ def iter_values(self, start=None, stop=None):
        assert record_id > start
        for partition_id in range(start_partition + 1, self.num_partitions):
            for chunk_path in self.chunk_files(partition_id):
-                chunk, _ = self.read_chunk(chunk_path)
+                chunk = self.read_chunk(chunk_path)
                for record in chunk:
                    if record_id == stop:
                        return
@@ -641,7 +626,21 @@ def iter_values(self, start=None, stop=None):
    # but making a property for consistency with xarray etc
    @property
    def values(self):
-        return [record for record, _ in self.iter_values_bytes()]
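+        # Read every chunk in partition order, filling the preallocated list;
+        # j counts records so that corruption can be detected at the end.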
+        ret = [None] * self.num_records
+        j = 0
+        for partition_id in range(self.num_partitions):
+            for chunk_path in self.chunk_files(partition_id):
+                chunk = self.read_chunk(chunk_path)
+                for record in chunk:
+                    ret[j] = record
+                    j += 1
+        if j != self.num_records:
+            raise ValueError(
+                f"Corruption detected: incorrect number of records in {str(self.path)}."
+            )
+        return ret

    def sanitiser_factory(self, shape):
        """
@@ -1288,7 +1285,6 @@ def encode_genotypes_slice(self, pcvcf, start, stop):
        gt_phased = core.BufferedArray(self.root["call_genotype_phased"], start)

        for value in source_col.iter_values(start, stop):
-            # core.update_progress(sys.getsizeof(value))
            j = gt.next_buffer_row()
            sanitise_value_int_2d(gt.buff, j, value[:, :-1])
            j = gt_phased.next_buffer_row()
@@ -1300,26 +1296,70 @@ def encode_genotypes_slice(self, pcvcf, start, stop):
1300
1296
gt .flush ()
1301
1297
gt_phased .flush ()
1302
1298
gt_mask .flush ()
1303
- logger .debug (f"GT slice { start } :{ stop } done " )
1299
+ logger .debug (f"Encoded GT slice { start } :{ stop } " )
1304
1300
1305
- def encode_alleles (self , pcvcf ):
1301
+ def encode_alleles_slice (self , pcvcf , start , stop ):
1306
1302
ref_col = pcvcf .columns ["REF" ]
1307
1303
alt_col = pcvcf .columns ["ALT" ]
1308
- ref_values = ref_col .values
1309
- alt_values = alt_col .values
1310
- allele_array = self .root ["variant_allele" ]
1311
-
1312
- # We could do this chunk-by-chunk, but it doesn't seem worth the bother.
1313
- alleles = np .full (allele_array .shape , "" , dtype = "O" )
1314
- for j , (ref , alt ) in enumerate (zip (ref_values , alt_values )):
1315
- alleles [j , 0 ] = ref [0 ]
1316
- alleles [j , 1 : 1 + len (alt )] = alt
1317
- allele_array [:] = alleles
1318
- size = sum (
1319
- col .vcf_field .summary .uncompressed_size for col in [ref_col , alt_col ]
1320
- )
1321
- logger .debug ("alleles done" )
1322
- core .update_progress (allele_array .nchunks )
1304
+ alleles = core .BufferedArray (self .root ["variant_allele" ], start )
1305
+
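+        # REF and ALT are iterated in lockstep; each output row is padded
+        # with STR_FILL, REF goes in column 0 and the ALTs follow it.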
+        for ref, alt in zip(
+            ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
+        ):
+            j = alleles.next_buffer_row()
+            alleles.buff[j, :] = STR_FILL
+            alleles.buff[j, 0] = ref[0]
+            alleles.buff[j, 1 : 1 + len(alt)] = alt
+        alleles.flush()
+        logger.debug(f"Encoded alleles slice {start}:{stop}")
+
+    def encode_id_slice(self, pcvcf, start, stop):
+        col = pcvcf.columns["ID"]
+        vid = core.BufferedArray(self.root["variant_id"], start)
+        vid_mask = core.BufferedArray(self.root["variant_id_mask"], start)
+
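+        # A missing ID is stored as STR_MISSING with the mask set to True;
+        # vid and vid_mask advance together, hence the j == k assertion.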
+        for value in col.iter_values(start, stop):
+            j = vid.next_buffer_row()
+            k = vid_mask.next_buffer_row()
+            assert j == k
+            if value is not None:
+                vid.buff[j] = value[0]
+                vid_mask.buff[j] = False
+            else:
+                vid.buff[j] = STR_MISSING
+                vid_mask.buff[j] = True
+        vid.flush()
+        vid_mask.flush()
+        logger.debug(f"Encoded ID slice {start}:{stop}")
+
+    def encode_filters_slice(self, pcvcf, lookup, start, stop):
+        col = pcvcf.columns["FILTERS"]
+        var_filter = core.BufferedArray(self.root["variant_filter"], start)
+
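+        # Each row starts all-False; lookup maps filter name to column index.
+        # An unknown filter is reported as a ValueError.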
+        for value in col.iter_values(start, stop):
+            j = var_filter.next_buffer_row()
+            var_filter.buff[j] = False
+            try:
+                for f in value:
+                    var_filter.buff[j, lookup[f]] = True
+            except KeyError:
+                raise ValueError(f"Filter '{f}' was not defined in the header.")
+        var_filter.flush()
+        logger.debug(f"Encoded FILTERS slice {start}:{stop}")
+
+    def encode_contig_slice(self, pcvcf, lookup, start, stop):
+        col = pcvcf.columns["CHROM"]
+        contig = core.BufferedArray(self.root["variant_contig"], start)
+
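+        # CHROM values are mapped to integer contig IDs using the lookup
+        # built by encode_contig_id; an unknown contig is a ValueError.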
+        for value in col.iter_values(start, stop):
+            j = contig.next_buffer_row()
+            try:
+                contig.buff[j] = lookup[value[0]]
+            except KeyError:
+                # TODO add advice about adding it to the spec
+                raise ValueError(f"Contig '{value[0]}' was not defined in the header.")
+        contig.flush()
+        logger.debug(f"Encoded CHROM slice {start}:{stop}")

    def encode_samples(self, pcvcf, sample_id, chunk_width):
        if not np.array_equal(sample_id, pcvcf.metadata.samples):
@@ -1333,90 +1373,35 @@ def encode_samples(self, pcvcf, sample_id, chunk_width):
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
        logger.debug("Samples done")
-        core.update_progress(array.nchunks)

-    def encode_contig(self, pcvcf, contig_names, contig_lengths):
-        logger.debug("Start encode contig")
+    def encode_contig_id(self, pcvcf, contig_names, contig_lengths):
        array = self.root.array(
            "contig_id",
            contig_names,
            dtype="str",
            compressor=core.default_compressor,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        core.update_progress(array.nchunks)
-
        if contig_lengths is not None:
            array = self.root.array(
                "contig_length",
                contig_lengths,
                dtype=np.int64,
            )
            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-            core.update_progress(array.nchunks)
-
-        col = pcvcf.columns["CHROM"]
-        array = self.root["variant_contig"]
-        buff = np.zeros_like(array)
-        lookup = {v: j for j, v in enumerate(contig_names)}
-        for j, contig in enumerate(col.values):
-            try:
-                buff[j] = lookup[contig[0]]
-            except KeyError:
-                # TODO add advice about adding it to the spec
-                raise ValueError(f"Contig '{contig}' was not defined in the header.")
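+        # Return the name-to-index mapping used by encode_contig_slice.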
+        return {v: j for j, v in enumerate(contig_names)}

-        logger.debug("Start contig write")
-        array[:] = buff
-        core.update_progress(array.nchunks)
-        logger.debug("Contig done")
-
-    def encode_filters(self, pcvcf, filter_names):
+    def encode_filter_id(self, pcvcf, filter_names):
        array = self.root.array(
            "filter_id",
            filter_names,
            dtype="str",
            compressor=core.default_compressor,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        core.update_progress(array.nchunks)
-
-        col = pcvcf.columns["FILTERS"]
-        array = self.root["variant_filter"]
-        buff = np.zeros_like(array)
-
-        lookup = {v: j for j, v in enumerate(filter_names)}
-        for j, filters in enumerate(col.values):
-            try:
-                for f in filters:
-                    buff[j, lookup[f]] = True
-            except IndexError:
-                raise ValueError(f"Filter '{f}' was not defined in the header.")
-
-        array[:] = buff
-
-        core.update_progress(array.nchunks)
-        logger.debug("Filters done")
-
-    def encode_id(self, pcvcf):
-        col = pcvcf.columns["ID"]
-        id_array = self.root["variant_id"]
-        id_mask_array = self.root["variant_id_mask"]
-        id_buff = np.full_like(id_array, "")
-        id_mask_buff = np.zeros_like(id_mask_array)
-
-        for j, value in enumerate(col.values):
-            if value is not None:
-                id_buff[j] = value[0]
-            else:
-                id_buff[j] = "."  # TODO is this correct??
-                id_mask_buff[j] = True
-
-        id_array[:] = id_buff
-        core.update_progress(id_array.nchunks)
-        id_mask_array[:] = id_mask_buff
-        core.update_progress(id_mask_array.nchunks)
-        logger.debug("ID done")
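+        # Return the name-to-index mapping used by encode_filters_slice.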
+        return {v: j for j, v in enumerate(filter_names)}

    @staticmethod
    def encode(
@@ -1453,49 +1436,58 @@ def encode(
            units="chunks",
            show=show_progress,
        )
+
+        # Do these synchronously for simplicity so we have the mapping
+        filter_id_map = sgvcf.encode_filter_id(pcvcf, conversion_spec.filter_id)
+        contig_id_map = sgvcf.encode_contig_id(
+            pcvcf, conversion_spec.contig_id, conversion_spec.contig_length
+        )
+
        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            # FIXME these need to be split up, as they are quite slow on large
-            # files, and if worker_processes < 8 we get a big pause
            pwm.submit(
                sgvcf.encode_samples,
                pcvcf,
                conversion_spec.sample_id,
                conversion_spec.chunk_width,
            )
-            pwm.submit(sgvcf.encode_alleles, pcvcf)
-            pwm.submit(sgvcf.encode_id, pcvcf)
-            pwm.submit(
-                sgvcf.encode_contig,
-                pcvcf,
-                conversion_spec.contig_id,
-                conversion_spec.contig_length,
-            )
-            pwm.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id)
-            for col in chunked_1d:
-                if col.vcf_field is not None:
-                    for start, stop in slices:
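+            # Encode the fixed fields slice-by-slice alongside the other 1D
+            # columns, so the work spreads evenly across worker processes.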
+            for start, stop in slices:
+                pwm.submit(sgvcf.encode_alleles_slice, pcvcf, start, stop)
+                pwm.submit(sgvcf.encode_id_slice, pcvcf, start, stop)
+                pwm.submit(
+                    sgvcf.encode_filters_slice, pcvcf, filter_id_map, start, stop
+                )
+                pwm.submit(sgvcf.encode_contig_slice, pcvcf, contig_id_map, start, stop)
+                for col in chunked_1d:
+                    if col.vcf_field is not None:
                        pwm.submit(sgvcf.encode_column_slice, pcvcf, col, start, stop)

        chunked_2d = [
            col for col in conversion_spec.columns.values() if len(col.chunks) >= 2
        ]
-        progress_config = core.ProgressConfig(
-            total=sum(sgvcf.root[col.name].nchunks for col in chunked_2d),
-            title="Encode 2D",
-            units="chunks",
-            show=show_progress,
-        )
-        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            if "call_genotype" in conversion_spec.columns:
-                logger.info(f"Submit encode call_genotypes in {len(slices)} slices")
-                for start, stop in slices:
-                    pwm.submit(sgvcf.encode_genotypes_slice, pcvcf, start, stop)
-
-            for col in chunked_2d:
-                if col.vcf_field is not None:
-                    logger.info(f"Submit encode {col.name} in {len(slices)} slices")
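+        # Only set up the 2D progress bar and worker pool when there are
+        # 2D columns to encode.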
+        if len(chunked_2d) > 0:
+            progress_config = core.ProgressConfig(
+                total=sum(sgvcf.root[col.name].nchunks for col in chunked_2d),
+                title="Encode 2D",
+                units="chunks",
+                show=show_progress,
+            )
+            with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
+                if "call_genotype" in conversion_spec.columns:
+                    logger.info(f"Submit encode call_genotypes in {len(slices)} slices")
                    for start, stop in slices:
-                        pwm.submit(sgvcf.encode_column_slice, pcvcf, col, start, stop)
+                        pwm.submit(sgvcf.encode_genotypes_slice, pcvcf, start, stop)
+
+                for col in chunked_2d:
+                    if col.vcf_field is not None:
+                        logger.info(f"Submit encode {col.name} in {len(slices)} slices")
+                        for start, stop in slices:
+                            pwm.submit(
+                                sgvcf.encode_column_slice, pcvcf, col, start, stop
+                            )

        zarr.consolidate_metadata(write_path)
        # Atomic swap, now we've completely finished.
@@ -1746,7 +1734,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
    assert pos[start_index] == first_pos
    vcf = cyvcf2.VCF(vcf_path)
    if show_progress:
-        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)
    else:
        iterator = vcf
    for j, row in enumerate(iterator, start_index):