@@ -1262,54 +1262,39 @@ def create_array(self, variable):
1262
1262
)
1263
1263
a .attrs ["_ARRAY_DIMENSIONS" ] = variable .dimensions
1264
1264
1265
- def encode_column (self , pcvcf , column , encoder_threads = 4 ):
1266
- # TODO we're doing this the wrong way at the moment, overcomplicating
1267
- # things by having the ThreadedZarrEncoder. It would be simpler if
1268
- # we split the columns into vertical chunks, and just pushed a bunch
1269
- # of futures for encoding start:end slices of each column. The
1270
- # complicating factor here is that we need to get these slices
1271
- # out of the pcvcf, which takes a little bit of doing (but fine,
1272
- # because we know the number of records in each partition).
1273
- # An annoying factor then is how to update the progress meter
1274
- # because the "bytes read" approach becomes problematic
1275
- # when we might access the same chunk several times.
1276
- # Would perhaps be better to call sys.getsizeof() on the stored
1277
- # value each time.
1278
-
1265
+ def encode_column_slice (self , pcvcf , column , start , stop ):
1279
1266
source_col = pcvcf .columns [column .vcf_field ]
1280
1267
array = self .root [column .name ]
1281
- ba = core .BufferedArray (array )
1268
+ ba = core .BufferedArray (array , start )
1282
1269
sanitiser = source_col .sanitiser_factory (ba .buff .shape )
1283
1270
1284
- with core .ThreadedZarrEncoder ([ba ], encoder_threads ) as te :
1285
- last_bytes_read = 0
1286
- for value , bytes_read in source_col .iter_values_bytes ():
1287
- j = te .next_buffer_row ()
1288
- sanitiser (ba .buff , j , value )
1289
- # print(bytes_read, last_bytes_read, value)
1290
- if last_bytes_read != bytes_read :
1291
- core .update_progress (bytes_read - last_bytes_read )
1292
- last_bytes_read = bytes_read
1293
-
1294
- def encode_genotypes (self , pcvcf , encoder_threads = 4 ):
1271
+ for value in source_col .iter_values (start , stop ):
1272
+ # We write directly into the buffer in the sanitiser function
1273
+ # to make it easier to reason about dimension padding
1274
+ j = ba .next_buffer_row ()
1275
+ sanitiser (ba .buff , j , value )
1276
+ core .update_progress (sys .getsizeof (value ))
1277
+ ba .flush ()
1278
+
1279
+ def encode_genotypes_slice (self , pcvcf , start , stop ):
1295
1280
source_col = pcvcf .columns ["FORMAT/GT" ]
1296
- gt = core .BufferedArray (self .root ["call_genotype" ])
1297
- gt_mask = core .BufferedArray (self .root ["call_genotype_mask" ])
1298
- gt_phased = core .BufferedArray (self .root ["call_genotype_phased" ])
1299
- buffered_arrays = [ gt , gt_phased , gt_mask ]
1300
-
1301
- with core . ThreadedZarrEncoder ( buffered_arrays , encoder_threads ) as te :
1302
- last_bytes_read = 0
1303
- for value , bytes_read in source_col . iter_values_bytes ():
1304
- j = te . next_buffer_row ( )
1305
- sanitise_value_int_2d ( gt . buff , j , value [:, : - 1 ])
1306
- sanitise_value_int_1d ( gt_phased . buff , j , value [:, - 1 ])
1307
- # TODO check is this the correct semantics when we are padding
1308
- # with mixed ploidies?
1309
- gt_mask . buff [ j ] = gt . buff [ j ] < 0
1310
- if last_bytes_read != bytes_read :
1311
- core . update_progress ( bytes_read - last_bytes_read )
1312
- last_bytes_read = bytes_read
1281
+ gt = core .BufferedArray (self .root ["call_genotype" ], start )
1282
+ gt_mask = core .BufferedArray (self .root ["call_genotype_mask" ], start )
1283
+ gt_phased = core .BufferedArray (self .root ["call_genotype_phased" ], start )
1284
+
1285
+ for value in source_col . iter_values ( start , stop ):
1286
+ j = gt . next_buffer_row ()
1287
+ sanitise_value_int_2d ( gt . buff , j , value [:, : - 1 ])
1288
+ j = gt_phased . next_buffer_row ()
1289
+ sanitise_value_int_1d ( gt_phased . buff , j , value [:, - 1 ] )
1290
+ # TODO check whether this is the correct semantics when we are padding
1291
+ # with mixed ploidies?
1292
+ j = gt_mask . next_buffer_row ()
1293
+ gt_mask . buff [ j ] = gt . buff [ j ] < 0
1294
+ core . update_progress ( sys . getsizeof ( value ))
1295
+ gt . flush ()
1296
+ gt_phased . flush ( )
1297
+ gt_mask . flush ()
1313
1298
1314
1299
def encode_alleles (self , pcvcf ):
1315
1300
ref_col = pcvcf .columns ["REF" ]
@@ -1449,6 +1434,7 @@ def convert(
1449
1434
units = "b" ,
1450
1435
show = show_progress ,
1451
1436
)
1437
+ num_slices = max (1 , worker_processes * 4 )
1452
1438
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1453
1439
pwm .submit (
1454
1440
sgvcf .encode_samples ,
@@ -1465,22 +1451,23 @@ def convert(
1465
1451
conversion_spec .contig_length ,
1466
1452
)
1467
1453
pwm .submit (sgvcf .encode_filters , pcvcf , conversion_spec .filter_id )
1454
+ # Using POS arbitrarily to get the array slices
1455
+ slices = core .chunk_aligned_slices (
1456
+ sgvcf .root ["variant_position" ], num_slices
1457
+ )
1468
1458
has_gt = False
1469
1459
for variable in conversion_spec .columns .values ():
1470
1460
if variable .vcf_field is not None :
1471
- # print("Encode", variable.name)
1472
- # TODO for large columns it's probably worth splitting up
1473
- # these into vertical chunks. Otherwise we tend to get a
1474
- # long wait for the largest GT columns to finish.
1475
- # Straightforward to do because we can chunk-align the work
1476
- # packages.
1477
- pwm .submit (sgvcf .encode_column , pcvcf , variable )
1461
+ for start , stop in slices :
1462
+ pwm .submit (
1463
+ sgvcf .encode_column_slice , pcvcf , variable , start , stop
1464
+ )
1478
1465
else :
1479
1466
if variable .name == "call_genotype" :
1480
1467
has_gt = True
1481
1468
if has_gt :
1482
- # TODO add mixed ploidy
1483
- pwm .executor . submit (sgvcf .encode_genotypes , pcvcf )
1469
+ for start , stop in slices :
1470
+ pwm .submit (sgvcf .encode_genotypes_slice , pcvcf , start , stop )
1484
1471
1485
1472
zarr .consolidate_metadata (write_path )
1486
1473
# Atomic swap, now we've completely finished.
0 commit comments