@@ -1286,7 +1286,7 @@ def summary_table(self):
 
 @dataclasses.dataclass
 class EncodingWork:
-    func: callable
+    func: callable = dataclasses.field(repr=False)
     start: int
     stop: int
     columns: list[str]
@@ -1319,12 +1319,12 @@ def init_array(self, variable):
     def get_array(self, name):
         return self.root["wip_" + name]
 
-    def finalise_array(self, variable):
-        source = self.path / ("wip_" + variable.name)
-        dest = self.path / variable.name
+    def finalise_array(self, variable_name):
+        source = self.path / ("wip_" + variable_name)
+        dest = self.path / variable_name
         # Atomic swap
         os.rename(source, dest)
-        logger.debug(f"Finalised {variable.name}")
+        logger.info(f"Finalised {variable_name}")
 
     def encode_array_slice(self, column, start, stop):
         source_col = self.pcvcf.columns[column.vcf_field]
@@ -1471,8 +1471,8 @@ def init(self):
             self.init_array(column)
 
     def finalise(self):
-        for column in self.schema.columns.values():
-            self.finalise_array(column)
+        # for column in self.schema.columns.values():
+        #     self.finalise_array(column)
         zarr.consolidate_metadata(self.path)
 
     def encode(
@@ -1536,21 +1536,25 @@ def encode(
             work.append(
                 EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
             )
-            work.append(EncodingWork(self.encode_id_slice, start, stop, ["variant_id"]))
+            work.append(
+                EncodingWork(
+                    self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
+                )
+            )
             work.append(
                 EncodingWork(
                     functools.partial(self.encode_filters_slice, filter_id_map),
                     start,
                     stop,
-                    ["variant_filters"],
+                    ["variant_filter"],
                 )
             )
             work.append(
                 EncodingWork(
                     functools.partial(self.encode_contig_slice, contig_id_map),
                     start,
                     stop,
-                    ["variant_contig_id"],
+                    ["variant_contig"],
                 )
             )
             if "call_genotype" in self.schema.columns:
@@ -1567,6 +1571,7 @@ def encode(
                         self.encode_genotypes_slice, start, stop, variables, gt_memory
                     )
                 )
+
         # Fail early if we can't fit a particular column into memory
         for wp in work:
             if wp.memory >= max_memory:
@@ -1581,31 +1586,47 @@ def encode(
             units="B",
             show=show_progress,
         )
-        # TODO add a map of slices completed to column here, so that we can
-        # finalise the arrays as they get completed. We'll have to service
-        # the futures more, though, not just when we exceed the memory budget
 
         used_memory = 0
+        max_queued = 4 * max(1, worker_processes)
+        encoded_slices = collections.Counter()
+
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
             future = pwm.submit(self.encode_samples)
-            future_to_memory_use = {future: 0}
-            for wp in work:
-                while used_memory + wp.memory >= max_memory:
-                    logger.info(
-                        f"Memory budget {display_size(max_memory)} exceeded: "
-                        f"used={display_size(used_memory)} needed={display_size(wp.memory)}"
-                    )
-                    futures = pwm.wait_for_completed()
-                    released_mem = sum(
-                        future_to_memory_use.pop(future) for future in futures
-                    )
-                    logger.info(
-                        f"{len(futures)} completed, released {display_size(released_mem)}"
+            future_to_work = {future: EncodingWork(None, 0, 0, [])}
+
+            def service_completed_futures():
+                nonlocal used_memory
+
+                completed = pwm.wait_for_completed()
+                for future in completed:
+                    wp_done = future_to_work.pop(future)
+                    used_memory -= wp_done.memory
+                    logger.debug(
+                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
                     )
-                    used_memory -= released_mem
+                    for column in wp_done.columns:
+                        encoded_slices[column] += 1
+                        if encoded_slices[column] == len(slices):
+                            # Do this synchronously for simplicity. Should be
+                            # fine as the workers will probably be busy with
+                            # large encode tasks most of the time.
+                            self.finalise_array(column)
+
+            for wp in work:
+                if (
+                    used_memory + wp.memory > max_memory
+                    or len(future_to_work) > max_queued
+                ):
+                    service_completed_futures()
                 future = pwm.submit(wp.func, wp.start, wp.stop)
                 used_memory += wp.memory
-                future_to_memory_use[future] = wp.memory
+                logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
+                future_to_work[future] = wp
+
+            logger.debug("All work submitted")
+            while len(future_to_work) > 0:
+                service_completed_futures()
 
 
 def mkschema(if_path, out):
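
For reference, here is a minimal standalone sketch of the bounded-submission pattern the new encode() loop follows: account for memory per submitted future, service completed futures when the memory budget or queue limit is hit, and finalise a column once all of its slices have been encoded. It substitutes the standard-library concurrent.futures for core.ParallelWorkManager, and the Work, encode_slice and run names are illustrative stand-ins, not part of this codebase.

# Standalone sketch (illustrative names, not bio2zarr code): bounded work
# submission with per-future memory accounting and per-column finalisation,
# using concurrent.futures in place of core.ParallelWorkManager.
import collections
import concurrent.futures
import dataclasses


@dataclasses.dataclass
class Work:
    func: callable = dataclasses.field(repr=False)
    start: int
    stop: int
    columns: list
    memory: int = 0


def encode_slice(start, stop):
    # Stand-in for an expensive per-slice encode task.
    return sum(range(start, stop))


def run(work, num_slices, max_memory, workers=2):
    used_memory = 0
    max_queued = 4 * max(1, workers)
    encoded_slices = collections.Counter()
    future_to_work = {}

    with concurrent.futures.ProcessPoolExecutor(workers) as executor:

        def service_completed_futures():
            nonlocal used_memory
            # Block until at least one outstanding future finishes, then
            # release its memory budget and update per-column progress.
            done, _ = concurrent.futures.wait(
                list(future_to_work), return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                wp_done = future_to_work.pop(future)
                future.result()  # re-raise any worker exception
                used_memory -= wp_done.memory
                for column in wp_done.columns:
                    encoded_slices[column] += 1
                    if encoded_slices[column] == num_slices:
                        # All slices of this column are done; in the real code
                        # this is where finalise_array() renames the "wip_"
                        # array into place.
                        print(f"finalise {column}")

        for wp in work:
            # Assumes no single work item exceeds max_memory (the diff fails
            # early on that case before submitting anything).
            while future_to_work and (
                used_memory + wp.memory > max_memory
                or len(future_to_work) > max_queued
            ):
                service_completed_futures()
            future = executor.submit(wp.func, wp.start, wp.stop)
            used_memory += wp.memory
            future_to_work[future] = wp

        while future_to_work:
            service_completed_futures()


if __name__ == "__main__":
    slices = [(i * 100, (i + 1) * 100) for i in range(8)]
    work = [Work(encode_slice, a, b, ["variant_position"], memory=10) for a, b in slices]
    run(work, num_slices=len(slices), max_memory=25, workers=2)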