@@ -48,7 +48,7 @@ def display_number(x):
48
48
49
49
50
50
def display_size (n ):
51
- return humanfriendly .format_size (n )
51
+ return humanfriendly .format_size (n , binary = True )
52
52
53
53
54
54
@dataclasses .dataclass
@@ -1289,6 +1289,7 @@ class EncodingWork:
1289
1289
func : callable
1290
1290
start : int
1291
1291
stop : int
1292
+ memory : int = 0
1292
1293
1293
1294
1294
1295
class VcfZarrWriter :
@@ -1478,7 +1479,17 @@ def encode(
1478
1479
worker_processes = 1 ,
1479
1480
max_v_chunks = None ,
1480
1481
show_progress = False ,
1482
+ max_memory = None ,
1481
1483
):
1484
+ if max_memory is None :
1485
+ # Unbounded
1486
+ max_memory = 2 ** 63
1487
+ else :
1488
+ # Value is specified in Mibibytes
1489
+ max_memory *= 2 ** 20
1490
+
1491
+ # TODO this will move into the setup logic later when we're making it possible
1492
+ # to split the work by slice
1482
1493
num_slices = max (1 , worker_processes * 4 )
1483
1494
# Using POS arbitrarily to get the array slices
1484
1495
slices = core .chunk_aligned_slices (
@@ -1492,8 +1503,16 @@ def encode(
1492
1503
array .resize (shape )
1493
1504
1494
1505
total_bytes = 0
1506
+ encoding_memory_requirements = {}
1495
1507
for col in self .schema .columns .values ():
1496
1508
array = self .get_array (col .name )
1509
+ # NOTE!! this is bad, we're potentially creating quite a large
1510
+ # numpy array for basically nothing. We can compute this.
1511
+ variant_chunk_size = array .blocks [0 ].nbytes
1512
+ encoding_memory_requirements [col .name ] = variant_chunk_size
1513
+ logger .debug (
1514
+ f"{ col .name } requires at least { display_size (variant_chunk_size )} per worker"
1515
+ )
1497
1516
total_bytes += array .nbytes
1498
1517
1499
1518
filter_id_map = self .encode_filter_id ()
@@ -1504,7 +1523,11 @@ def encode(
1504
1523
for col in self .schema .columns .values ():
1505
1524
if col .vcf_field is not None :
1506
1525
f = functools .partial (self .encode_array_slice , col )
1507
- work .append (EncodingWork (f , start , stop ))
1526
+ work .append (
1527
+ EncodingWork (
1528
+ f , start , stop , encoding_memory_requirements [col .name ]
1529
+ )
1530
+ )
1508
1531
work .append (EncodingWork (self .encode_alleles_slice , start , stop ))
1509
1532
work .append (EncodingWork (self .encode_id_slice , start , stop ))
1510
1533
work .append (
@@ -1522,18 +1545,51 @@ def encode(
1522
1545
)
1523
1546
)
1524
1547
if "call_genotype" in self .schema .columns :
1525
- work .append (EncodingWork (self .encode_genotypes_slice , start , stop ))
1548
+ gt_memory = sum (
1549
+ encoding_memory_requirements [name ]
1550
+ for name in [
1551
+ "call_genotype" ,
1552
+ "call_genotype_phased" ,
1553
+ "call_genotype_mask" ,
1554
+ ]
1555
+ )
1556
+ work .append (
1557
+ EncodingWork (self .encode_genotypes_slice , start , stop , gt_memory )
1558
+ )
1559
+ # Fail early if we can't fit a particular column into memory
1560
+ for wp in work :
1561
+ if wp .memory >= max_memory :
1562
+ raise ValueError (f"Insufficient memory for { wp .func } : "
1563
+ f"{ display_size (wp .memory )} > { display_size (max_memory )} " )
1564
+
1526
1565
1527
1566
progress_config = core .ProgressConfig (
1528
1567
total = total_bytes ,
1529
1568
title = "Encode" ,
1530
1569
units = "B" ,
1531
1570
show = show_progress ,
1532
1571
)
1572
+ used_memory = 0
1533
1573
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1534
- pwm .submit (self .encode_samples )
1574
+ future = pwm .submit (self .encode_samples )
1575
+ future_to_memory_use = {future : 0 }
1535
1576
for wp in work :
1536
- pwm .submit (wp .func , wp .start , wp .stop )
1577
+ while used_memory + wp .memory >= max_memory :
1578
+ logger .info (
1579
+ f"Memory budget { display_size (max_memory )} exceeded: "
1580
+ f"used={ display_size (used_memory )} needed={ display_size (wp .memory )} "
1581
+ )
1582
+ futures = pwm .wait_for_completed (timeout = 5 )
1583
+ released_mem = sum (
1584
+ future_to_memory_use .pop (future ) for future in futures
1585
+ )
1586
+ logger .info (
1587
+ f"{ len (futures )} completed, released { display_size (released_mem )} "
1588
+ )
1589
+ used_memory -= released_mem
1590
+ future = pwm .submit (wp .func , wp .start , wp .stop )
1591
+ used_memory += wp .memory
1592
+ future_to_memory_use [future ] = wp .memory
1537
1593
1538
1594
1539
1595
def mkschema (if_path , out ):
@@ -1549,6 +1605,7 @@ def encode(
1549
1605
chunk_length = None ,
1550
1606
chunk_width = None ,
1551
1607
max_v_chunks = None ,
1608
+ max_memory = None ,
1552
1609
worker_processes = 1 ,
1553
1610
show_progress = False ,
1554
1611
):
@@ -1574,6 +1631,7 @@ def encode(
1574
1631
vzw .encode (
1575
1632
max_v_chunks = max_v_chunks ,
1576
1633
worker_processes = worker_processes ,
1634
+ max_memory = max_memory ,
1577
1635
show_progress = show_progress ,
1578
1636
)
1579
1637
vzw .finalise ()
0 commit comments