25
25
26
26
import bed_reader
27
27
28
+ from . import core
29
+
28
30
logger = logging .getLogger (__name__ )
29
31
30
32
INT_MISSING = - 1
@@ -1109,21 +1111,6 @@ def fixed_field_spec(
1109
1111
)
1110
1112
1111
1113
1112
- @dataclasses .dataclass
1113
- class BufferedArray :
1114
- array : Any
1115
- buff : Any
1116
-
1117
- def __init__ (self , array ):
1118
- self .array = array
1119
- dims = list (array .shape )
1120
- dims [0 ] = min (array .chunks [0 ], array .shape [0 ])
1121
- self .buff = np .zeros (dims , dtype = array .dtype )
1122
-
1123
- def swap_buffers (self ):
1124
- self .buff = np .zeros_like (self .buff )
1125
-
1126
-
1127
1114
class SgvcfZarr :
1128
1115
def __init__ (self , path ):
1129
1116
self .path = pathlib .Path (path )
@@ -1143,7 +1130,7 @@ def create_array(self, variable):
1143
1130
def encode_column (self , pcvcf , column ):
1144
1131
source_col = pcvcf .columns [column .vcf_field ]
1145
1132
array = self .root [column .name ]
1146
- ba = BufferedArray (array )
1133
+ ba = core . BufferedArray (array )
1147
1134
sanitiser = source_col .sanitiser_factory (ba .buff .shape )
1148
1135
chunk_length = array .chunks [0 ]
1149
1136
@@ -1157,9 +1144,7 @@ def encode_column(self, pcvcf, column):
1157
1144
j += 1
1158
1145
if j == chunk_length :
1159
1146
flush_futures (futures )
1160
- futures .extend (
1161
- async_flush_array (executor , ba .buff , ba .array , chunk_start )
1162
- )
1147
+ futures .extend (ba .async_flush (executor , chunk_start ))
1163
1148
ba .swap_buffers ()
1164
1149
j = 0
1165
1150
chunk_start += chunk_length
@@ -1170,16 +1155,14 @@ def encode_column(self, pcvcf, column):
1170
1155
1171
1156
if j != 0 :
1172
1157
flush_futures (futures )
1173
- futures .extend (
1174
- async_flush_array (executor , ba .buff [:j ], ba .array , chunk_start )
1175
- )
1158
+ futures .extend (ba .async_flush (executor , chunk_start , j ))
1176
1159
flush_futures (futures )
1177
1160
1178
1161
def encode_genotypes (self , pcvcf ):
1179
1162
source_col = pcvcf .columns ["FORMAT/GT" ]
1180
- gt = BufferedArray (self .root ["call_genotype" ])
1181
- gt_mask = BufferedArray (self .root ["call_genotype_mask" ])
1182
- gt_phased = BufferedArray (self .root ["call_genotype_phased" ])
1163
+ gt = core . BufferedArray (self .root ["call_genotype" ])
1164
+ gt_mask = core . BufferedArray (self .root ["call_genotype_mask" ])
1165
+ gt_phased = core . BufferedArray (self .root ["call_genotype_phased" ])
1183
1166
chunk_length = gt .array .chunks [0 ]
1184
1167
1185
1168
buffered_arrays = [gt , gt_phased , gt_mask ]
@@ -1200,9 +1183,7 @@ def encode_genotypes(self, pcvcf):
1200
1183
if j == chunk_length :
1201
1184
flush_futures (futures )
1202
1185
for ba in buffered_arrays :
1203
- futures .extend (
1204
- async_flush_array (executor , ba .buff , ba .array , chunk_start )
1205
- )
1186
+ futures .extend (ba .async_flush (executor , chunk_start ))
1206
1187
ba .swap_buffers ()
1207
1188
j = 0
1208
1189
chunk_start += chunk_length
@@ -1214,9 +1195,7 @@ def encode_genotypes(self, pcvcf):
1214
1195
if j != 0 :
1215
1196
flush_futures (futures )
1216
1197
for ba in buffered_arrays :
1217
- futures .extend (
1218
- async_flush_array (executor , ba .buff [:j ], ba .array , chunk_start )
1219
- )
1198
+ futures .extend (ba .async_flush (executor , chunk_start , j ))
1220
1199
flush_futures (futures )
1221
1200
1222
1201
def encode_alleles (self , pcvcf ):
@@ -1417,45 +1396,6 @@ def convert(
1417
1396
os .rename (write_path , path )
1418
1397
1419
1398
1420
- def sync_flush_array (np_buffer , zarr_array , offset ):
1421
- zarr_array [offset : offset + np_buffer .shape [0 ]] = np_buffer
1422
-
1423
-
1424
- def async_flush_array (executor , np_buffer , zarr_array , offset ):
1425
- """
1426
- Flush the specified chunk aligned buffer to the specified zarr array.
1427
- """
1428
- logger .debug (f"Schedule flush { zarr_array } @ { offset } " )
1429
- assert zarr_array .shape [1 :] == np_buffer .shape [1 :]
1430
- # print("sync", zarr_array, np_buffer)
1431
-
1432
- if len (np_buffer .shape ) == 1 :
1433
- futures = [executor .submit (sync_flush_array , np_buffer , zarr_array , offset )]
1434
- else :
1435
- futures = async_flush_2d_array (executor , np_buffer , zarr_array , offset )
1436
- return futures
1437
-
1438
-
1439
- def async_flush_2d_array (executor , np_buffer , zarr_array , offset ):
1440
- # Flush each of the chunks in the second dimension separately
1441
- s = slice (offset , offset + np_buffer .shape [0 ])
1442
-
1443
- def flush_chunk (start , stop ):
1444
- zarr_array [s , start :stop ] = np_buffer [:, start :stop ]
1445
-
1446
- chunk_width = zarr_array .chunks [1 ]
1447
- zarr_array_width = zarr_array .shape [1 ]
1448
- start = 0
1449
- futures = []
1450
- while start < zarr_array_width :
1451
- stop = min (start + chunk_width , zarr_array_width )
1452
- future = executor .submit (flush_chunk , start , stop )
1453
- futures .append (future )
1454
- start = stop
1455
-
1456
- return futures
1457
-
1458
-
1459
1399
def generate_spec (columnarised , out ):
1460
1400
pcvcf = PickleChunkedVcf .load (columnarised )
1461
1401
spec = ZarrConversionSpec .generate (pcvcf )
@@ -1516,9 +1456,9 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
1516
1456
1517
1457
store = zarr .DirectoryStore (zarr_path )
1518
1458
root = zarr .group (store = store )
1519
- gt = BufferedArray (root ["call_genotype" ])
1520
- gt_mask = BufferedArray (root ["call_genotype_mask" ])
1521
- gt_phased = BufferedArray (root ["call_genotype_phased" ])
1459
+ gt = core . BufferedArray (root ["call_genotype" ])
1460
+ gt_mask = core . BufferedArray (root ["call_genotype_mask" ])
1461
+ gt_phased = core . BufferedArray (root ["call_genotype_phased" ])
1522
1462
chunk_length = gt .array .chunks [0 ]
1523
1463
assert start_variant % chunk_length == 0
1524
1464
@@ -1547,9 +1487,7 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
1547
1487
assert j <= chunk_length
1548
1488
flush_futures (futures )
1549
1489
for ba in buffered_arrays :
1550
- futures .extend (
1551
- async_flush_array (executor , ba .buff [:j ], ba .array , start )
1552
- )
1490
+ ba .async_flush (extend , start , j )
1553
1491
ba .swap_buffers ()
1554
1492
start = stop
1555
1493
flush_futures (futures )
0 commit comments