 import logging
 import os
 import pickle
+import time
 
 import dask
 import dask.array as da
@@ -247,7 +248,9 @@ def release_shared_memory(self):
     def delete_shared_memory_files(key):
         delete_shared_memory_files(key)
 
-    def to_shared_memory(self, key=None, mode="r+", _dupe=True):
+    def to_shared_memory(
+        self, key=None, mode="r+", _dupe=True, dask_scheduler="threads"
+    ):
         """
         Load this Dataset into shared memory.
 
@@ -262,9 +265,13 @@ def to_shared_memory(self, key=None, mode="r+", _dupe=True):
             An identifying key for this shared memory. Use the same key
             in `from_shared_memory` to recreate this Dataset elsewhere.
         mode : {'r+', 'r', 'w+', 'c'}, optional
-            This methid returns a copy of the Dataset in shared memory.
+            This method returns a copy of the Dataset in shared memory.
             If memmapped, that copy can be opened in various modes.
             See numpy.memmap() for details.
+        dask_scheduler : str, default 'threads'
+            The scheduler to use when loading dask arrays into shared memory.
+            Typically "threads" for multi-threaded reads or "synchronous"
+            for single-threaded reads. See dask.compute() for details.
 
         Returns
         -------
@@ -287,6 +294,7 @@ def to_shared_memory(self, key=None, mode="r+", _dupe=True):
         def emit(k, a, is_coord):
             nonlocal names, wrappers, sizes, position
             if sparse is not None and isinstance(a.data, sparse.GCXS):
+                logger.info(f"preparing sparse array {a.name}")
                 wrappers.append(
                     {
                         "sparse": True,
@@ -308,6 +316,7 @@ def emit(k, a, is_coord):
                 )
                 a_nbytes = a.data.nbytes
             else:
+                logger.info(f"preparing dense array {a.name}")
                 wrappers.append(
                     {
                         "dims": a.dims,
@@ -335,19 +344,23 @@ def emit(k, a, is_coord):
             emit(k, a, False)
 
         mem = create_shared_memory_array(key, size=position)
+
+        logger.info("declaring shared memory buffer")
         if key.startswith("memmap:"):
             buffer = memoryview(mem)
         else:
             buffer = mem.buf
 
         tasks = []
+        task_names = []
         for w in wrappers:
             _is_sparse = w.get("sparse", False)
             _size = w["nbytes"]
             _name = w["name"]
             _pos = w["position"]
             a = self._obj[_name]
             if _is_sparse:
+                logger.info(f"running load task: {_name} ({si_units(_size)})")
                 ad = a.data
                 _size_d = w["data.nbytes"]
                 _size_i = w["indices.nbytes"]
@@ -373,19 +386,30 @@ def emit(k, a, is_coord):
                 mem_arr_i[:] = ad.indices[:]
                 mem_arr_p[:] = ad.indptr[:]
             else:
+                logger.info(f"preparing load task: {_name} ({si_units(_size)})")
                 mem_arr = np.ndarray(
                     shape=a.shape, dtype=a.dtype, buffer=buffer[_pos : _pos + _size]
                 )
                 if isinstance(a, xr.DataArray) and isinstance(a.data, da.Array):
                     tasks.append(da.store(a.data, mem_arr, lock=False, compute=False))
+                    task_names.append(_name)
                 else:
                     mem_arr[:] = a[:]
         if tasks:
-            dask.compute(tasks, scheduler="threads")
+            t = time.time()
+            logger.info(f"running {len(tasks)} dask data load tasks")
+            if dask_scheduler == "synchronous":
+                for task, task_name in zip(tasks, task_names):
+                    logger.info(f"running load task: {task_name}")
+                    dask.compute(task, scheduler=dask_scheduler)
+            else:
+                dask.compute(tasks, scheduler=dask_scheduler)
+            logger.info(f"completed dask data load in {time.time()-t:.3f} seconds")
 
         if key.startswith("memmap:"):
             mem.flush()
 
+        logger.info("storing metadata in shared memory")
         create_shared_list(
             [pickle.dumps(self._obj.attrs)] + [pickle.dumps(i) for i in wrappers], key
         )
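
For reference, here is a self-contained sketch of the two scheduling paths the new dask_scheduler branch chooses between, using plain NumPy arrays in place of the shared-memory buffer; all names below are illustrative and not part of the change:

import dask
import dask.array as da
import numpy as np

# Stand-ins for the per-variable views into the shared-memory buffer.
sources = [da.random.random((1000, 1000), chunks=(500, 500)) for _ in range(3)]
targets = [np.empty((1000, 1000)) for _ in range(3)]

# Build the store tasks without computing, as the loop over `wrappers` does.
tasks = [
    da.store(src, tgt, lock=False, compute=False)
    for src, tgt in zip(sources, targets)
]

# Default path: hand all tasks to a single multi-threaded dask.compute() call.
dask.compute(tasks, scheduler="threads")

# "synchronous" path: run the tasks one at a time in the calling thread,
# so each array's load can be logged and timed individually.
for task in tasks:
    dask.compute(task, scheduler="synchronous")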
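
As a usage illustration (not part of the diff): a minimal sketch of calling to_shared_memory() with the new parameter and surfacing the added INFO-level progress messages. The accessor name `shm`, the example dataset, and the key are hypothetical placeholders; only the method name, its arguments, and the logging behaviour come from the change above.

import logging

import dask.array as da
import xarray as xr

# Show the logger.info() progress messages added by this change.
logging.basicConfig(level=logging.INFO)

# Hypothetical dask-backed Dataset; any chunked Dataset would do.
ds = xr.Dataset(
    {"air": (("time", "x"), da.random.random((1000, 500), chunks=(100, 500)))}
)

# "shm" is a placeholder accessor name; the real accessor is defined elsewhere
# in this library and is not shown in the diff.
ds.shm.to_shared_memory(
    key="example-dataset",         # reuse the same key in from_shared_memory()
    dask_scheduler="synchronous",  # load one array at a time, logging each task
)

With the default dask_scheduler="threads" all store tasks are submitted in one dask.compute() call; "synchronous" trades parallelism for per-array log messages, which can make slow or memory-hungry loads easier to diagnose.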