Skip to content

Commit 690a308

Browse files
authored
Merge pull request #313 from dmgav/dask-fix
Fix issues with recent versions of Dask and Numba
2 parents cec2606 + b7f7f48 commit 690a308

File tree

8 files changed

+152
-49
lines changed

8 files changed

+152
-49
lines changed

.github/workflows/testing.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
matrix:
1717
host-os: ["ubuntu-latest", "macos-latest", "windows-latest"]
1818
python-version: ["3.9", "3.10", "3.11"]
19-
numpy-version: ["1.24"]
19+
numpy-version: ["1.26"]
2020
pyqt-version: ["5.15"]
2121
include:
2222
- host-os: "ubuntu-latest"
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# The original code for serializers/deserializers can be found in
2+
# 'distributed/protocol/h5py.py'
3+
4+
import distributed.protocol.h5py # noqa: F401
5+
from distributed.protocol.serialize import dask_deserialize, dask_serialize
6+
7+
deserialized_files = set()
8+
9+
10+
def serialize_h5py_file(f):
11+
if f and (f.mode != "r"):
12+
raise ValueError("Can only serialize read-only h5py files")
13+
filename = f.filename if f else None
14+
return {"filename": filename}, []
15+
16+
17+
def serialize_h5py_dataset(x):
18+
header, _ = serialize_h5py_file(x.file if x else None)
19+
header["name"] = x.name if x else None
20+
return header, []
21+
22+
23+
def deserialize_h5py_file(header, frames):
24+
import h5py
25+
26+
filename = header["filename"]
27+
if filename:
28+
file = h5py.File(filename, mode="r")
29+
deserialized_files.add(file)
30+
else:
31+
file = None
32+
return file
33+
34+
35+
def deserialize_h5py_dataset(header, frames):
36+
file = deserialize_h5py_file(header, frames)
37+
name = header["name"]
38+
dset = file[name] if (file and name) else None
39+
return dset
40+
41+
42+
def dask_set_custom_serializers():
43+
import h5py
44+
45+
dask_serialize.register((h5py.Group, h5py.Dataset), serialize_h5py_dataset)
46+
dask_serialize.register(h5py.File, serialize_h5py_file)
47+
dask_deserialize.register((h5py.Group, h5py.Dataset), deserialize_h5py_dataset)
48+
dask_deserialize.register(h5py.File, deserialize_h5py_file)
49+
50+
51+
def dask_close_all_files():
52+
while deserialized_files:
53+
file = deserialized_files.pop()
54+
if file:
55+
file.close()

pyxrf/core/map_processing.py

Lines changed: 88 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from numba import jit
1515
from progress.bar import Bar
1616

17+
from .dask_h5py_serializers import dask_close_all_files, dask_set_custom_serializers
1718
from .fitting import fit_spectrum
1819

1920
logger = logging.getLogger(__name__)
@@ -519,6 +520,9 @@ def compute_total_spectrum(
519520
else:
520521
client_is_local = False
521522

523+
client.run(dask_set_custom_serializers)
524+
dask_set_custom_serializers()
525+
522526
n_workers = len(client.scheduler_info()["workers"])
523527
logger.info(f"Dask distributed client: {n_workers} workers")
524528

@@ -536,9 +540,12 @@ def compute_total_spectrum(
536540
if file_obj:
537541
file_obj.close()
538542

539-
# The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
540-
del result_fut
541-
_dask_release_file_descriptors(client=client)
543+
client.run(dask_close_all_files)
544+
dask_close_all_files()
545+
546+
# # The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
547+
# del result_fut
548+
# _dask_release_file_descriptors(client=client)
542549

543550
if client_is_local:
544551
client.close()
@@ -614,6 +621,9 @@ def compute_total_spectrum_and_count(
614621
else:
615622
client_is_local = False
616623

624+
client.run(dask_set_custom_serializers)
625+
dask_set_custom_serializers()
626+
617627
n_workers = len(client.scheduler_info()["workers"])
618628
logger.info(f"Dask distributed client: {n_workers} workers")
619629

@@ -632,9 +642,12 @@ def compute_total_spectrum_and_count(
632642
if file_obj:
633643
file_obj.close()
634644

635-
# The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
636-
del result_fut
637-
_dask_release_file_descriptors(client=client)
645+
client.run(dask_close_all_files)
646+
dask_close_all_files()
647+
648+
# # The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
649+
# del result_fut
650+
# _dask_release_file_descriptors(client=client)
638651

639652
if client_is_local:
640653
client.close()
@@ -710,30 +723,30 @@ def _fit_xrf_block(data, data_sel_indices, matv, snip_param, use_snip):
710723
return data_out
711724

712725

713-
def _dask_release_file_descriptors(*, client):
714-
"""
715-
Make sure the Dask Client releases descriptors of the HDF5 files opened in read-only mode
716-
so that they could be opened for reading.
717-
"""
718-
# Runs small task on Dask client. Starting from v2021.7.0, Dask Distributed does not always
719-
# close HDF5 files, that are open in read-only mode for loading raw data. Submitting and
720-
# computing a small unrelated tasks seem to prompt the client to release the resources from
721-
# the previous task and close the files.
722-
rfut = da.sum(da.random.random((1000,), chunks=(10,))).persist(scheduler=client)
723-
rfut.compute(scheduler=client)
726+
# def _dask_release_file_descriptors(*, client):
727+
# """
728+
# Make sure the Dask Client releases descriptors of the HDF5 files opened in read-only mode
729+
# so that they could be opened for reading.
730+
# """
731+
# # Runs small task on Dask client. Starting from v2021.7.0, Dask Distributed does not always
732+
# # close HDF5 files, that are open in read-only mode for loading raw data. Submitting and
733+
# # computing a small unrelated tasks seem to prompt the client to release the resources from
734+
# # the previous task and close the files.
735+
# rfut = da.sum(da.random.random((1000,), chunks=(10,))).persist(scheduler=client)
736+
# rfut.compute(scheduler=client)
724737

725-
current_os = platform.system()
726-
if current_os == "Linux":
727-
# Starting with Dask/Distributed version 2022.2.0 the following step is required:
728-
# https://distributed.dask.org/en/stable/worker-memory.html#manually-trim-memory
729-
# (works for Linux only, there are different solutions for other OS if needed)
730-
import ctypes
738+
# current_os = platform.system()
739+
# if current_os == "Linux":
740+
# # Starting with Dask/Distributed version 2022.2.0 the following step is required:
741+
# # https://distributed.dask.org/en/stable/worker-memory.html#manually-trim-memory
742+
# # (works for Linux only, there are different solutions for other OS if needed)
743+
# import ctypes
731744

732-
def trim_memory() -> int:
733-
libc = ctypes.CDLL("libc.so.6")
734-
return libc.malloc_trim(0)
745+
# def trim_memory() -> int:
746+
# libc = ctypes.CDLL("libc.so.6")
747+
# return libc.malloc_trim(0)
735748

736-
client.run(trim_memory)
749+
# client.run(trim_memory)
737750

738751

739752
def fit_xrf_map(
@@ -857,6 +870,9 @@ def fit_xrf_map(
857870
else:
858871
client_is_local = False
859872

873+
client.run(dask_set_custom_serializers)
874+
dask_set_custom_serializers()
875+
860876
n_workers = len(client.scheduler_info()["workers"])
861877
logger.info(f"Dask distributed client: {n_workers} workers")
862878

@@ -881,9 +897,12 @@ def fit_xrf_map(
881897
if data_is_from_file:
882898
file_obj.close()
883899

884-
# The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
885-
del result_fut
886-
_dask_release_file_descriptors(client=client)
900+
client.run(dask_close_all_files)
901+
dask_close_all_files()
902+
903+
# # The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
904+
# del result_fut
905+
# _dask_release_file_descriptors(client=client)
887906

888907
if client_is_local:
889908
client.close()
@@ -1070,6 +1089,9 @@ def compute_selected_rois(
10701089
else:
10711090
client_is_local = False
10721091

1092+
client.run(dask_set_custom_serializers)
1093+
dask_set_custom_serializers()
1094+
10731095
n_workers = len(client.scheduler_info()["workers"])
10741096
logger.info(f"Dask distributed client: {n_workers} workers")
10751097

@@ -1102,9 +1124,12 @@ def compute_selected_rois(
11021124
if file_obj:
11031125
file_obj.close()
11041126

1127+
client.run(dask_close_all_files)
1128+
dask_close_all_files()
1129+
11051130
# The following code is needed to cause Dask 'distributed>=2021.7.0' to close the h5file.
1106-
del result_fut
1107-
_dask_release_file_descriptors(client=client)
1131+
# del result_fut
1132+
# _dask_release_file_descriptors(client=client)
11081133

11091134
if client_is_local:
11101135
client.close()
@@ -1237,14 +1262,37 @@ def snip_method_numba(
12371262
# where there are peaks. On the boundary part, we don't care
12381263
# the accuracy so much. But we need to pay attention to edge
12391264
# effects in general convolution.
1240-
A = s.sum()
1241-
1242-
background = np.convolve(background, s) / A
1243-
# Trim 'background' array to imitate the np.convolve option 'mode="same"'
1244-
mg = len(s) - 1
1245-
n_beg = mg // 2
1246-
n_end = n_beg - mg # Negative
1247-
background = background[n_beg:n_end]
1265+
1266+
def convolve(background, s):
1267+
# Modifies the contents of the 'background' array.
1268+
# This implementation of convolution replaces the original
1269+
# implementation based on 'np.convolve'. Seems to work as fast
1270+
# as the original implementation.
1271+
s_len = len(s)
1272+
n_beg = (s_len - 1) // 2
1273+
A = s.sum()
1274+
source = np.hstack(
1275+
(
1276+
np.zeros(n_beg, dtype=background.dtype),
1277+
background,
1278+
np.zeros(s_len - n_beg, dtype=background.dtype),
1279+
)
1280+
)
1281+
for n in range(len(background)):
1282+
background[n] = np.sum(source[n : n + s_len] * s) / A
1283+
1284+
convolve(background, s)
1285+
1286+
# # The following implementation of convolution stopped working because of
1287+
# # unclear issues with 'np.convolve' (gave 'List index out of range' error),
1288+
# # The code is left for reference.
1289+
# A = s.sum()
1290+
# background = np.convolve(background, s) / A
1291+
# # Trim 'background' array to imitate the np.convolve option 'mode="same"'
1292+
# mg = len(s) - 1
1293+
# n_beg = mg // 2
1294+
# n_end = n_beg - mg # Negative
1295+
# background = background[n_beg:n_end]
12481296

12491297
window_p = width * fwhm / e_lin
12501298
if spectral_binning is not None and spectral_binning > 0:

pyxrf/gui_module/wnd_load_quant_calibration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def display_standard_selection_table(self):
344344
ttip = f"Fluorescence (F): {fluorescence:12g}\nDensity (D): {density:12g}\n"
345345
# Avoid very small values of density (probably zero)
346346
if abs(density) > 1e-30:
347-
ttip += f"F/D: {fluorescence/density:12g}"
347+
ttip += f"F/D: {fluorescence / density:12g}"
348348

349349
item.setToolTip(ttip)
350350

pyxrf/model/command_tools.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def get_positions_set(img_dict):
285285
channel_num = len(param_channel_list)
286286
for i in range(channel_num):
287287
inner_path = "xrfmap/" + det_channel_names[i]
288-
print(f"Processing data from detector channel {det_channel_names[i]} (#{i+1}) ...")
288+
print(f"Processing data from detector channel {det_channel_names[i]} (#{i + 1}) ...")
289289

290290
# load param file
291291
param_file_name = param_channel_list[i]
@@ -663,7 +663,7 @@ def pyxrf_batch(
663663
# only ``start_id`` is specified:
664664
# process only one file that contains ``start_id`` in its name
665665
# (only if such file exists)
666-
pattern = f"^[^_]*_{str(start_id)}\D+" # noqa: W605
666+
pattern = f"^[^_]*_{str(start_id)}\\D+" # noqa: W605
667667
flist = [fname for fname in all_files if re.search(pattern, os.path.basename(fname))]
668668

669669
if len(flist) < 1:
@@ -679,7 +679,7 @@ def pyxrf_batch(
679679
# select files, which contain the respective ID substring in their names
680680
flist = []
681681
for data_id in range(start_id, end_id + 1):
682-
pattern = f"^[^_]*_{str(data_id)}\D+" # noqa: W605
682+
pattern = f"^[^_]*_{str(data_id)}\\D+" # noqa: W605
683683
flist += [fname for fname in all_files if re.search(pattern, os.path.basename(fname))]
684684
if len(flist) < 1:
685685
print(f"No files with Scan IDs in the range {start_id} .. {end_id} were found.")

pyxrf/model/fit_spectrum.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,7 @@ def save2Dmap_to_hdf(self, *, calculation_info=None, pixel_fit="nnls"):
774774
# det1, det2, ... , i.e. 'det' followed by integer number.
775775
# The channel name is always located at the end of the ``data_title``.
776776
# If the channel name is found, then build the path using this name.
777-
srch = re.search("det\d+$", self.data_title) # noqa: W605
777+
srch = re.search(r"det\d+$", self.data_title) # noqa: W605
778778
if srch:
779779
det_name = srch.group(0)
780780
fit_name = f"{prefix_fname}_{det_name}_fit"

pyxrf/model/roi_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def saveROImap_to_hdf(self, data_dict_roi):
346346
# det1, det2, ... , i.e. 'det' followed by integer number.
347347
# The channel name is always located at the end of the ``data_title``.
348348
# If the channel name is found, then build the path using this name.
349-
srch = re.search("det\d+$", self.data_title) # noqa: W605
349+
srch = re.search(r"det\d+$", self.data_title) # noqa: W605
350350
if srch:
351351
det_name = srch.group(0)
352352
inner_path = f"xrfmap/{det_name}"

pyxrf/xanes_maps/xanes_maps_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3381,12 +3381,12 @@ def _save_xanes_maps_to_tiff(
33813381
print(f" image size (Ny, Nx): ({n_y_pixels}, {n_x_pixels})", file=f_log)
33823382
print(
33833383
f" Y-axis scan range [Y_min, Y_max, abs(Y_max-Y_min)]: "
3384-
f"[{y_min:.5g}, {y_max:.5g}, {abs(y_max-y_min):.5g}]",
3384+
f"[{y_min:.5g}, {y_max:.5g}, {abs(y_max - y_min):.5g}]",
33853385
file=f_log,
33863386
)
33873387
print(
33883388
f" X-axis scan range [X_min, X_max, abs(X_max-X_min)]: "
3389-
f"[{x_min:.5g}, {x_max:.5g}, {abs(x_max-x_min):.5g}]",
3389+
f"[{x_min:.5g}, {x_max:.5g}, {abs(x_max - x_min):.5g}]",
33903390
file=f_log,
33913391
)
33923392

0 commit comments

Comments
 (0)