
Commit 9bc0bae

Parallelize umap with process pools (#221)
- We use a process pool to do the transform
- We use another process pool to do writing of inference data out (also impacts infer verb)
- Some of the loudest warnings from the umap package have been suppressed
1 parent 32a062b commit 9bc0bae
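
For the warnings change: umap's FutureWarnings are silenced only around the UMAP calls using Python's standard warnings filtering. A minimal standalone sketch of that pattern (not fibad code; noisy_umap_call is a made-up stand-in for the umap calls that emit the warnings):

import warnings

def noisy_umap_call():
    # Hypothetical stand-in for a umap.UMAP fit/transform that emits a FutureWarning.
    warnings.warn("this default will change in a future release", FutureWarning)
    return "embedding"

# The filter is active only inside the block and is restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore", category=FutureWarning)
    result = noisy_umap_call()

print(result)  # warnings raised outside the block are unaffected

The commit applies the same filter again inside the worker function (_transform_batch), presumably because worker processes do not always inherit the parent's warning filters (for example under the "spawn" start method).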

File tree

2 files changed: +94 -21 lines changed


src/fibad/data_sets/inference_dataset.py

Lines changed: 33 additions & 6 deletions
@@ -1,5 +1,6 @@
 import logging
 from collections.abc import Generator
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Optional, Union
 
@@ -168,6 +169,7 @@ def __init__(self, result_dir: Union[str, Path]):
 
         self.all_ids = np.array([], dtype=np.int64)
         self.all_batch_nums = np.array([], dtype=np.int64)
+        self.writer_pool = Pool()
 
     def write_batch(self, ids: np.ndarray, tensors: list[np.ndarray]):
         """Write a batch of tensors into the dataset. This writes the whole batch immediately.
@@ -197,22 +199,47 @@ def write_batch(self, ids: np.ndarray, tensors: list[np.ndarray]):
         if savepath.exists():
             RuntimeError(f"Writing objects in batch {self.batch_index} but {filename} already exists.")
 
-        np.save(savepath, structured_batch, allow_pickle=False)
+        self.writer_pool.apply_async(
+            func=np.save, args=(savepath, structured_batch), kwds={"allow_pickle": False}
+        )
+
         self.all_ids = np.append(self.all_ids, ids)
         self.all_batch_nums = np.append(self.all_batch_nums, np.full(batch_len, self.batch_index))
 
         self.batch_index += 1
 
     def write_index(self):
-        """Writes out the batch index built up by this object over multiple write_batch calls."""
+        """Writes out the batch index built up by this object over multiple write_batch calls.
+        See save_batch_index for details.
+        """
+        # First ensure we are done writing out all batches
+        self.writer_pool.close()
+        self.writer_pool.join()
+
+        # Then write out the batch index.
+        InferenceDataSetWriter.save_batch_index(self.result_dir, self.all_ids, self.all_batch_nums)
+
+    @staticmethod
+    def save_batch_index(result_dir: Path, all_ids: np.ndarray, all_batch_nums: np.ndarray):
+        """Save a batch index in the result directory provided
+
+        Parameters
+        ----------
+        result_dir : Path
+            The results directory
+        all_ids : np.ndarray
+            All IDs to write out.
+        all_batch_nums : np.ndarray
+            The corresponding batch numbers for the IDs provided.
+        """
         batch_index_dtype = np.dtype([("id", np.int64), ("batch_num", np.int64)])
-        batch_index = np.zeros(len(self.all_ids), batch_index_dtype)
-        batch_index["id"] = np.array(self.all_ids)
-        batch_index["batch_num"] = np.array(self.all_batch_nums)
+        batch_index = np.zeros(len(all_ids), batch_index_dtype)
+        batch_index["id"] = np.array(all_ids)
+        batch_index["batch_num"] = np.array(all_batch_nums)
         batch_index.sort(order="id")
 
         filename = "batch_index.npy"
-        savepath = self.result_dir / filename
+        savepath = result_dir / filename
         if savepath.exists():
             RuntimeError("The path to save batch index already exists.")
         np.save(savepath, batch_index, allow_pickle=False)
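
The effect of the change above is that each batch's np.save becomes a fire-and-forget task on a writer pool, and write_index() closes and joins that pool so every batch file is on disk before the index itself is written. A minimal standalone sketch of the same pattern (not fibad code; AsyncArrayWriter and its methods are illustrative names):

from multiprocessing import Pool
from pathlib import Path

import numpy as np

class AsyncArrayWriter:
    """Queues np.save calls on a process pool and flushes them on finish()."""

    def __init__(self, out_dir: Path):
        out_dir.mkdir(parents=True, exist_ok=True)
        self.out_dir = out_dir
        self.pool = Pool()  # defaults to cpu_count() worker processes
        self.batch_index = 0

    def write_batch(self, batch: np.ndarray):
        savepath = self.out_dir / f"batch_{self.batch_index}.npy"
        # apply_async returns immediately; the save runs in a worker process.
        self.pool.apply_async(func=np.save, args=(savepath, batch), kwds={"allow_pickle": False})
        self.batch_index += 1

    def finish(self):
        # close() stops accepting new work; join() blocks until queued saves are done,
        # so anything written afterwards (such as an index file) sees all batches on disk.
        self.pool.close()
        self.pool.join()

if __name__ == "__main__":
    writer = AsyncArrayWriter(Path("batches_out"))  # hypothetical output directory
    for _ in range(3):
        writer.write_batch(np.random.rand(8, 4))
    writer.finish()  # all three .npy files exist once this returns

One caveat of this style of apply_async usage: an exception inside np.save is only surfaced if the returned AsyncResult is checked or an error_callback is supplied, so a failed write can pass silently.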

src/fibad/verbs/umap.py

Lines changed: 61 additions & 15 deletions
@@ -1,6 +1,8 @@
 import logging
 import pickle
+import warnings
 from argparse import ArgumentParser, Namespace
+from multiprocessing import cpu_count
 from pathlib import Path
 from typing import Optional, Union
 
@@ -60,13 +62,21 @@ def run(self, input_dir: Optional[Union[Path, str]] = None):
         None
             The method does not return anything but saves the UMAP representations to disk.
         """
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="ignore", category=FutureWarning)
+            return self._run(input_dir)
+
+    def _run(self, input_dir: Optional[Union[Path, str]] = None):
+        """See run()"""
+        from multiprocessing import Pool
+
         import umap
         from tqdm.auto import tqdm
 
         from fibad.config_utils import create_results_dir
         from fibad.data_sets.inference_dataset import InferenceDataSet, InferenceDataSetWriter
 
-        reducer = umap.UMAP(**self.config["umap.UMAP"])
+        self.reducer = umap.UMAP(**self.config["umap.UMAP"])
 
         # Set up the results directory where we will store our umapped output
         results_dir = create_results_dir(self.config, "umap")
@@ -87,29 +97,65 @@ def run(self, input_dir: Optional[Union[Path, str]] = None):
         data_sample = inference_results[index_choices].numpy().reshape((sample_size, -1))
 
         # Fit a single reducer on the sampled data
-        reducer.fit(data_sample)
+        self.reducer.fit(data_sample)
 
         # Save the reducer to our results directory
         with open(results_dir / "umap.pickle", "wb") as f:
-            pickle.dump(reducer, f)
+            pickle.dump(self.reducer, f)
 
         # Run all data through the reducer in batches, writing it out as we go.
         batch_size = self.config["data_loader"]["batch_size"]
         num_batches = int(np.ceil(total_length / batch_size))
 
         all_indexes = np.arange(0, total_length)
         all_ids = np.array([int(i) for i in inference_results.ids()])
-        for batch_indexes in tqdm(
-            np.array_split(all_indexes, num_batches),
-            desc="Creating Lower Dimensional Representation using UMAP",
-            total=num_batches,
-        ):
-            # We flatten all dimensions of the input array except the dimension
-            # corresponding to batch elements. This ensures that all inputs to
-            # the UMAP algorithm are flattend per input item in the batch
-            batch = inference_results[batch_indexes].reshape(len(batch_indexes), -1)
-            batch_ids = all_ids[batch_indexes]
-            transformed_batch = reducer.transform(batch)
-            umap_results.write_batch(batch_ids, transformed_batch)
+
+        # Process pool to do all the transforms
+        with Pool(processes=cpu_count()) as pool:
+            # Generator expression that gives a batch tuple composed of:
+            # batch ids, inference results
+            args = (
+                (
+                    all_ids[batch_indexes],
+                    # We flatten all dimensions of the input array except the dimension
+                    # corresponding to batch elements. This ensures that all inputs to
+                    # the UMAP algorithm are flattend per input item in the batch
+                    inference_results[batch_indexes].reshape(len(batch_indexes), -1),
+                )
+                for batch_indexes in np.array_split(all_indexes, num_batches)
+            )
+
+            # iterate over the mapped results to write out the umapped points
+            # imap returns results as they complete so writing should complete in parallel for large datasets
+            for batch_ids, transformed_batch in tqdm(
+                pool.imap(self._transform_batch, args),
+                desc="Creating LowerDimensional Representation using UMAP:",
+                total=num_batches,
+            ):
+                logger.debug("Writing a batch out async...")
+                umap_results.write_batch(batch_ids, transformed_batch)
 
         umap_results.write_index()
+
+    def _transform_batch(self, batch_tuple: tuple):
+        """Private helper to transform a single batch
+
+        Parameters
+        ----------
+        batch_tuple : tuple()
+            first element is the IDs of the batch as a numpy array
+            second element is the inference results to transform as a numpy array with shape (batch_len, N)
+            where N is the total number of dimensions in the inference result. Caller flattens all inference
+            result axes for us.
+
+        Returns
+        -------
+        tuple
+            first element is the ids of the batch as a numpy array
+            second element is the results of running the umap transform on the input as a numpy array.
        """
+        batch_ids, batch = batch_tuple
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="ignore", category=FutureWarning)
+            logger.debug("Transforming a batch ...")
+            return (batch_ids, self.reducer.transform(batch))
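
The loop above replaces the serial transform-then-write loop with pool.imap over a generator of (ids, batch) tuples, so batches are transformed in worker processes while the main process writes finished results. A small self-contained sketch of that pattern (not fibad code; transform_batch is a trivial stand-in for the UMAP transform):

from multiprocessing import Pool, cpu_count

import numpy as np

def transform_batch(batch_tuple):
    # Stand-in for the real per-batch transform; here it is just a row mean.
    batch_ids, batch = batch_tuple
    return batch_ids, batch.mean(axis=1, keepdims=True)

if __name__ == "__main__":
    data = np.random.rand(1000, 16)
    ids = np.arange(1000)
    num_batches = 10

    # Generator of (ids, rows) tuples; each tuple is pickled and shipped to a worker.
    args = (
        (ids[batch_indexes], data[batch_indexes])
        for batch_indexes in np.array_split(np.arange(len(data)), num_batches)
    )

    with Pool(processes=cpu_count()) as pool:
        # imap yields results in input order as each one completes, so the
        # main process can write a finished batch while others are still running.
        for batch_ids, transformed in pool.imap(transform_batch, args):
            print(int(batch_ids[0]), transformed.shape)

Compared with pool.map, imap lets writing start as soon as the first batch is transformed instead of waiting for the whole dataset, which is what allows the writer pool from inference_dataset.py to overlap with the remaining transforms.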
