Skip to content

Commit 1267264

Browse files
multi-gpu inference. Adds 'batch index' to the resulting prediction
dictionary. this allows users to reconstruct the original ordering of predictions with multi-gpu inference. Signed-off-by: Steven <skothenhill@nvidia.com>
1 parent 29ce3bc commit 1267264

File tree

1 file changed

+36
-3
lines changed

1 file changed

+36
-3
lines changed

sub-packages/bionemo-llm/src/bionemo/llm/utils/callbacks.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@
2929

3030

3131
class PredictionWriter(BasePredictionWriter, pl.Callback):
32-
"""A callback that writes predictions to disk at specified intervals during training."""
32+
"""A callback that writes predictions to disk at specified intervals during training.
33+
34+
Logits, Embeddings, Hiddens, Input IDs, and Labels may all be saved to the disk depending on trainer configuration.
35+
Batch indices are provided for each prediction in the same dictionary. These must be used to maintain order between
36+
multi device predictions and single device predictions.
37+
"""
3338

3439
def __init__(
3540
self,
@@ -42,15 +47,28 @@ def __init__(
4247
4348
Args:
4449
output_dir: The directory where predictions will be written.
45-
write_interval: The interval at which predictions will be written. (batch, epoch)
50+
write_interval: The interval at which predictions will be written (batch, epoch). Epoch may not be used with multi-device trainers.
4651
batch_dim_key_defaults: The default batch dimension for each key, if different from the standard 0.
4752
seq_dim_key_defaults: The default sequence dimension for each key, if different from the standard 1.
4853
"""
4954
super().__init__(write_interval)
55+
self.write_interval = write_interval
5056
self.output_dir = str(output_dir)
5157
self.batch_dim_key_defaults = batch_dim_key_defaults
5258
self.seq_dim_key_defaults = seq_dim_key_defaults
5359

60+
def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, *args, **kwargs) -> None: # noqa: D417
61+
"""Invoked when Trainer.fit, validate, test, or predict is called. Will immediately fail when 'write_interval' is 'epoch' and 'trainer.num_devices' > 1.
62+
63+
Args:
64+
trainer: The Trainer instance.
65+
pl_module: The LightningModule instance.
66+
"""
67+
if trainer.num_devices > 1 and self.write_interval == "epoch":
68+
raise ValueError(
69+
"Multi-GPU predictions are not permitted as outputs are not ordered and batch indices are lost."
70+
)
71+
5472
def write_on_batch_end(
5573
self,
5674
trainer: pl.Trainer,
@@ -63,6 +81,9 @@ def write_on_batch_end(
6381
) -> None:
6482
"""Writes predictions to disk at the end of each batch.
6583
84+
Prediction files follow the naming pattern below, where rank is the rank of the GPU on which the predictions were made.
85+
predictions__rank_{rank}__batch_{batch_idx}.pt
86+
6687
Args:
6788
trainer: The Trainer instance.
6889
pl_module: The LightningModule instance.
@@ -78,6 +99,8 @@ def write_on_batch_end(
7899

79100
# batch_indices is not captured due to a lightning bug when return_predictions = False
80101
# we use input IDs in the prediction to map the result to input
102+
prediction["batch_idx"] = batch_idx
103+
81104
torch.save(prediction, result_path)
82105
logging.info(f"Inference predictions are stored in {result_path}\n{prediction.keys()}")
83106

@@ -90,14 +113,23 @@ def write_on_epoch_end(
90113
) -> None:
91114
"""Writes predictions to disk at the end of each epoch.
92115
116+
Writing all predictions on epoch end is memory intensive. It is recommended to use the batch writer instead for
117+
large predictions.
118+
119+
Multi-device predictions will likely yield predictions in an order that is inconsistent with single device predictions and the input data.
120+
93121
Args:
94122
trainer: The Trainer instance.
95123
pl_module: The LightningModule instance.
96124
predictions: The predictions made by the model.
97125
batch_indices: The indices of the batch.
126+
127+
Note:
128+
Multi-GPU predictions are output in an inconsistent order with multiple devices.
98129
"""
99130
# this will create N (num processes) files in `output_dir` each containing
100131
# the predictions of it's respective rank
132+
101133
result_path = os.path.join(self.output_dir, f"predictions__rank_{trainer.global_rank}.pt")
102134

103135
# collate multiple batches / ignore empty ones
@@ -106,13 +138,14 @@ def write_on_epoch_end(
106138
collate_kwargs["batch_dim_key_defaults"] = self.batch_dim_key_defaults
107139
if self.seq_dim_key_defaults is not None:
108140
collate_kwargs["seq_dim_key_defaults"] = self.seq_dim_key_defaults
141+
109142
prediction = batch_collator([item for item in predictions if item is not None], **collate_kwargs)
110143

111144
# batch_indices is not captured due to a lightning bug when return_predictions = False
112145
# we use input IDs in the prediction to map the result to input
113-
torch.save(prediction, result_path)
114146
if isinstance(prediction, dict):
115147
keys = prediction.keys()
116148
else:
117149
keys = "tensor"
150+
torch.save(prediction, result_path)
118151
logging.info(f"Inference predictions are stored in {result_path}\n{keys}")

0 commit comments

Comments
 (0)