ecmwf · javak87 · Dec 24, 2025 · Dec 24, 2025 · Dec 24, 2025 · Dec 24, 2025
diff --git a/config/config_physical_jepa.yml b/config/config_physical_jepa.yml
@@ -127,6 +127,11 @@ data_loading :
   num_workers: 12
   rng_seed: ???
 
+  # Pin host (CPU) memory for faster CPU-to-GPU transfer. Enabling memory_pinning together
+  # with FSDP2 + DINOv2 may cause the job to hang and trigger a PyTorch timeout error.
+  # If this happens, disable the flag; note that performance will then drop on GH200.
+  memory_pinning: True
+
 
 # config for training
 training_config:
@@ -320,4 +325,4 @@ wgtags:
   # *** Experiment-specific tags ***
   # All extra tags (including lists, dictionaries, etc.) are treated 
   # as strings by mlflow, so treat all extra tags as simple string key: value pairs.
-  grid: null
+  grid: null
diff --git a/config/default_config.yml b/config/default_config.yml
@@ -127,6 +127,11 @@ data_loading :
   rng_seed: ???
   repeat_data_in_mini_epoch : False
 
+  # Pin host (CPU) memory for faster CPU-to-GPU transfer. Enabling memory_pinning together
+  # with FSDP2 + DINOv2 may cause the job to hang and trigger a PyTorch timeout error.
+  # If this happens, disable the flag; note that performance will then drop on GH200.
+  memory_pinning: True
+
 
 # config for training
 training_config:

diff --git a/src/weathergen/datasets/batch.py b/src/weathergen/datasets/batch.py
@@ -34,6 +34,25 @@ class Sample:
     # keys: stream_name, values: StreamData
     streams_data: dict[str, StreamData | None]
 
+    def pin_memory(self):
+        """
+        Pin the tensor content of this Sample to CPU pinned memory.
+
+        Every non-None entry of streams_data that offers a pin_memory() method is
+        asked to pin itself, and the mask tensor of every SampleMetaData entry in
+        meta_info is replaced by its pinned counterpart.
+
+        Returns
+        -------
+        Sample
+            This object, so that calls can be chained.
+        """
+
+        # streams_data: delegate to each stream entry that knows how to pin itself
+        streams = getattr(self, "streams_data", None)
+        if isinstance(streams, dict):
+            for entry in streams.values():
+                if entry is not None and hasattr(entry, "pin_memory"):
+                    entry.pin_memory()
+
+        # meta_info: only the mask tensor of SampleMetaData entries is pinned
+        meta = getattr(self, "meta_info", None)
+        if isinstance(meta, dict):
+            for info in meta.values():
+                if not isinstance(info, SampleMetaData):
+                    continue
+                # pin_memory() is not in-place, hence the re-assignment
+                if isinstance(info.mask, torch.Tensor):
+                    info.mask = info.mask.pin_memory()
+
+        return self
+
     def __init__(self, streams: dict) -> None:
         self.meta_info = {}
 
@@ -156,6 +175,19 @@ def get_device(self) -> str | torch.device:
         """
         return self.device
 
+    def pin_memory(self):
+        """
+        Pin all tensors in this batch to CPU pinned memory.
+
+        Each sample is pinned through its own pin_memory() method; tokens_lens is
+        replaced by its pinned version when it is a torch.Tensor.
+
+        Returns
+        -------
+        This object, so that calls can be chained.
+        """
+
+        for item in self.samples:
+            item.pin_memory()
+
+        # tokens_lens is left untouched unless it is a tensor
+        lens = self.tokens_lens
+        if isinstance(lens, torch.Tensor):
+            self.tokens_lens = lens.pin_memory()
+
+        return self
+
 
 class ModelBatch:
     """
@@ -186,6 +218,17 @@ def __init__(self, streams: dict, num_source_samples: int, num_target_samples: i
         self.source2target_matching_idxs = np.full(num_source_samples, -1, dtype=np.int32)
         self.target2source_matching_idxs = [[] for _ in range(num_target_samples)]
 
+    def pin_memory(self):
+        """
+        Pin the source and target samples of this batch to CPU pinned memory.
+
+        Returns
+        -------
+        This object, so that calls can be chained.
+        """
+
+        # source samples first, then target samples
+        for attr in ("source_samples", "target_samples"):
+            getattr(self, attr).pin_memory()
+
+        return self
+
     def to_device(self, device):  # -> ModelBatch
         """
         Move batch to device

diff --git a/src/weathergen/datasets/memory_pinning.py b/src/weathergen/datasets/memory_pinning.py
@@ -0,0 +1,42 @@
+from typing import Protocol, runtime_checkable
+
+import torch
+
+from weathergen.common.io import IOReaderData
+
+
+@runtime_checkable
+class Pinnable(Protocol):
+    """
+    Protocol for data structures whose pytorch content can be pinned
+    to CPU pinned memory.
+
+    This extends the pin_memory() capability of a torch Tensor
+    to other classes.
+
+    The protocol is runtime checkable: isinstance(obj, Pinnable) holds for
+    any object that provides a pin_memory attribute.
+
+    It is blocking.
+    """
+
+    def pin_memory(self): ...
+
+
+def pin_object(obj: Pinnable | torch.Tensor | IOReaderData | list | dict | None):
+    """
+    Pin the pytorch content of obj to CPU pinned memory.
+
+    torch.Tensor.pin_memory() is not in-place: it returns a pinned copy (or the
+    tensor itself if it is already pinned). The pinned tensor is therefore returned
+    and written back into the enclosing list, dict or IOReaderData field. Callers
+    that pass a bare tensor must use the return value; containers are updated
+    in place.
+
+    Parameters
+    ----------
+    obj :
+        Object to pin. None and objects of unsupported types are passed through
+        unchanged.
+
+    Returns
+    -------
+    The pinned tensor for tensor input, otherwise obj itself.
+    """
+    if obj is None:
+        return None
+    if isinstance(obj, torch.Tensor):
+        # the pinned copy has to be handed back to the caller
+        return obj.pin_memory()
+    if isinstance(obj, Pinnable):
+        obj.pin_memory()
+        return obj
+    if isinstance(obj, IOReaderData):
+        # Special case: IOReaderData is in common package and can't have torch deps
+        # Note: These SHOULD be numpy arrays per the type hints, but might be tensors
+        for name in ("coords", "data", "geoinfos"):
+            value = getattr(obj, name)
+            pinned = pin_object(value)
+            # rebind only when pinning produced a new object (i.e. a pinned tensor)
+            if pinned is not value:
+                setattr(obj, name, pinned)
+        return obj
+    if isinstance(obj, list):
+        # Assume the list is a list of potentially pinnable objects and traverse it.
+        for i, e in enumerate(obj):
+            obj[i] = pin_object(e)
+        return obj
+    if isinstance(obj, dict):
+        # Assume the values are pinnable; keys are unchanged, so rebinding is safe here.
+        for k, v in obj.items():
+            obj[k] = pin_object(v)
+        return obj
+    # unsupported type (e.g. a numpy array): nothing to pin
+    return obj
diff --git a/src/weathergen/datasets/stream_data.py b/src/weathergen/datasets/stream_data.py
@@ -14,6 +14,37 @@
 from weathergen.common.io import IOReaderData
 
 
+def _pin_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    """Return the pinned version of a tensor; other objects are passed through.
+
+    Parameters
+    ----------
+    tensor : torch.Tensor
+        Tensor to pin. Anything that is not a torch.Tensor is returned unchanged.
+
+    Returns
+    -------
+    torch.Tensor
+        Result of tensor.pin_memory() for tensor input, otherwise the input itself.
+    """
+    if not isinstance(tensor, torch.Tensor):
+        return tensor
+    return tensor.pin_memory()
+
+
+def _pin_tensor_list(tensor_list: list) -> list:
+    """Build a new list in which every tensor of the input list is pinned.
+
+    Parameters
+    ----------
+    tensor_list : list
+        List of tensors (or other objects) to pin.
+
+    Returns
+    -------
+    list
+        New list of the same length; torch.Tensor elements are pinned to CPU
+        pinned memory, all other elements are carried over unchanged.
+    """
+    return list(map(_pin_tensor, tensor_list))
+
+
 class StreamData:
     """
     StreamData object that encapsulates all data the model ingests for one batch item
@@ -75,6 +106,31 @@ def __init__(self, idx: int, input_steps: int, forecast_steps: int, healpix_cell
         self.source_idxs_embed = [torch.tensor([]) for _ in range(self.input_steps)]
         self.source_idxs_embed_pe = [torch.tensor([]) for _ in range(self.input_steps)]
 
+    def pin_memory(self):
+        """
+        Pin all tensors in this StreamData object to CPU pinned memory.
+
+        Each tensor list attribute is rebuilt with its torch.Tensor elements pinned.
+        Entries of source_raw are pinned only if they provide a pin_memory() method.
+
+        Returns
+        -------
+        StreamData
+            This object, so that calls can be chained.
+        """
+
+        tensor_list_attrs = (
+            # target tensors
+            "target_coords",
+            "target_coords_lens",
+            "target_tokens",
+            "target_tokens_lens",
+            "idxs_inv",
+            "target_coords_raw",
+            # source tensors
+            "source_tokens_cells",
+            "source_tokens_lens",
+            "source_idxs_embed",
+            "source_idxs_embed_pe",
+        )
+        for attr in tensor_list_attrs:
+            setattr(self, attr, _pin_tensor_list(getattr(self, attr)))
+
+        # source_raw (list of IOReaderData objects); the attribute may be absent
+        for raw in getattr(self, "source_raw", ()):
+            if raw is not None and hasattr(raw, "pin_memory"):
+                raw.pin_memory()
+
+        return self
+
     def to_device(self, device: str) -> None:
         """
         Move data to GPU

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
@@ -180,7 +180,6 @@ def inference(self, cf, devices, run_id_contd, mini_epoch_contd):
             "batch_sampler": None,
             "shuffle": False,
             "num_workers": loader_num_workers,
-            "pin_memory": True,
         }
         self.data_loader_validation = torch.utils.data.DataLoader(
             self.dataset, **loader_params, sampler=None
@@ -226,7 +225,6 @@ def run(self, cf, devices, run_id_contd=None, mini_epoch_contd=None):
             "batch_sampler": None,
             "shuffle": False,
             "num_workers": cf.data_loading.num_workers,
-            "pin_memory": True,
         }
         self.data_loader = torch.utils.data.DataLoader(self.dataset, **loader_params, sampler=None)
         self.data_loader_validation = torch.utils.data.DataLoader(
@@ -398,6 +396,10 @@ def train(self, mini_epoch):
         # training loop
         self.t_start = time.time()
         for bidx, batch in enumerate(dataset_iter):
+            if cf.data_loading.get("memory_pinning", False):
+                # pin memory for faster CPU-GPU transfer
+                batch = batch.pin_memory()
+
             batch.to_device(self.device)
 
             with torch.autocast(
@@ -512,6 +514,10 @@ def validate(self, mini_epoch, mode_cfg, batch_size):
             # print progress bar but only in interactive mode, i.e. when without ddp
             with tqdm.tqdm(total=mode_cfg.samples_per_mini_epoch, disable=self.cf.with_ddp) as pbar:
                 for bidx, batch in enumerate(dataset_val_iter):
+                    if cf.data_loading.get("memory_pinning", False):
+                        # pin memory for faster CPU-GPU transfer
+                        batch = batch.pin_memory()
+
                     batch.to_device(self.device)
 
                     # evaluate model

diff --git a/uv.lock b/uv.lock