Commit c1bbcec

JPXKQX and anaprietonem authored
fix(models): assert no dropout (#638)
## Description

This PR removes an extra `not` in the assertion that checks the use of dropout when the model is sharded across multiple GPUs.

***As a contributor to the Anemoi framework, please ensure that your changes include unit tests, updates to any affected dependencies and documentation, and have been tested in a parallel setting (i.e., with multiple GPUs). As a reviewer, you are also responsible for verifying these aspects and requesting changes if they are not adequately addressed. For guidelines, please refer to https://anemoi.readthedocs.io/en/latest/***

By opening this pull request, I affirm that all authors agree to the [Contributor License Agreement.](https://github.com/ecmwf/codex/blob/main/Legal/contributor_license_agreement.md)

Co-authored-by: Ana Prieto Nemesio <[email protected]>
1 parent 0c830e9 · commit c1bbcec
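To make the effect of the change concrete, here is a minimal standalone sketch of the corrected guard (hypothetical helper name and values, not the Anemoi source): dropout is rejected only when the processor is actually sharded, i.e. when the model communication group spans more than one GPU.

```python
# Minimal sketch of the corrected logic (hypothetical names, not the Anemoi source).
# Dropout is only a problem when the processor is sharded across more than one GPU,
# so the check must pass either when there is a single GPU or when dropout is off.
def check_dropout_allowed(comm_group_size: int, has_dropout: bool) -> None:
    assert (
        comm_group_size == 1 or not has_dropout
    ), f"Dropout is not supported when model is sharded across {comm_group_size} GPUs"


check_dropout_allowed(comm_group_size=1, has_dropout=True)   # fine: no sharding
check_dropout_allowed(comm_group_size=4, has_dropout=False)  # fine: sharded, dropout disabled
# check_dropout_allowed(comm_group_size=4, has_dropout=True)  # would raise AssertionError
```

The removed assertion required `model_comm_group.size() > 1 and not self._has_dropout`, which fails for any single-GPU run regardless of dropout; the corrected condition only fires when sharding and dropout are combined.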

File tree

1 file changed: +11 −7 lines

models/src/anemoi/models/layers/processor.py

Lines changed: 11 additions & 7 deletions
@@ -55,6 +55,8 @@ def __init__(

         self.layer_factory = load_layer_kernels(layer_kernels)

+        self._has_dropout = kwargs.get("dropout_p", 0.0) > 0 if "dropout_p" in kwargs else False
+
         assert (
             num_layers % num_chunks == 0
         ), f"Number of processor layers ({num_layers}) has to be divisible by the number of processor chunks ({num_chunks})."
@@ -83,6 +85,12 @@ def run_layers(self, data: tuple, *args, **kwargs) -> Tensor:

     def forward(self, x: Tensor, *args, **kwargs) -> Tensor:
         """Example forward pass."""
+
+        if (model_comm_group := kwargs.get("model_comm_group", None)) is not None:
+            assert (
+                model_comm_group.size() == 1 or not self._has_dropout
+            ), f"Dropout is not supported when model is sharded across {model_comm_group.size()} GPUs"
+
         x = self.run_layers((x,), *args, **kwargs)
         return x

@@ -108,6 +116,7 @@ def __init__(
             num_chunks=num_chunks,
             cpu_offload=cpu_offload,
             layer_kernels=layer_kernels,
+            dropout_p=dropout_p,
         )

         self.build_layers(
@@ -121,8 +130,6 @@ def __init__(

         self.offload_layers(cpu_offload)

-        self._has_dropout = dropout_p > 0 if dropout_p else False
-
     def forward(
         self,
         x: Tensor,
@@ -136,11 +143,7 @@ def forward(
         if model_comm_group:
             assert (
                 model_comm_group.size() == 1 or batch_size == 1
-            ), "Only batch size of 1 is supported when model is sharded accross GPUs"
-
-            assert (
-                model_comm_group.size() > 1 and not self._has_dropout
-            ), "Dropout is not supported when model is sharded across GPUS"
+            ), f"Only batch size of 1 is supported when model is sharded accross {model_comm_group.size()} GPUs"

         (x,) = self.run_layers((x,), shape_nodes, batch_size, model_comm_group, **kwargs)

@@ -210,6 +213,7 @@ def __init__(
             num_heads=num_heads,
             mlp_hidden_ratio=mlp_hidden_ratio,
             layer_kernels=layer_kernels,
+            dropout_p=dropout_p,
         )

         self.build_layers(
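
For reference, the `_has_dropout` flag now set in the base processor's `__init__` is derived from the `dropout_p` keyword forwarded by the subclasses. A small illustration of how that expression evaluates, with assumed values rather than repository code:

```python
# Illustration of the `_has_dropout` expression with a few assumed kwargs (not repository code).
def has_dropout(**kwargs) -> bool:
    return kwargs.get("dropout_p", 0.0) > 0 if "dropout_p" in kwargs else False


print(has_dropout())               # False: no dropout_p passed
print(has_dropout(dropout_p=0.0))  # False: dropout explicitly disabled
print(has_dropout(dropout_p=0.1))  # True: dropout enabled, which arms the sharding guard
```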
