Tell FSDP2 about embedding engine forward functions (ecmwf#1133)

sophie-xhonneux · web-flow · commit b163ea91d776 · 2025-10-23T14:39:08.000+02:00
* Tell FSDP2 about embedding engine forward functions

Note: DO NOT add print calls in the forward functions of the model; doing
so will break with FSDP2.

* Add comment
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
@@ -239,6 +239,16 @@ def init_model_and_shard(self, cf, devices):
             fully_shard(model)
             for tensor in itertools.chain(model.parameters(), model.buffers()):
                 assert tensor.device == torch.device("meta")
+
+        # For reasons we do not yet fully understand, when using train continue in some
+        # instances, FSDP2 does not register the forward_channels and forward_columns
+        # functions in the embedding engine as forward functions. This causes a crash
+        # because the input tensors are not converted to DTensors. It seems to occur
+        # primarily during validation.
+        for embed in model.embeds:
+            torch.distributed.fsdp.register_fsdp_forward_method(embed, "forward_channels")
+            torch.distributed.fsdp.register_fsdp_forward_method(embed, "forward_columns")
+
         return model, model_params
 
     def run(self, cf, devices, run_id_contd=None, epoch_contd=None):