Skip to content

Commit f3facc7

Browse files
Sophiex/dev/name modules (ecmwf#754)
* Add names to modules as prep for freezing * Add functionality to freeze modules based on added names * Ruff * Clean up * Wrong import path * Ruff
1 parent 32b4d91 commit f3facc7

File tree

6 files changed

+52
-3
lines changed

6 files changed

+52
-3
lines changed

config/default_config.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ loss_fcts_val:
7373
batch_size_per_gpu: 1
7474
batch_size_validation_per_gpu: 1
7575

76+
# a regex that needs to fully match the name of the modules you want to freeze
77+
# e.g. ".*ERA5" will match any module whose name ends in "ERA5"
78+
# encoders and decoders that exist per stream have the stream name attached at the end
79+
freeze_modules: ""
80+
7681
# training mode: "forecast" or "masking" (masked token modeling)
7782
# for "masking" to train with auto-encoder mode, forecast_offset should be 0
7883
training_mode: "masking"

src/weathergen/model/embeddings.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __init__(
3434
norm_type="LayerNorm",
3535
embed_size_centroids=64,
3636
unembed_mode="full",
37+
stream_name="stream_embed",
3738
):
3839
"""Constructor
3940
@@ -46,6 +47,8 @@ def __init__(
4647

4748
super(StreamEmbedTransformer, self).__init__()
4849

50+
self.name = f"StreamEmbedder_{stream_name}"
51+
4952
self.num_tokens = num_tokens
5053
self.token_size = token_size
5154
self.num_channels = num_channels
@@ -194,11 +197,12 @@ def forward_columns(self, x_in, centroids):
194197

195198

196199
class StreamEmbedLinear(torch.nn.Module):
197-
def __init__(self, dim_in, dim_out):
200+
def __init__(self, dim_in, dim_out, stream_name="stream_embed"):
198201
"""Constructor"""
199202

200203
super(StreamEmbedLinear, self).__init__()
201204

205+
self.name = f"StreamEmbedder_{stream_name}"
202206
self.layer = torch.nn.Linear(dim_in, dim_out)
203207

204208
def forward(self, x):

src/weathergen/model/engines.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030

3131
class EmbeddingEngine:
32+
name = "EmbeddingEngine"
33+
3234
def __init__(self, cf: Config, sources_size) -> None:
3335
"""
3436
Initialize the EmbeddingEngine with the configuration.
@@ -47,6 +49,8 @@ def create(self) -> torch.nn.ModuleList:
4749
:return: torch.nn.ModuleList containing the embedding layers.
4850
"""
4951
for i, si in enumerate(self.cf.streams):
52+
stream_name = si.get("name", i)
53+
5054
if "diagnostic" in si and si["diagnostic"]:
5155
self.embeds.append(torch.nn.Identity())
5256
continue
@@ -66,12 +70,15 @@ def create(self) -> torch.nn.ModuleList:
6670
norm_type=self.cf.norm_type,
6771
embed_size_centroids=self.cf.embed_size_centroids,
6872
unembed_mode=self.cf.embed_unembed_mode,
73+
stream_name=stream_name,
6974
)
7075
)
7176
elif si["embed"]["net"] == "linear":
7277
self.embeds.append(
7378
StreamEmbedLinear(
74-
self.sources_size[i] * si["token_size"], self.cf.ae_local_dim_embed
79+
self.sources_size[i] * si["token_size"],
80+
self.cf.ae_local_dim_embed,
81+
stream_name=stream_name,
7582
)
7683
)
7784
else:
@@ -80,6 +87,8 @@ def create(self) -> torch.nn.ModuleList:
8087

8188

8289
class LocalAssimilationEngine:
90+
name = "LocalAssimilationEngine"
91+
8392
def __init__(self, cf: Config) -> None:
8493
"""
8594
Initialize the LocalAssimilationEngine with the configuration.
@@ -122,6 +131,8 @@ def create(self) -> torch.nn.ModuleList:
122131

123132

124133
class Local2GlobalAssimilationEngine:
134+
name = "Local2GlobalAssimilationEngine"
135+
125136
def __init__(self, cf: Config) -> None:
126137
"""
127138
Initialize the Local2GlobalAssimilationEngine with the configuration.
@@ -183,6 +194,8 @@ def create(self) -> torch.nn.ModuleList:
183194

184195

185196
class GlobalAssimilationEngine:
197+
name = "GlobalAssimilationEngine"
198+
186199
def __init__(self, cf: Config, num_healpix_cells: int) -> None:
187200
"""
188201
Initialize the GlobalAssimilationEngine with the configuration.
@@ -250,6 +263,8 @@ def create(self) -> torch.nn.ModuleList:
250263

251264

252265
class ForecastingEngine:
266+
name = "ForecastingEngine"
267+
253268
def __init__(self, cf: Config, num_healpix_cells: int) -> None:
254269
"""
255270
Initialize the ForecastingEngine with the configuration.
@@ -327,13 +342,13 @@ def init_weights_final(m):
327342

328343

329344
class EnsPredictionHead(torch.nn.Module):
330-
#########################################
331345
def __init__(
332346
self,
333347
dim_embed,
334348
dim_out,
335349
ens_num_layers,
336350
ens_size,
351+
stream_name: str,
337352
norm_type="LayerNorm",
338353
hidden_factor=2,
339354
final_activation: None | str = None,
@@ -342,6 +357,8 @@ def __init__(
342357

343358
super(EnsPredictionHead, self).__init__()
344359

360+
self.name = f"EnsPredictionHead_{stream_name}"
361+
345362
dim_internal = dim_embed * hidden_factor
346363
# norm = torch.nn.LayerNorm if norm_type == "LayerNorm" else RMSNorm
347364
enl = ens_num_layers
@@ -390,6 +407,7 @@ def __init__(
390407
tr_mlp_hidden_factor,
391408
softcap,
392409
tro_type,
410+
stream_name: str,
393411
):
394412
"""
395413
Initialize the TargetPredictionEngine with the configuration.
@@ -403,6 +421,7 @@ def __init__(
403421
:param tro_type: Type of target readout (e.g., "obs_value").
404422
"""
405423
super(TargetPredictionEngineClassic, self).__init__()
424+
self.name = f"TargetPredictionEngine_{stream_name}"
406425

407426
self.cf = cf
408427
self.dims_embed = dims_embed
@@ -496,6 +515,7 @@ def __init__(
496515
tr_mlp_hidden_factor,
497516
softcap,
498517
tro_type,
518+
stream_name: str,
499519
):
500520
"""
501521
Initialize the TargetPredictionEngine with the configuration.
@@ -519,6 +539,7 @@ def __init__(
519539
LayerNorm that does not scale after the layer is applied
520540
"""
521541
super(TargetPredictionEngine, self).__init__()
542+
self.name = f"TargetPredictionEngine_{stream_name}"
522543

523544
self.cf = cf
524545
self.dims_embed = dims_embed

src/weathergen/model/layers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,15 @@ def __init__(
2727
norm_type="LayerNorm",
2828
dim_aux=None,
2929
norm_eps=1e-5,
30+
name: str | None = None,
3031
):
3132
"""Constructor"""
3233

3334
super(MLP, self).__init__()
3435

36+
if name is not None:
37+
self.name = name
38+
3539
assert num_layers >= 2
3640

3741
self.with_residual = with_residual

src/weathergen/model/model.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@ def create(self) -> "Model":
260260
self.pred_heads = torch.nn.ModuleList()
261261

262262
for i_obs, si in enumerate(cf.streams):
263+
stream_name = si.get("name", i_obs)
264+
263265
# extract and setup relevant parameters
264266
etc = si["embed_target_coords"]
265267
tro_type = si["target_readout"]["type"] if "type" in si["target_readout"] else "token"
@@ -310,6 +312,7 @@ def create(self) -> "Model":
310312
with_residual=False,
311313
dropout_rate=dropout_rate,
312314
norm_eps=self.cf.mlp_norm_eps,
315+
stream_name=f"embed_target_coords_{stream_name}",
313316
)
314317
)
315318
else:
@@ -326,6 +329,7 @@ def create(self) -> "Model":
326329
dropout_rate=dropout_rate,
327330
norm_type=cf.norm_type,
328331
norm_eps=self.cf.mlp_norm_eps,
332+
stream_name=f"pred_adapter_kv_{stream_name}",
329333
)
330334
)
331335
else:
@@ -345,6 +349,7 @@ def create(self) -> "Model":
345349
tr_mlp_hidden_factor,
346350
softcap,
347351
tro_type,
352+
stream_name=stream_name,
348353
)
349354

350355
self.target_token_engines.append(tte)
@@ -362,6 +367,7 @@ def create(self) -> "Model":
362367
si["pred_head"]["ens_size"],
363368
norm_type=cf.norm_type,
364369
final_activation=final_activation,
370+
stream_name=stream_name,
365371
)
366372
)
367373

src/weathergen/train/trainer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# granted to it by virtue of its status as an intergovernmental organisation
1010
# nor does it submit to any jurisdiction.
1111

12+
import re
1213
import time
1314
from typing import Any
1415

@@ -29,6 +30,7 @@
2930
import weathergen.utils.config as config
3031
from weathergen.datasets.multi_stream_data_sampler import MultiStreamDataSampler
3132
from weathergen.model.model import Model, ModelParams
33+
from weathergen.model.utils import freeze_weights
3234
from weathergen.train.loss_calculator import LossCalculator
3335
from weathergen.train.lr_scheduler import LearningRateScheduler
3436
from weathergen.train.trainer_base import TrainerBase
@@ -52,6 +54,8 @@ def init(
5254
):
5355
self.cf = cf
5456

57+
self.freeze_modules = cf.get("freeze_modules", "")
58+
5559
assert cf.samples_per_epoch % cf.batch_size_per_gpu == 0
5660
assert cf.samples_per_validation % cf.batch_size_validation_per_gpu == 0
5761
assert cf.forecast_policy if cf.forecast_steps > 0 else True
@@ -182,6 +186,11 @@ def run(self, cf, run_id_contd=None, epoch_contd=None):
182186
if cf.forecast_freeze_model:
183187
self.model = self.model.freeze_weights_forecast()
184188

189+
for name, module in self.model.named_modules():
190+
name = getattr(module, "name", None)
191+
if name is not None and re.fullmatch(self.freeze_modules, name):
192+
freeze_weights(module)
193+
185194
self.model = self.model.to(self.devices[0])
186195

187196
if cf.compile_model:

0 commit comments

Comments
 (0)