Commit da02fe7
fix(training)!: Refactor configuration by introducing system schema with hardware, paths, and files subschemas (#598)
# Description

This PR reorganizes the configuration structure by introducing a new top-level schema called `system`, which groups the hardware and filesystem subschemas (see [issue #513](#513)). Previously, `paths` and `files` were defined under `hardware`, which was confusing since they describe filesystem layout rather than compute resources, and it was unclear whether `paths` referred to directories or files. The filesystem settings are now split into `input` and `output` subschemas, clarifying their role as the definition of the directory structure for inputs, outputs, and logs:

```
system/
├── hardware.yaml
├── input.yaml
└── output.yaml
```

The PR also isolates the path-concatenation logic in the pydantic schema, so the full paths for outputs, logs, etc. no longer have to be spelled out in every field. This concatenation used to happen both in code throughout the framework and inside the nested configuration files, which was brittle; it now happens in a single place.

📚 Documentation preview 📚: https://anemoi-training--598.org.readthedocs.build/en/598/

📚 Documentation preview 📚: https://anemoi-graphs--598.org.readthedocs.build/en/598/

📚 Documentation preview 📚: https://anemoi-models--598.org.readthedocs.build/en/598/

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Copilot <[email protected]>
Co-authored-by: anaprietonem <[email protected]>
Co-authored-by: Ana Prieto Nemesio <[email protected]>
Co-authored-by: Dieter Van den Bleeken <[email protected]>
Co-authored-by: Mario Santa Cruz <[email protected]>
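The "concatenation in one place" idea described above can be sketched with plain dataclass properties. This is a minimal, dependency-free sketch of the pattern, not the actual anemoi-training pydantic schema; `OutputSchema`, `root`, and the subdirectory names are illustrative:

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class OutputSchema:
    """Illustrative sketch: all derived paths computed from a single root."""

    root: Path  # base output directory, the only path the user supplies

    @property
    def logs(self) -> Path:
        # Derived in one place instead of concatenated throughout the
        # framework and nested config files.
        return self.root / "logs"

    @property
    def checkpoints(self) -> Path:
        return self.root / "checkpoints"


out = OutputSchema(root=Path("/experiments/run1"))
print(out.logs)         # e.g. /experiments/run1/logs on POSIX
print(out.checkpoints)  # e.g. /experiments/run1/checkpoints on POSIX
```

The real implementation uses pydantic models, but the design choice is the same: fields hold only the root, and every full path is computed from it.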
1 parent 84e5882 commit da02fe7


69 files changed: +770 −600 lines
Lines changed: 71 additions & 0 deletions (new file)

```python
# (C) Copyright 2025 Anemoi contributors.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
#
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

from anemoi.models.migrations import CkptType
from anemoi.models.migrations import MigrationContext
from anemoi.models.migrations import MigrationMetadata

# DO NOT CHANGE -->
metadata = MigrationMetadata(
    versions={
        "migration": "1.0.0",
        "anemoi-models": "%NEXT_ANEMOI_MODELS_VERSION%",
    },
)
# <-- END DO NOT CHANGE


def migrate_setup(context: MigrationContext) -> None:
    """Migrate setup callback to be run before loading the checkpoint.

    Parameters
    ----------
    context : MigrationContext
        A MigrationContext instance
    """
    context.move_attribute(
        "anemoi.training.schemas.hardware.HardwareSchema", "anemoi.training.schemas.system.HardwareSchema"
    )
    context.move_attribute("anemoi.training.schemas.hardware.FilesSchema", "anemoi.training.schemas.system.InputSchema")
    context.move_attribute(
        "anemoi.training.schemas.hardware.PathsSchema", "anemoi.training.schemas.system.OutputSchema"
    )
    context.move_module("anemoi.training.schemas.hardware", "anemoi.training.schemas.system")


def migrate(ckpt: CkptType) -> CkptType:
    """Migrate the checkpoint.

    Parameters
    ----------
    ckpt : CkptType
        The checkpoint dict.

    Returns
    -------
    CkptType
        The migrated checkpoint dict.
    """
    return ckpt


def rollback(ckpt: CkptType) -> CkptType:
    """Rollback the checkpoint.

    Parameters
    ----------
    ckpt : CkptType
        The checkpoint dict.

    Returns
    -------
    CkptType
        The rolled-back checkpoint dict.
    """
    return ckpt
```
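The `move_attribute` calls in the migration script amount to a rename table that redirects old schema references to their new locations when an existing checkpoint is loaded. The mapping below is reproduced from the script; the `redirect` helper itself is illustrative and not part of anemoi-models:

```python
# Old dotted paths -> new dotted paths, as declared in migrate_setup above.
RENAMES = {
    "anemoi.training.schemas.hardware.HardwareSchema": "anemoi.training.schemas.system.HardwareSchema",
    "anemoi.training.schemas.hardware.FilesSchema": "anemoi.training.schemas.system.InputSchema",
    "anemoi.training.schemas.hardware.PathsSchema": "anemoi.training.schemas.system.OutputSchema",
}


def redirect(dotted_path: str) -> str:
    """Return the new dotted path for an old schema reference, if renamed."""
    return RENAMES.get(dotted_path, dotted_path)


print(redirect("anemoi.training.schemas.hardware.PathsSchema"))
# anemoi.training.schemas.system.OutputSchema
```

Note that `FilesSchema` becomes `InputSchema` and `PathsSchema` becomes `OutputSchema`, i.e. the old file/path split is replaced by an input/output split.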

training/docs/user-guide/distributed.rst

Lines changed: 14 additions & 14 deletions

```diff
@@ -40,13 +40,13 @@ shown in the figure below
 
 
    Model Sharding (source: `Jacobs et al. (2023) <https://arxiv.org/pdf/2309.14509>`_)
 
-To use model sharding, set ``config.hardware.num_gpus_per_model`` to the
-number of GPUs you wish to shard the model across. Set ``config.model.
-keep_batch_sharded=True`` to also keep batches fully sharded throughout
-training, reducing memory usage for large inputs or long rollouts. It is
-recommended to only shard if the model does not fit in GPU memory, as
-data distribution is a much more efficient way to parallelise the
-training.
+To use model sharding, set ``config.system.hardware.num_gpus_per_model``
+to the number of GPUs you wish to shard the model across. Set
+``config.model. keep_batch_sharded=True`` to also keep batches fully
+sharded throughout training, reducing memory usage for large inputs or
+long rollouts. It is recommended to only shard if the model does not fit
+in GPU memory, as data distribution is a much more efficient way to
+parallelise the training.
 
 Anemoi Training provides different sharding strategies depending if the
 model task is deterministic or ensemble based.
@@ -57,7 +57,7 @@ For deterministic models, the ``DDPGroupStrategy`` is used:
 
   strategy:
     _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
-    num_gpus_per_model: ${hardware.num_gpus_per_model}
+    num_gpus_per_model: ${system.hardware.num_gpus_per_model}
     read_group_size: ${dataloader.read_group_size}
 
 When using model sharding, ``config.dataloader.read_group_size`` allows
@@ -72,20 +72,20 @@ across GPUs:
 
   strategy:
     _target_: anemoi.training.distributed.strategy.DDPEnsGroupStrategy
-    num_gpus_per_model: ${hardware.num_gpus_per_model}
+    num_gpus_per_model: ${system.hardware.num_gpus_per_model}
     read_group_size: ${dataloader.read_group_size}
 
-This requires setting ``config.hardware.num_gpus_per_ensemble`` to the
-number of GPUs you wish to parallelise the ensemble members across and
-``config.training.ensemble_size_per_device`` to the number of ensemble
-members per GPU.
+This requires setting ``config.system.hardware.num_gpus_per_ensemble``
+to the number of GPUs you wish to parallelise the ensemble members
+across and ``config.training.ensemble_size_per_device`` to the number of
+ensemble members per GPU.
 
 *********
  Example
 *********
 
 Suppose the job is running on 2 nodes each with 4 GPUs and that
-``config.hardware.num_gpus_per_model=2`` and
+``config.system.hardware.num_gpus_per_model=2`` and
 ``config.dataloader.batch_size.training=4``. Then each model will be
 sharded across 2 GPUs and the data sharded across ``total number of
 GPUs/num_gpus_per_model=4``. This means the effective batch size is 16.
```
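The arithmetic in the documentation's example can be sketched as a small helper (the function name is illustrative; anemoi-training computes this internally):

```python
def effective_batch_size(
    num_nodes: int,
    num_gpus_per_node: int,
    num_gpus_per_model: int,
    batch_size_training: int,
) -> int:
    """Effective global batch size under model sharding.

    Each model instance is sharded across `num_gpus_per_model` GPUs, so
    the number of data-parallel model groups is
    total GPUs / num_gpus_per_model, and each group consumes one batch.
    """
    total_gpus = num_nodes * num_gpus_per_node
    if total_gpus % num_gpus_per_model != 0:
        raise ValueError("total GPU count must be divisible by num_gpus_per_model")
    num_model_groups = total_gpus // num_gpus_per_model
    return num_model_groups * batch_size_training


# The documented example: 2 nodes x 4 GPUs, num_gpus_per_model=2, batch size 4
print(effective_batch_size(2, 4, 2, 4))  # 16
```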

training/docs/user-guide/models.rst

Lines changed: 1 addition & 1 deletion

```diff
@@ -55,7 +55,7 @@ al. (2023).
 
 The physical data is encoded on to a multi-mesh latent space of
 decreasing resolution. This multi-mesh is defined by the graph given in
-``config.hardware.files.graph``.
+``config.system.input.graph``.
 
 .. figure:: ../images/gnn-encoder-decoder-multimesh.jpg
    :width: 500
```

training/docs/user-guide/tracking.rst

Lines changed: 1 addition & 1 deletion

```diff
@@ -113,7 +113,7 @@ manually installed:
 To enable offline logging, set
 ``config.diagnostics.logger.mlflow.offline`` to ``True`` and run the
 training as usual. Logs will be saved to the directory specified in
-``config.hardware.paths.logs``
+``config.system.output.logs``
 
 When training is done, use the ``mlflow sync`` command to sync the
 offline logs to a server:
```

training/docs/user-guide/training.rst

Lines changed: 6 additions & 16 deletions

```diff
@@ -502,18 +502,8 @@ finished training. It's also possible to restart the model training from
 a specific checkpoint. This can either be a checkpoint from the same run
 or a checkpoint from a different run that you have run in the past or
 that you using for transfer learning. To do this, set
-``config.hardware.files.warm_start`` to be the checkpoint filename they
-want to restart from and ``config.hardware.paths.warm_start`` to be the
-path to the checkpoint. See the example below.
-
-.. code:: yaml
-
-   # This is a sample YAML block
-   hardware:
-     files:
-       warm_start: checkpoint_epoch_10.ckpt
-     paths:
-       warm_start: /path/to/checkpoint/folder/
+``config.system.input.warm_start`` to be the path to the checkpoint they
+want to restart from.
 
 The above can be adapted depending on the use case and taking advantage
 of hydra, you can also reuse ``config.training.run_id`` or
@@ -540,10 +530,10 @@ flag to True in the configuration file.
     transfer_learning: True
 
 When this flag is active and a checkpoint path is specified in
-config.hardware.files.warm_start or self.last_checkpoint, the system
-loads the pre-trained weights using the `transfer_learning_loading`
-function. This approach ensures only compatible weights are loaded and
-mismatched layers are handled appropriately.
+config.system.input.warm_start or self.last_checkpoint, the system loads
+the pre-trained weights using the `transfer_learning_loading` function.
+This approach ensures only compatible weights are loaded and mismatched
+layers are handled appropriately.
 
 For example, transfer learning might be used to adapt a weather
 forecasting model trained on one geographic region to another region
```
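Under the new schema, the two-field warm-start example removed in this diff collapses to a single path. A sketch, reusing the filename and folder from the old example:

```yaml
system:
  input:
    warm_start: /path/to/checkpoint/folder/checkpoint_epoch_10.ckpt
```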
Lines changed: 4 additions & 4 deletions

```diff
@@ -1,11 +1,11 @@
-dataset: ${hardware.paths.data}/${hardware.files.dataset}
+dataset: ${system.input.dataset}
 
 training:
   dataset: ${dataloader.dataset}
   start: null
   end: 2020
   frequency: ${data.frequency}
-  drop: []
+  drop: []
 
 validation_rollout: 1 # number of rollouts to use for validation, must be equal or greater than rollout expected by callbacks
 
@@ -14,11 +14,11 @@ validation:
   start: 2021-01-01
   end: 2021
   frequency: ${data.frequency}
-  drop: []
+  drop: []
 
 test:
   dataset: ${dataloader.dataset}
   start: 2022-01
   end: null
   frequency: ${data.frequency}
-  drop: []
+  drop: []
```

training/docs/user-guide/yaml/example_crps_config.yaml

Lines changed: 10 additions & 9 deletions

```diff
@@ -2,7 +2,7 @@ defaults:
   - data: zarr
   - dataloader: native_grid
   - diagnostics: evaluation
-  - hardware: example
+  - system: example
   - graph: encoder_decoder_only
   - model: transformer_ens
   - training: default
@@ -11,14 +11,15 @@ defaults:
 config_validation: True
 
 # Changes in hardware
-hardware:
+system:
   files:
     truncation: ${data.resolution}-O32-linear.mat.npz
     truncation_inv: O32-${data.resolution}-linear.mat.npz
-  num_gpus_per_ensemble: 1
-  num_gpus_per_node: 1
-  num_nodes: 1
-  num_gpus_per_model: 1
+  hardware:
+    num_gpus_per_ensemble: 1
+    num_gpus_per_node: 1
+    num_nodes: 1
+    num_gpus_per_model: 1
 
 data:
   resolution: o96
@@ -32,13 +33,13 @@ training:
 # Changes in strategy
 strategy:
   _target_: anemoi.training.distributed.strategy.DDPEnsGroupStrategy
-  num_gpus_per_ensemble: ${hardware.num_gpus_per_ensemble}
-  num_gpus_per_model: ${hardware.num_gpus_per_model}
+  num_gpus_per_ensemble: ${system.hardware.num_gpus_per_ensemble}
+  num_gpus_per_model: ${system.hardware.num_gpus_per_model}
 
 # Changes in training loss
 training_loss:
   _target_: anemoi.training.losses.kcrps.AlmostFairKernelCRPS
-  scalars: ['variable']
+  scalars: ["variable"]
   ignore_nans: False
   alpha: 1.0
```

training/src/anemoi/training/config/config.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@ defaults:
   - data: zarr
   - dataloader: native_grid
   - diagnostics: evaluation
-  - hardware: example
+  - system: example
   - graph: multi_scale
   - model: gnn
   - training: default
```

training/src/anemoi/training/config/dataloader/native_grid.yaml

Lines changed: 5 additions & 5 deletions

```diff
@@ -10,7 +10,7 @@ pin_memory: True
 # The number of GPUs per model must be divisible by read_group_size.
 # To disable, set to 1.
 # ============
-read_group_size: ${hardware.num_gpus_per_model}
+read_group_size: ${system.hardware.num_gpus_per_model}
 
 num_workers:
   training: 8
@@ -50,14 +50,14 @@ grid_indices:
 # See https://anemoi-datasets.readthedocs.io
 # ============
 
-dataset: ${hardware.paths.data}/${hardware.files.dataset}
+dataset: ${system.input.dataset}
 
 training:
   dataset: ${dataloader.dataset}
   start: null
   end: 2020
   frequency: ${data.frequency}
-  drop: []
+  drop: []
 
 validation_rollout: 1 # number of rollouts to use for validation, must be equal or greater than rollout expected by callbacks
 
@@ -66,11 +66,11 @@ validation:
   start: 2021
   end: 2021
   frequency: ${data.frequency}
-  drop: []
+  drop: []
 
 test:
   dataset: ${dataloader.dataset}
   start: 2022
   end: null
   frequency: ${data.frequency}
-  drop: []
+  drop: []
```

training/src/anemoi/training/config/debug.yaml

Lines changed: 11 additions & 10 deletions

```diff
@@ -2,7 +2,7 @@ defaults:
   - data: zarr
   - dataloader: native_grid
   - diagnostics: evaluation
-  - hardware: example
+  - system: example
   - graph: multi_scale
   - model: gnn
   - training: default
@@ -14,20 +14,21 @@ config_validation: True
 ## When you commit your changes, assign the new features and keywords
 ## to the correct defaults.
 # For example to change from default GPU count:
-# hardware:
-#   num_gpus_per_node: 1
+# system:
+#   hardware:
+#     num_gpus_per_node: 1
 
 diagnostics:
   plot:
     callbacks: []
-hardware:
-  files:
+system:
+  input:
     graph: ???
-  accelerator: auto
-  num_gpus_per_node: 1
-  num_nodes: 1
-  num_gpus_per_model: 1
-
+  hardware:
+    accelerator: auto
+    num_gpus_per_node: 1
+    num_nodes: 1
+    num_gpus_per_model: 1
 
 model:
   num_channels: 128
```
