# WeatherGenerator/src/weathergen/run_train.py at 030bc3086edf620835a5c0e88425ff3f9df0fc4f · mtar/WeatherGenerator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# (C) Copyright 2025 WeatherGenerator contributors.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
#
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

"""
Entry points for training and running inference with weathergen-atmo.
"""

import logging
import pdb
import sys
import time
import traceback
from pathlib import Path

import weathergen.common.config as config
import weathergen.utils.cli as cli
from weathergen.common.logger import init_loggers
from weathergen.train.trainer import Trainer

logger = logging.getLogger(__name__)


def inference():
    """
    Command-line entry point for inference.

    Forwards every command-line argument after the program name to
    `inference_from_args`.
    """
    cli_args = sys.argv[1:]
    inference_from_args(cli_args)


def inference_from_args(argl: list[str]):
    """
    Run inference with a WeatherGenerator model.

    Parses `argl` with the inference CLI parser, builds the merged run configuration,
    initialises torch, DDP and the loggers, and then calls `Trainer.inference`.
    Integration tests call this function directly with an explicit argument list
    instead of going through `sys.argv`.

    Args:
      argl: Command-line arguments, without the program name.

    Note: an exception raised by `Trainer.inference` is printed and not re-raised.
    In a single-process run (`cf.world_size == 1`) a post-mortem debugger is opened
    on it as well.
    """
    args = cli.get_inference_parser().parse_args(argl)

    # Inference-specific settings; handed to the config loader after the config files
    # and before the options given on the command line.
    overrides_inference = {
        "shuffle": False,
        "start_date_val": args.start_date,
        "end_date_val": args.end_date,
        "samples_per_validation": args.samples,
        "log_validation": args.samples if args.save_samples else 0,
        "streams_output": args.streams_output,
    }
    overrides_cli = config.from_cli_arglist(args.options)

    cf = config.load_merge_configs(
        args.private_config,
        args.from_run_id,
        args.mini_epoch,
        args.base_config,
        *args.config,
        overrides_inference,
        overrides_cli,
    )
    cf = config.set_run_id(cf, args.run_id, args.reuse_run_id)

    devices = Trainer.init_torch()
    cf = Trainer.init_ddp(cf)

    init_loggers(cf.run_id)
    logger.info(f"DDP initialization: rank={cf.rank}, world_size={cf.world_size}")

    # Record the run id and step this inference was started from.
    cf.run_history += [(args.from_run_id, cf.istep)]

    trainer = Trainer(cf.train_log_freq)
    try:
        trainer.inference(cf, devices, args.from_run_id, args.mini_epoch)
    except Exception as err:
        traceback.print_exc()
        # The interactive debugger is only started for single-process runs.
        if cf.world_size == 1:
            pdb.post_mortem(err.__traceback__)


####################################################################################################
def train_continue() -> None:
    """
    Command-line entry point for continuing the training of an existing run.

    Takes no parameters: every command-line argument after the program name is
    forwarded to `train_continue_from_args`, which parses them (including the id
    of the run to continue from).
    """
    cli_args = sys.argv[1:]
    train_continue_from_args(cli_args)


def train_continue_from_args(argl: list[str]):
    """
    Continue training a WeatherGenerator model from an existing run.

    Parses `argl` with the continue CLI parser, builds the merged run configuration,
    initialises torch, DDP and the loggers, and then calls `Trainer.run` with the
    source run id and mini epoch.

    Args:
      argl: Command-line arguments, without the program name.

    Note: an exception raised by `Trainer.run` is printed and not re-raised.
    In a single-process run (`cf.world_size == 1`) a post-mortem debugger is opened
    on it as well.
    """
    args = cli.get_continue_parser().parse_args(argl)

    # No finetuning overrides are set at present; the empty mapping only keeps its
    # position in the argument list of the config loader.
    overrides_finetune = {}
    overrides_cli = config.from_cli_arglist(args.options)

    cf = config.load_merge_configs(
        args.private_config,
        args.from_run_id,
        args.mini_epoch,
        overrides_finetune,
        args.base_config,
        *args.config,
        overrides_cli,
    )
    cf = config.set_run_id(cf, args.run_id, args.reuse_run_id)

    # Multiprocessing start method comes from the config, defaulting to "fork".
    devices = Trainer.init_torch(
        multiprocessing_method=cf.get("multiprocessing_method", "fork"),
    )
    cf = Trainer.init_ddp(cf)

    init_loggers(cf.run_id)

    # Keep the history of runs so that results stay traceable.
    cf.run_history += [(args.from_run_id, cf.istep)]

    trainer = Trainer(cf.train_log_freq)
    try:
        trainer.run(cf, devices, args.from_run_id, args.mini_epoch)
    except Exception as err:
        traceback.print_exc()
        # The interactive debugger is only started for single-process runs.
        if cf.world_size == 1:
            pdb.post_mortem(err.__traceback__)


####################################################################################################
def train() -> None:
    """
    Command-line entry point for training a WeatherGenerator model.

    Takes no parameters: every command-line argument after the program name is
    forwarded to `train_with_args`, together with `None` as the stream directory.
    """
    cli_args = sys.argv[1:]
    train_with_args(cli_args, None)


def train_with_args(argl: list[str], stream_dir: str | None):
    """
    Train a WeatherGenerator model.

    Parses `argl` with the train CLI parser, builds the merged run configuration,
    initialises torch, DDP and the loggers, loads the stream definitions and then
    calls `Trainer.run`.

    Args:
      argl: Command-line arguments, without the program name.
      stream_dir: Currently not used by this function; the streams are always loaded
        from `cf.streams_directory`.

    Raises:
      ValueError: If `with_flash_attention` is enabled in the configuration while
        `with_mixed_precision` is not.

    Note: an exception raised by `Trainer.run` is printed and not re-raised.
    In a single-process run (`cf.world_size == 1`) a post-mortem debugger is opened
    on it as well.
    """
    parser = cli.get_train_parser()
    args = parser.parse_args(argl)

    cli_overwrite = config.from_cli_arglist(args.options)

    # The second and third arguments are None here; the continue/inference entry
    # points pass the source run id and mini epoch in these positions.
    cf = config.load_merge_configs(
        args.private_config,
        None,
        None,
        args.base_config,
        *args.config,
        cli_overwrite,
    )
    cf = config.set_run_id(cf, args.run_id, False)

    # Seed for the data loader RNG is taken from the wall clock (whole seconds).
    cf.data_loader_rng_seed = int(time.time())
    mp_method = cf.get("multiprocessing_method", "fork")
    devices = Trainer.init_torch(multiprocessing_method=mp_method)
    cf = Trainer.init_ddp(cf)

    # NOTE: the loggers are initialised on every process. Restricting this to rank 0,
    # or moving it after the processes have been sorted out, was considered in order
    # to avoid duplicated output in the multi-GPU case.
    init_loggers(cf.run_id)

    logger.info(f"DDP initialization: rank={cf.rank}, world_size={cf.world_size}")

    cf.streams = config.load_streams(Path(cf.streams_directory))

    # Explicit check instead of `assert`, so that the validation is not removed when
    # Python runs with optimisations enabled (-O).
    if cf.with_flash_attention and not cf.with_mixed_precision:
        raise ValueError(
            "Invalid configuration: with_flash_attention requires with_mixed_precision "
            "to be enabled."
        )

    trainer = Trainer(cf.train_log_freq)

    try:
        trainer.run(cf, devices)
    except Exception as err:
        traceback.print_exc()
        # The interactive debugger is only started for single-process runs.
        if cf.world_size == 1:
            pdb.post_mortem(err.__traceback__)


if __name__ == "__main__":
    # Entry point for slurm script.
    # Continue an existing run when any command-line token contains "--from_run_id"
    # (substring match, so "--from_run_id=<id>" is recognised as well); otherwise
    # start a new training.
    continue_requested = any("--from_run_id" in token for token in sys.argv)
    entry_point = train_continue if continue_requested else train
    entry_point()