# ---------------------------------------------------------------
# © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
# Licensed under the MIT License.
#
# Portions of this file are adapted from PyTorch Lightning,
# used under the Apache 2.0 License.
# ---------------------------------------------------------------
import logging
import os
import warnings
from types import MethodType

import jsonargparse._typehints as _t
import torch
from gitignore_parser import parse_gitignore
from lightning.pytorch import cli
from lightning.pytorch.callbacks import LearningRateMonitor, ModelSummary
from lightning.pytorch.loops.fetchers import _DataFetcher, _DataLoaderIterDataFetcher
from lightning.pytorch.loops.training_epoch_loop import _TrainingEpochLoop

from datasets.lightning_data_module import LightningDataModule
from training.lightning_module import LightningModule

# Suppress PyTorch Dynamo/FX warning logs emitted when tracing DINOv3 models.
os.environ["TORCH_LOGS"] = "-dynamo"

# jsonargparse wraps instantiation failures in generic "unexpected value" type
# errors. Patch its error helpers so that, if a real Exception was raised while
# constructing a class from the config, it propagates with its original message
# and traceback instead of being swallowed.
_orig_single = _t.raise_unexpected_value


def _raise_single(*args, exception=None, **kwargs):
    if isinstance(exception, Exception):
        raise exception
    return _orig_single(*args, exception=exception, **kwargs)


_orig_union = _t.raise_union_unexpected_value


def _raise_union(subtypes, val, vals):
    # For union types, jsonargparse collects one error per subtype; re-raise
    # the most recent real exception rather than the aggregated type error.
    for e in reversed(vals):
        if isinstance(e, Exception):
            raise e
    return _orig_union(subtypes, val, vals)


_t.raise_unexpected_value = _raise_single
_t.raise_union_unexpected_value = _raise_union

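# With this patch in place, a config mistake such as a bad constructor argument
# surfaces as the underlying exception (e.g. a TypeError from the model's
# __init__) rather than jsonargparse's generic "does not match expected type"
# report. Note the exact wording of the wrapped errors is a jsonargparse
# implementation detail and may vary across versions.
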
def _should_check_val_fx(self: _TrainingEpochLoop, data_fetcher: _DataFetcher) -> bool:
    """Patched version of ``_TrainingEpochLoop._should_check_val_fx``.

    Identical to the Lightning original except that, for iteration-based
    validation (``check_val_every_n_epoch=None``), the check is keyed to
    optimizer steps rather than batches, so gradient accumulation does not
    multiply the effective validation interval.
    """
    if not self._should_check_val_epoch():
        return False
    is_infinite_dataset = self.trainer.val_check_batch == float("inf")
    is_last_batch = self.batch_progress.is_last_batch
    if is_last_batch and (
        is_infinite_dataset or isinstance(data_fetcher, _DataLoaderIterDataFetcher)
    ):
        return True
    if self.trainer.should_stop and self.trainer.fit_loop._can_stop_early:
        return True
    is_val_check_batch = is_last_batch
    if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset:
        is_val_check_batch = (
            self.batch_idx + 1
        ) % self.trainer.limit_train_batches == 0
    elif self.trainer.val_check_batch != float("inf"):
        if self.trainer.check_val_every_n_epoch is not None:
            is_val_check_batch = (
                self.batch_idx + 1
            ) % self.trainer.val_check_batch == 0
        else:
            # Iteration-based validation: trigger on optimizer steps instead
            # of batches so gradient accumulation is accounted for, and never
            # trigger mid-accumulation.
            is_val_check_batch = (
                self.global_step % self.trainer.val_check_batch == 0
                and not self._should_accumulate()
            )
    return is_val_check_batch

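# Worked example for the step-based branch above (illustrative numbers, not
# taken from this repo's configs): with val_check_interval=1000 and
# accumulate_grad_batches=4, global_step advances once per 4 batches, so
# validation runs every 1000 optimizer steps, i.e. every 4000 batches. The
# stock Lightning check would instead fire every 1000 batches (250 steps).
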
class LightningCLI(cli.LightningCLI):
    def __init__(self, *args, **kwargs):
        logging.getLogger().setLevel(logging.INFO)
        torch.set_float32_matmul_precision("medium")
        torch._dynamo.config.capture_scalar_outputs = True
        torch._dynamo.config.suppress_errors = True
        # Silence known-benign warnings from torchmetrics, DDP, and the LR
        # scheduler ordering check.
        warnings.filterwarnings(
            "ignore",
            message=r".*It is recommended to use .* when logging on epoch level in distributed setting to accumulate the metric across devices.*",
        )
        warnings.filterwarnings(
            "ignore",
            message=r"^The ``compute`` method of metric PanopticQuality was called before the ``update`` method.*",
        )
        warnings.filterwarnings(
            "ignore", message=r"^Grad strides do not match bucket view strides.*"
        )
        warnings.filterwarnings(
            "ignore",
            message=r".*Detected call of `lr_scheduler\.step\(\)` before `optimizer\.step\(\)`.*",
        )
        warnings.filterwarnings(
            "ignore",
            message=r".*functools\.partial will be a method descriptor in future Python versions.*",
        )
        super().__init__(*args, **kwargs)

    def add_arguments_to_parser(self, parser):
        parser.add_argument("--compile_disabled", action="store_true")
        parser.link_arguments(
            "data.init_args.num_classes", "model.init_args.num_classes"
        )
        parser.link_arguments(
            "data.init_args.num_classes",
            "model.init_args.network.init_args.num_classes",
        )
        parser.link_arguments(
            "data.init_args.stuff_classes", "model.init_args.stuff_classes"
        )
        parser.link_arguments("data.init_args.img_size", "model.init_args.img_size")
        parser.link_arguments(
            "data.init_args.img_size", "model.init_args.network.init_args.img_size"
        )
        parser.link_arguments(
            "data.init_args.img_size",
            "model.init_args.network.init_args.encoder.init_args.img_size",
        )
        parser.link_arguments(
            "model.init_args.ckpt_path",
            "model.init_args.network.init_args.encoder.init_args.ckpt_path",
        )

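    # The links above mean shared values are declared once under `data` and
    # propagated into the model/network/encoder configs. A minimal sketch of
    # the resulting YAML (hypothetical class paths and values):
    #
    #   data:
    #     class_path: datasets.SomeDataModule   # hypothetical
    #     init_args:
    #       num_classes: 19
    #       img_size: [1024, 1024]
    #   model:
    #     class_path: training.SomeModule       # hypothetical
    #     init_args: {}  # num_classes / img_size filled in by link_arguments
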
    def fit(self, model, **kwargs):
        # Upload source files to the experiment tracker (e.g. Weights & Biases)
        # if the logger supports it, skipping anything matched by .gitignore.
        if hasattr(self.trainer.logger.experiment, "log_code"):
            is_gitignored = parse_gitignore(".gitignore")
            include_fn = lambda path: path.endswith((".py", ".yaml"))
            self.trainer.logger.experiment.log_code(
                ".", include_fn=include_fn, exclude_fn=is_gitignored
            )
        # Bind the patched validation-check logic onto this trainer's epoch loop.
        self.trainer.fit_loop.epoch_loop._should_check_val_fx = MethodType(
            _should_check_val_fx, self.trainer.fit_loop.epoch_loop
        )
        if not self.config[self.config["subcommand"]]["compile_disabled"]:
            model = torch.compile(model)
        self.trainer.fit(model, **kwargs)

def cli_main():
    LightningCLI(
        LightningModule,
        LightningDataModule,
        subclass_mode_model=True,
        subclass_mode_data=True,
        save_config_callback=None,
        seed_everything_default=0,
        trainer_defaults={
            "precision": "16-mixed",
            "enable_model_summary": False,
            "callbacks": [
                ModelSummary(max_depth=3),
                LearningRateMonitor(logging_interval="epoch"),
            ],
            "devices": 1,
            "gradient_clip_val": 0.01,
            "gradient_clip_algorithm": "norm",
        },
    )


if __name__ == "__main__":
    cli_main()
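
# Example invocation (hypothetical config path; flags follow the Lightning CLI
# conventions set up above, plus the --compile_disabled switch added in
# add_arguments_to_parser):
#
#   python main.py fit --config configs/example.yaml
#   python main.py fit --config configs/example.yaml --compile_disabled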