10 | 10 | import unittest
11 | 11 |
12 | 12 | from copy import deepcopy
13 | | -from typing import TypeVar
| 13 | +from typing import Tuple, TypeVar
14 | 14 | from unittest.mock import MagicMock, patch
15 | 15 |
16 | 16 | import torch
27 | 27 |
28 | 28 | from torchtnt.framework.auto_unit import AutoPredictUnit, SWALRParams, SWAParams
29 | 29 | from torchtnt.framework.evaluate import evaluate
| 30 | +from torchtnt.framework.fit import fit
30 | 31 | from torchtnt.framework.predict import predict
| 32 | +from torchtnt.framework.state import ActivePhase, State
31 | 33 | from torchtnt.framework.train import train
32 | 34 | from torchtnt.utils.distributed import spawn_multi_process
33 | 35 | from torchtnt.utils.env import init_from_env, seed
38 | 40 | T = TypeVar("T")
39 | 41 |
40 | 42 |
| 43 | +Batch = Tuple[torch.Tensor, torch.Tensor]
| 44 | +
| 45 | +
| 46 | +class DummySWAAutoUnit(DummyAutoUnit):
| 47 | +    def compute_loss(self, state: State, data: Batch) -> Tuple[torch.Tensor, object]:
| 48 | +        """
| 49 | +        Computes loss for the given batch. In the EVAL or PREDICT phase, uses the SWA model's output.
| 50 | +        """
| 51 | +        inputs, targets = data
| 52 | +        if state.active_phase == ActivePhase.TRAIN:
| 53 | +            outputs = self.module(inputs)
| 54 | +        else:
| 55 | +            outputs = self.swa_model(inputs) if self.swa_model else self.module(inputs)
| 56 | +
| 57 | +        loss = torch.nn.functional.cross_entropy(outputs, targets)
| 58 | +
| 59 | +        return loss, outputs
| 60 | +
| 61 | +
41 | 62 | class TestAutoUnitGPU(unittest.TestCase):
42 | 63 |     @skip_if_not_gpu
43 | 64 |     @skip_if_not_distributed

@@ -184,6 +205,112 @@ def forward(self, x):

184 | 205 |             for p1, p2 in zip(swa_params, swa_fsdp_params, strict=True):
185 | 206 |                 torch.testing.assert_close(p2, p1, check_device=False)
186 | 207 |
| 208 | +    @skip_if_not_distributed
| 209 | +    @skip_if_not_gpu
| 210 | +    def test_stochastic_weight_averaging_fsdp_with_eval(self) -> None:
| 211 | +        """
| 212 | +        Test that SWA params with FSDP are identical to non-FSDP SWA params
| 213 | +        """
| 214 | +        spawn_multi_process(
| 215 | +            2,
| 216 | +            "nccl",
| 217 | +            self._test_stochastic_weight_averaging_fsdp_with_eval,
| 218 | +        )
| 219 | +
| 220 | +    @staticmethod
| 221 | +    def _test_stochastic_weight_averaging_fsdp_with_eval() -> None:
| 222 | +        """
| 223 | +        Compares the SWA model parameters after training without FSDP and with FSDP.
| 224 | +        They should be identical.
| 225 | +        """
| 226 | +
| 227 | +        class Net(torch.nn.Module):
| 228 | +            def __init__(self):
| 229 | +                super(Net, self).__init__()
| 230 | +                self.l1 = torch.nn.Linear(2, 2)
| 231 | +                self.b1 = torch.nn.BatchNorm1d(2)
| 232 | +                self.l2 = torch.nn.Linear(2, 2)
| 233 | +
| 234 | +            def forward(self, x):
| 235 | +                x = self.l1(x)
| 236 | +                x = self.b1(x)
| 237 | +                x = self.l2(x)
| 238 | +                return x
| 239 | +
| 240 | +        # so that all ranks start with the same initialized weights
| 241 | +        device = init_from_env()
| 242 | +        seed(0)
| 243 | +        my_module = Net()
| 244 | +
| 245 | +        auto_unit = DummySWAAutoUnit(
| 246 | +            module=deepcopy(my_module),
| 247 | +            device=device,
| 248 | +            step_lr_interval="step",
| 249 | +            swa_params=SWAParams(
| 250 | +                warmup_steps_or_epochs=1,
| 251 | +                step_or_epoch_update_freq=1,
| 252 | +                swalr_params=SWALRParams(
| 253 | +                    anneal_steps_or_epochs=3,
| 254 | +                ),
| 255 | +                averaging_method="ema",
| 256 | +            ),
| 257 | +        )
| 258 | +
| 259 | +        auto_unit_fsdp = DummySWAAutoUnit(
| 260 | +            module=my_module,
| 261 | +            device=device,
| 262 | +            step_lr_interval="step",
| 263 | +            strategy=FSDPStrategy(),
| 264 | +            swa_params=SWAParams(
| 265 | +                warmup_steps_or_epochs=1,
| 266 | +                step_or_epoch_update_freq=1,
| 267 | +                swalr_params=SWALRParams(
| 268 | +                    anneal_steps_or_epochs=3,
| 269 | +                ),
| 270 | +                averaging_method="ema",
| 271 | +            ),
| 272 | +        )
| 273 | +
| 274 | +        input_dim = 2
| 275 | +        dataset_len = 10
| 276 | +        batch_size = 2
| 277 | +
| 278 | +        dataloader = generate_random_dataloader(dataset_len, input_dim, batch_size)
| 279 | +        eval_dataloader = generate_random_dataloader(dataset_len, input_dim, batch_size)
| 280 | +        fit(
| 281 | +            auto_unit,
| 282 | +            dataloader,
| 283 | +            eval_dataloader,
| 284 | +            max_epochs=3,
| 285 | +            max_train_steps_per_epoch=5,
| 286 | +            evaluate_every_n_epochs=0,
| 287 | +        )
| 288 | +
| 289 | +        fit(
| 290 | +            auto_unit_fsdp,
| 291 | +            dataloader,
| 292 | +            eval_dataloader,
| 293 | +            max_epochs=3,
| 294 | +            max_train_steps_per_epoch=5,
| 295 | +            # this is the key arg: it ensures the SWA model is still updated
| 296 | +            # even after the SWA model's forward pass is used in eval
| 297 | +            evaluate_every_n_epochs=1,
| 298 | +        )
| 299 | +
| 300 | +        swa_params = list(auto_unit.swa_model.parameters())
| 301 | +        swa_buffers = list(auto_unit.swa_model.buffers())
| 302 | +        with FSDP.summon_full_params(auto_unit_fsdp.swa_model):
| 303 | +            swa_fsdp_params = auto_unit_fsdp.swa_model.parameters()
| 304 | +            swa_fsdp_buffers = auto_unit_fsdp.swa_model.buffers()
| 305 | +
| 306 | +            # Iterate and compare each parameter
| 307 | +            for p1, p2 in zip(swa_params, swa_fsdp_params, strict=True):
| 308 | +                torch.testing.assert_close(p2, p1, check_device=False)
| 309 | +
| 310 | +            # Iterate and compare each buffer
| 311 | +            for b1, b2 in zip(swa_buffers, swa_fsdp_buffers, strict=True):
| 312 | +                torch.testing.assert_close(b2, b1, check_device=False)
| 313 | +
187 | 314 |     @skip_if_not_gpu
188 | 315 |     @patch("torch.autocast")
189 | 316 |     def test_eval_mixed_precision_bf16(self, mock_autocast: MagicMock) -> None:
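For reference, the eval-through-SWA pattern exercised by the new test can be sketched with core PyTorch alone. The snippet below is a minimal, illustrative example rather than TorchTNT's implementation: it keeps an EMA-averaged copy of a model (roughly what swa_params with averaging_method="ema" configures inside AutoUnit) and routes the eval forward pass through the averaged weights. The 0.999 decay, toy data, and loop structure are assumptions made for this sketch.

import torch
from torch.optim.swa_utils import AveragedModel

model = torch.nn.Linear(2, 2)
# avg_fn implements an exponential moving average of the weights (assumed decay of 0.999)
ema_model = AveragedModel(
    model, avg_fn=lambda avg, new, num_averaged: 0.999 * avg + 0.001 * new
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for _ in range(5):  # toy training loop on random data
    inputs = torch.randn(4, 2)
    targets = torch.randint(0, 2, (4,))
    loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema_model.update_parameters(model)  # EMA update after every optimizer step

# The eval-phase forward pass goes through the averaged weights, mirroring what
# DummySWAAutoUnit.compute_loss does for non-TRAIN phases.
ema_model.eval()
with torch.no_grad():
    eval_outputs = ema_model(torch.randn(4, 2))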