
Commit af06270

test: check ckpt resuming
Parent: 96a0e82

3 files changed (+51, −27 lines)


open_diloco/train_fsdp.py

Lines changed: 2 additions & 1 deletion
@@ -506,7 +506,8 @@ def scheduler_fn(opt):
         if config.max_steps is not None and real_step >= config.max_steps:
             break
     log("Training completed.")
-    metric_logger.finish()
+    if rank == 0:
+        metric_logger.finish()


 if __name__ == "__main__":
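The change guards metric_logger.finish() behind rank == 0, so only one worker finalises the metrics file in a multi-process run. A minimal sketch of that rank-guard pattern, assuming the RANK environment variable that torchrun sets per worker; the logger class is an illustrative stand-in, not the repo's API:

import os


class StubLogger:
    """Illustrative stand-in for the training script's metric logger."""

    def finish(self) -> None:
        print("flushing metrics to disk")


def shutdown(logger: StubLogger) -> None:
    # torchrun exports RANK for each worker process; only rank 0 flushes,
    # so concurrent workers do not all write the same output file.
    rank = int(os.environ.get("RANK", "0"))
    if rank == 0:
        logger.finish()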

open_diloco/utils.py

Lines changed: 3 additions & 4 deletions
@@ -1,6 +1,6 @@
 import hashlib
 from functools import partial
-import json
+import pickle
 from typing import Any, Generator, Protocol

 import torch
@@ -210,6 +210,5 @@ def log(self, metrics: dict[str, Any]):
         self.data.append(metrics)

     def finish(self):
-        with open(self.project, "a") as f:
-            for d in self.data:
-                f.write(json.dumps(d) + "\n")
+        with open(self.project, "wb") as f:
+            pickle.dump(self.data, f)
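With this change the dummy logger no longer appends JSON lines; finish() serialises the whole metrics list as a single pickle payload, which is what the updated test reads back. A minimal sketch of the round trip under that assumption (the literal file name and metric values are illustrative):

import pickle

# Write side: same shape as DummyLogger.finish() above, one pickle dump of
# the accumulated list of metric dicts.
metrics = [{"step": 10, "Loss": 2.31, "lr": 3e-4}]
with open("log1.json", "wb") as f:  # the test keeps a .json name even though the payload is pickle
    pickle.dump(metrics, f)

# Read side: what the test does after each training run.
with open("log1.json", "rb") as f:
    loaded = pickle.load(f)
assert loaded == metrics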

tests/test_training/test_train.py

Lines changed: 46 additions & 22 deletions
@@ -1,17 +1,10 @@
+import pickle
 import subprocess
+import numpy as np
 import pytest
 import socket
-import os
-from unittest import mock
 from hivemind.dht.dht import DHT
-
-
-@pytest.fixture(autouse=True)
-def set_env():
-    os.environ["WANDB_MODE"] = "disabled"
-
-    with mock.patch.dict(os.environ, {"WANDB_MODE": "disabled"}):
-        yield
+from open_diloco.ckpt_utils import CKPT_PREFIX


 def get_random_available_port():
@@ -41,25 +34,54 @@ def config() -> list[str]:
         "16",
         "--max_steps",
         "50",
+        "--metric_logger_type",
+        "dummy",
     ]


-@pytest.mark.parametrize("num_gpu", [1, 2])
-def test_multi_gpu(config, random_available_port, num_gpu):
-    result = subprocess.run(
-        [
-            "torchrun",
-            f"--nproc_per_node={num_gpu}",
-            "--rdzv-endpoint",
-            f"localhost:{random_available_port}",
-            "open_diloco/train_fsdp.py",
-            *config,
-        ],
-    )
+@pytest.mark.parametrize("num_gpu", [2])
+def test_multi_gpu_ckpt(config, random_available_port, num_gpu, tmp_path):
+    ckpt_path = f"{tmp_path}/ckpt"
+    log_file_1 = f"{tmp_path}/log1.json"
+    log_file_2 = f"{tmp_path}/log2.json"
+
+    run_1 = ["--ckpt.path", ckpt_path, "--ckpt.interval", "10", "--project", log_file_1]
+
+    cmd = [
+        "torchrun",
+        f"--nproc_per_node={num_gpu}",
+        "--rdzv-endpoint",
+        f"localhost:{random_available_port}",
+        "open_diloco/train_fsdp.py",
+        *config,
+    ]
+
+    result = subprocess.run(cmd + run_1)

     if result.returncode != 0:
         pytest.fail(f"Process {result} failed {result.stderr}")

+    run_2 = ["--ckpt.path", ckpt_path, "--ckpt.resume", f"{ckpt_path}/{CKPT_PREFIX}_20", "--project", log_file_2]
+
+    results_resume = subprocess.run(cmd + run_2)
+
+    if results_resume.returncode != 0:
+        pytest.fail(f"Process {result} failed {result.stderr}")
+
+    with open(log_file_1, "rb") as f:
+        log1 = pickle.load(f)
+    with open(log_file_2, "rb") as f:
+        log2 = pickle.load(f)
+
+    log1 = {data["step"]: [data["Loss"], data["lr"]] for data in log1}
+    log2 = {data["step"]: [data["Loss"], data["lr"]] for data in log2}
+
+    common_step = set(log1.keys()) & set(log2.keys())
+
+    for step in common_step:
+        assert np.allclose(log1[step][0], log2[step][0], atol=1e-3), f"Loss at step {step} is different"
+        assert log1[step][1] == log2[step][1], f"Lr at step {step} is different"
+

 @pytest.fixture
 def config_hv() -> list[str]:
@@ -76,6 +98,8 @@ def config_hv() -> list[str]:
         "16",
         "--max_steps",
         "100",
+        "--metric_logger_type",
+        "dummy",
     ]

     return config + [
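The new test boils down to: train once for 50 steps while checkpointing every 10, train again from the step-20 checkpoint into a second log file, and require that loss and learning rate agree on every step both runs recorded. A standalone sketch of that comparison, assuming the pickle log format above (the helper name is ours, not part of the repo):

import numpy as np


def check_resume_consistency(log1: list[dict], log2: list[dict], atol: float = 1e-3) -> None:
    # Index each run by step, keeping (loss, lr) pairs as the test does.
    run1 = {d["step"]: (d["Loss"], d["lr"]) for d in log1}
    run2 = {d["step"]: (d["Loss"], d["lr"]) for d in log2}

    # Only steps present in both runs are comparable: in this test the
    # resumed run covers roughly steps 20-50, the original run 0-50.
    for step in sorted(run1.keys() & run2.keys()):
        assert np.allclose(run1[step][0], run2[step][0], atol=atol), f"Loss differs at step {step}"
        assert run1[step][1] == run2[step][1], f"lr differs at step {step}"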
