@@ -7,6 +7,7 @@
 """
 
 from functools import partial
+import math
 import os
 import time
 from contextlib import nullcontext
@@ -26,7 +27,6 @@
     DataCollatorForLanguageModeling,
     LlamaConfig,
     LlamaForCausalLM,
-    get_cosine_schedule_with_warmup,
 )
 from torch.distributed.fsdp import (
     FullyShardedDataParallel as FSDP,
@@ -46,6 +46,7 @@
 )
 from open_diloco.hivemind_diloco import AllReduceStrategy, DiLoCoOptimizer
 from open_diloco.utils import WandbLogger, DummyLogger
+from torch.optim.lr_scheduler import LambdaLR
 
 from hivemind.dht.dht import DHT
 from hivemind.utils.networking import log_visible_maddrs
@@ -173,6 +174,27 @@ def get_model(config: Config) -> LlamaForCausalLM:
     return LlamaForCausalLM.from_pretrained(pretrained_model_name_or_path=config.path_model, config=config_model)
 
 
+def _get_cosine_schedule_with_warmup_lr_lambda(
+    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_rate: float = 0.0
+):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+    factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    factor = factor * (1 - min_lr_rate) + min_lr_rate
+    return max(0, factor)
+
+
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_inner_steps):
+    lambda_lr = partial(
+        _get_cosine_schedule_with_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=0.5,
+    )
+    return LambdaLR(optimizer, lambda_lr, -1)
+
+
 def train(config: Config):
     sharding_strategy = get_sharding_strategy(config.sharding_strategy)
     local_rank = int(os.environ["LOCAL_RANK"])
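The two helpers above vendor the cosine-with-warmup schedule that was previously imported from transformers: the learning-rate factor ramps linearly from 0 to 1 over num_warmup_steps, then follows half a cosine period (num_cycles=0.5) down to zero at num_training_steps (min_lr_rate is never overridden by the factory). The extra num_inner_steps argument is accepted for the DiLoCo call site but is not used by the lambda shown here. A minimal sketch of driving the factory with a throwaway optimizer, assuming the file above is importable as open_diloco.train_fsdp (hypothetical import path):

import torch

# Hypothetical import path for the training script shown in the diff above.
from open_diloco.train_fsdp import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)
opt = torch.optim.AdamW(model.parameters(), lr=4e-4)
# 10 warmup steps, 100 total steps; num_inner_steps=50 is a placeholder value.
sched = get_cosine_schedule_with_warmup(opt, num_warmup_steps=10, num_training_steps=100, num_inner_steps=50)

for step in range(100):
    opt.step()   # no gradients needed just to inspect the schedule
    sched.step()
    if step in (0, 9, 54, 99):
        # Expect roughly lr/10 during warmup, lr at the end of warmup,
        # lr/2 halfway through the cosine decay, and ~0 at the last step.
        print(step, sched.get_last_lr()[0])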
@@ -254,6 +276,7 @@ def scheduler_fn(opt):
             opt,
             num_warmup_steps=config.warmup_steps,
             num_training_steps=config.total_steps,
+            num_inner_steps=config.hv.local_steps,
         )
 
     if config.hv is not None:
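The hunk above only touches the keyword arguments of the call inside scheduler_fn. Pieced together with the new factory, the closure presumably reads roughly as follows; this is a sketch, not the exact repository code, using only the fields visible in the diff (config.warmup_steps, config.total_steps, config.hv.local_steps):

def scheduler_fn(opt):
    # Defined inside train() as a closure over config: builds the LR scheduler
    # for the inner optimizer, forwarding DiLoCo's inner-step count alongside
    # the usual warmup/total step counts.
    return get_cosine_schedule_with_warmup(
        opt,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=config.total_steps,
        num_inner_steps=config.hv.local_steps,
    )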