|
5 | 5 | # LICENSE file in the root directory of this source tree. |
6 | 6 |
|
7 | 7 |
|
| 8 | +import asyncio |
8 | 9 | import logging |
9 | 10 | import math |
10 | 11 | import os |
11 | 12 | from typing import Any |
12 | 13 |
|
13 | 14 | import torch |
14 | 15 | import torchtitan.experiments.forge.train_spec as forge_train_spec |
| 16 | + |
| 17 | +# from tqdm import tqdm |
| 18 | + |
| 19 | +from forge.controller import ForgeActor |
15 | 20 | from monarch.actor import current_rank, current_size, endpoint |
16 | 21 | from omegaconf import DictConfig, OmegaConf |
17 | 22 | from torch import nn |
| 23 | +from torchstore import MultiProcessStore |
| 24 | +from torchstore._state_dict_utils import push_state_dict |
18 | 25 | from torchtitan.components.loss import LossFunction |
19 | 26 |
|
20 | 27 | # from torchdata.stateful_dataloader import StatefulDataLoader |
|
25 | 32 | from torchtitan.experiments.forge.engine import ForgeEngine |
26 | 33 | from torchtitan.experiments.forge.job_config import ForgeJobConfig |
27 | 34 |
|
28 | | -# from tqdm import tqdm |
29 | | - |
30 | | -from forge.controller import ForgeActor |
31 | | - |
32 | 35 | # from forge.interfaces import RLLoss |
33 | 36 |
|
34 | 37 | # stubs for now |
@@ -68,6 +71,10 @@ def __init__(self, config: DictConfig): |
68 | 71 | self.gradient_accumulation_steps = 1 # Example value, adjust as needed |
69 | 72 | self._rank = current_rank().rank |
70 | 73 | self._size = math.prod(current_size().values()) |
| 74 | + |
| 75 | +        # Initialize the torchstore instance used to publish weights. |
| 76 | + self._tstore = asyncio.run(MultiProcessStore.create_store()) |
| 77 | + |
71 | 78 | self._init_dist() |
72 | 79 | super().__init__(job_config) |
73 | 80 |
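
> Note on the `__init__` hunk above: `MultiProcessStore.create_store()` is driven as a coroutine, so the synchronous constructor bridges it with `asyncio.run()`. Below is a minimal standalone sketch of that bridging pattern, assuming no event loop is already running in the trainer process; the `WeightPublisher` class name is illustrative and not part of this change.

```python
import asyncio

from torchstore import MultiProcessStore


class WeightPublisher:
    """Illustrative helper mirroring the trainer's torchstore setup (not part of this diff)."""

    def __init__(self) -> None:
        # create_store() is awaited via asyncio.run(), which blocks until the
        # store exists. This assumes no event loop is running in this thread;
        # if one were, the store would have to be created from async code instead.
        self._tstore = asyncio.run(MultiProcessStore.create_store())
```

> One consequence of this pattern: `asyncio.run()` creates and tears down its own event loop, so any later async torchstore calls would need their own loop or an explicit runner.
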
|
@@ -201,6 +208,15 @@ def train_step(self, batch) -> None: |
201 | 208 | # self.profiler.step() |
202 | 209 | self.current_step += 1 |
203 | 210 |
|
| 211 | +        # Save to torchstore. Hacking into the Checkpointer's prepped state dict for now. |
| 212 | +        # TODOs: |
| 213 | +        # 1. Figure out whether there is value in calling state_dict_adapter.to_hf(). |
| 214 | +        # 2. The Checkpointer invokes state-dict flattening during dcp_save for [MODEL]; |
| 215 | +        #    we may need to replicate the same in this code path. |
| 216 | +        # 3. Integrate a zero-overhead version of push_state_dict. |
| 217 | +        # 4. Figure out a way to notify the generator app that weights are ready. This is beyond the scope of the initial integration. |
| 218 | +        # 5. Unify the CheckpointManager and TorchStore weight-save control paths. |
| 219 | + push_state_dict(self._tstore, self.checkpointer.states, f"v{self.current_step}") |
204 | 220 | # if self.current_step % self.train_config.val_every_n_steps == 0: |
205 | 221 | # self.validate() |
206 | 222 | self.checkpointer.save( |
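
> On TODO 4 above: one possible direction, sketched purely as an illustration, is to reuse the same `push_state_dict` path to publish a tiny "latest version" record alongside the versioned weights, so a generator can discover the newest `v{step}` key by polling a well-known key. Everything here beyond `push_state_dict` and the `v{step}` naming (both taken from the diff) is an assumption, including the `latest` key, the single-tensor payload, and the synchronous call style, which simply mirrors the call in `train_step` above.

```python
import torch
from torchstore import MultiProcessStore
from torchstore._state_dict_utils import push_state_dict


def publish_versioned_weights(store: MultiProcessStore, state: dict, step: int) -> None:
    # Push the prepared state dict under the same versioned key scheme used
    # in train_step: "v{current_step}". Call style (no await) mirrors the diff.
    push_state_dict(store, state, f"v{step}")
    # Illustrative only: advertise the newest version under a fixed key so a
    # generator can poll "latest" instead of guessing step numbers. The
    # "latest" key and single-tensor payload are assumptions, not conventions
    # established by this change.
    push_state_dict(store, {"latest_step": torch.tensor(step)}, "latest")
```

> Whether this counts as "notifying" the generator depends on the generator polling the store; a true push-style notification would need a separate channel.
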
|