Merge branch 'main' into weight_sync

Allen Wang · Allen Wang · commit 4a7281e5d1b2 · 2025-11-06T09:01:25.000-08:00
diff --git a/.meta/mast/README.md b/.meta/mast/README.md
@@ -5,8 +5,6 @@ This only applies to Meta internal users.
 
 ## Quick Start
 
-⚠️ Important Note: the setup script will clone the forge repository under "/data/users/$USER".
-
 ### 1. Run the Setup Script
 
 The `env_setup.sh` script will automatically:
diff --git a/apps/grpo/main.py b/apps/grpo/main.py
@@ -212,6 +212,7 @@ class DatasetActor(ForgeActor):
     @endpoint
     def setup(self):
         self._tokenizer = get_tokenizer(self.model)
+        self._epoch = 0
 
         def gsm8k_transform(sample):
             system_prompt = """
@@ -232,12 +233,12 @@ def gsm8k_transform(sample):
             formatted_target = target.split("#### ")[1]
             return {"request": formatted_request, "target": formatted_target}
 
-        ds = load_dataset(
+        self._base_dataset = load_dataset(
             self.path, self.revision, split=self.data_split, streaming=self.streaming
         )
-        ds = ds.map(gsm8k_transform)
-        ds = ds.shuffle()
-        self._iterator = iter(ds)
+        self._base_dataset = self._base_dataset.map(gsm8k_transform)
+        self._base_dataset = self._base_dataset.shuffle()
+        self._iterator = iter(self._base_dataset)
 
     @endpoint
     async def sample(self) -> dict[str, str] | None:
@@ -250,10 +251,18 @@ async def sample(self) -> dict[str, str] | None:
                 len(sample["request"]),
                 Reduce.MEAN,
             )
+            record_metric("dataset/sample/current_epoch", self._epoch, Reduce.MAX)
 
             return sample
         except StopIteration:
-            return None
+            # Restart iterator for next epoch with reshuffling
+            self._epoch += 1
+            print(
+                f"Dataset epoch {self._epoch - 1} completed. Starting epoch {self._epoch}"
+            )
+            self._base_dataset.set_epoch(self._epoch)
+            self._iterator = iter(self._base_dataset)
+            return next(self._iterator)
 
     @endpoint
     async def pad_token(self):
diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-local_batch_size: 16 # per-device batch size
+local_batch_size: 12 # per-device batch size
 max_req_tokens: 1024
 max_res_tokens: 1024
 model: "Qwen/Qwen3-8B"
diff --git a/apps/sft/main.py b/apps/sft/main.py
@@ -154,6 +154,15 @@ def setup_data(self):
             generation_config_path=os.path.join(
                 self.job_config.model.hf_assets_path, "generation_config.json"
             ),
+            chat_template_path=(
+                path
+                if os.path.exists(
+                    path := os.path.join(
+                        self.job_config.model.hf_assets_path, "chat_template.jinja"
+                    )
+                )
+                else None
+            ),
         )
 
         dataset = sft_iterable_dataset(
diff --git a/docs/source/getting_started.md b/docs/source/getting_started.md
@@ -124,6 +124,7 @@ After installation, verify that all components are working correctly:
 
    # Test basic Monarch functionality
    procs = this_host().spawn_procs({'gpus': 1})
+   procs.initialized.get()
    print('Monarch: Process spawning works')
    "
    ```
diff --git a/src/forge/actors/trainer.py b/src/forge/actors/trainer.py
@@ -176,8 +176,7 @@ async def train_step(
 
         # TODO: delete item() to avoid cpu-gpu sync
         loss = loss.detach().item()
-        record_metric("rl_trainer/count_training_steps", 1, Reduce.SUM)
-        record_metric("rl_trainer/avg_grpo_loss", loss, Reduce.MEAN)
+        record_metric("rl_trainer/avg_loss", loss, Reduce.MEAN)
 
         # These are placeholder values until the loss function exposes these metrics
         # record_metric("rl_trainer/step/avg_kl_divergence", 0.0, Reduce.MEAN)
diff --git a/src/forge/data/tokenizer.py b/src/forge/data/tokenizer.py
@@ -215,8 +215,8 @@ class HuggingFaceModelTokenizer(ModelTokenizer):
     Args:
         tokenizer_json_path (str): Path to tokenizer.json file
         tokenizer_config_json_path (str | None): Path to tokenizer_config.json file. Default: None
-        generation_config_path (str | None): Path to generation_config.json file.
-            Default: None
+        generation_config_path (str | None): Path to generation_config.json file. Default: None
+        chat_template_path (str | None): Path to chat_template.jinja file. Default: None
         truncation_type (str): type of truncation to apply, either "left" or "right".
             Default is "right".
     """
@@ -227,6 +227,7 @@ def __init__(
         *,
         tokenizer_config_json_path: str | None = None,
         generation_config_path: str | None = None,
+        chat_template_path: str | None = None,
         truncation_type: str = "right",
     ):
         self.base_tokenizer = HuggingFaceBaseTokenizer(
@@ -245,7 +246,13 @@ def __init__(
 
         # It is used sometimes in HF chat_templates
         _env.globals["raise_exception"] = self._raise_helper
-        self.template = _env.from_string(config["chat_template"])
+
+        if chat_template_path:
+            with open(chat_template_path, "r") as f:
+                self.template = _env.from_string(f.read())
+        else:
+            self.template = _env.from_string(config["chat_template"])
+
         self.truncation_type = truncation_type
 
         self.special_tokens_mapping = {}
diff --git a/src/forge/observability/metric_actors.py b/src/forge/observability/metric_actors.py
@@ -437,12 +437,12 @@ def extract_values_from_valuemesh(results) -> list[dict[str, Any]]:
                 await backend.log_batch(reduced_metrics, global_step)
 
     @endpoint
-    def has_fetcher(self, proc_id: str) -> bool:
+    async def has_fetcher(self, proc_id: str) -> bool:
         """Check if a fetcher is registered with the given proc_id."""
         return proc_id in self.fetchers
 
     @endpoint
-    def get_fetcher_count(self) -> int:
+    async def get_fetcher_count(self) -> int:
         return len(self.fetchers)
 
     @endpoint

Original file line number	Diff line number	Diff line change
`@@ -124,6 +124,7 @@ After installation, verify that all components are working correctly:`
`124`	`124`
`125`	`125`	`# Test basic Monarch functionality`
`126`	`126`	`procs = this_host().spawn_procs({'gpus': 1})`
	`127`	`+ procs.initialized.get()`
`127`	`128`	`print('Monarch: Process spawning works')`
`128`	`129`	`"`
`129`	`130`	```