 # LICENSE file in the root directory of this source tree.

 import asyncio
+import logging
 import time
 from dataclasses import dataclass
 from typing import Callable

 import torch
 from datasets import load_dataset
 from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig
+from forge.actors.reference_actor import compute_sequence_logprobs, TitanRefModel
 from forge.actors.replay_buffer import ReplayBuffer
-from forge.controller import ServiceConfig, spawn_service
 from forge.controller.actor import ForgeActor
+from forge.controller.service import ServiceConfig, shutdown_service, spawn_service
 from forge.data.rewards import MathReward, ThinkingReward
 from forge.util.metric_logging import get_metric_logger
 from monarch.actor import endpoint
+from torchtitan.config.job_config import Model as TitanJobModelConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer

-
-def compute_sequence_logprobs(
-    model: torch.nn.Module,
-    input_ids: torch.Tensor,
-    attention_mask: torch.Tensor,
-    requires_grad: bool = True,
-) -> torch.Tensor:
-    context_manager = torch.enable_grad() if requires_grad else torch.no_grad()
-
-    with context_manager:
-        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-        logits = outputs.logits
-
-        # Apply log softmax to get log probabilities
-        log_probs = torch.log_softmax(logits, dim=-1)
-
-        # Extract log probabilities for the actual tokens (excluding the first token for next-token prediction)
-        shifted_input_ids = input_ids[:, 1:]  # Remove first token
-        shifted_log_probs = log_probs[:, :-1, :]  # Remove last logit
-
-        # Gather log probabilities for actual tokens
-        token_log_probs = torch.gather(
-            shifted_log_probs, dim=-1, index=shifted_input_ids.unsqueeze(-1)
-        ).squeeze(-1)
-
-        # Sum log probabilities across sequence (masked by attention)
-        shifted_attention_mask = attention_mask[:, 1:]
-        sequence_log_probs = (token_log_probs * shifted_attention_mask).sum(dim=-1)
-
-    return sequence_log_probs
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)


 @dataclass
@@ -269,63 +244,21 @@ async def __call__(self, groups: list[Group]) -> list[float]:
         return advantages


-class RefModel(ForgeActor):
-    def __init__(self, model_name, device: torch.device | None = None):
-        super().__init__()
-        self.model_name = model_name
-
-        # Set device
-        if device is None:
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        else:
-            self.device = device
-
-        # Initialize model and tokenizer
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.bfloat16,
-            trust_remote_code=True,
-        ).to(self.device)
-
-        # Set model to eval mode for reference computations
-        self.model.eval()
-
-        self.logger.info(f"Model initialized on {self.device}")
-
-    @endpoint
-    async def forward(self, token_ids: list[int]) -> torch.Tensor:
-        # Use provided token_ids directly
-        input_ids = (
-            torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(self.device)
-        )
-        # Create attention mask of all 1s since we have actual tokens (no padding)
-        attention_mask = torch.ones_like(input_ids).to(self.device)
-
-        # Compute log probabilities using shared utility function
-        sequence_log_probs = compute_sequence_logprobs(
-            self.model, input_ids, attention_mask, requires_grad=False
-        )
-
-        return (
-            sequence_log_probs.squeeze()
-        )  # Remove batch dimension for single response
-
-
 class DatasetActor(ForgeActor):
     """Actor wrapper for HuggingFace dataset to provide async interface."""

-    def __init__(self, *args, **kwargs):
+    def __init__(
+        self, path: str, config_name: str, split: str, streaming: bool, **kwargs
+    ):
         super().__init__()
-        self._setup_dataset(*args, **kwargs)

-    def _setup_dataset(self, *args, **kwargs):
         def gsm8k_to_messages(sample):
             question = sample["question"]
             full_answer: str = sample["answer"]
             answer = full_answer.split("#### ")[1]
             return {"question": question, "answer": answer}

-        ds = load_dataset(*args, **kwargs)
+        ds = load_dataset(path, config_name, split=split, streaming=streaming)
         ds = ds.map(gsm8k_to_messages)
         ds = ds.shuffle()
         self._iterator = iter(ds)
@@ -341,7 +274,8 @@ async def __next__(self) -> dict[str, str] | None:
 async def main():
     """Main GRPO training loop with rollout and training processes."""
     group_size = 1
-    model = "Qwen/Qwen3-1.7B"
+    model = "Qwen/Qwen3-0.6B"
+    titan_model = TitanJobModelConfig(name="qwen3", flavor="0.6B")

     # ---- Setup WandB Logger ---- #
     logger = get_metric_logger(
@@ -351,74 +285,69 @@ async def main():
     )

     # ---- Setup services ---- #
-    default_service_cfg = ServiceConfig(
-        procs_per_replica=1,
-        num_replicas=1,
-    )
-
-    policy = await spawn_service(
-        default_service_cfg,
-        Policy,
-        PolicyConfig(
-            num_workers=1,
-            worker_params=WorkerConfig(model=model),
-            sampling_params=SamplingOverrides(num_samples=group_size, max_tokens=16),
-            available_devices="3",
+    (
+        dataloader,
+        policy,
+        trainer,
+        replay_buffer,
+        compute_advantages,
+        ref_model,
+        reward_actor,
+    ) = await asyncio.gather(
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, num_replicas=1),
+            DatasetActor,
+            path="openai/gsm8k",
+            config_name="main",
+            split="train",
+            streaming=True,
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1),
+            Policy,
+            config=PolicyConfig(
+                worker_params=WorkerConfig(model=model),
+                sampling_params=SamplingOverrides(
+                    num_samples=group_size, max_tokens=16
+                ),
+            ),
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1),
+            Trainer,
+            learning_rate=1e-5,
+            beta=0.1,
+            model_name=model,
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, num_replicas=1),
+            ReplayBuffer,
+            batch_size=4,
+            max_policy_age=1,
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, num_replicas=1),
+            ComputeAdvantages,
+            gamma=0.99,
+            lambda_=0.95,
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True),
+            TitanRefModel,
+            model=titan_model,
+        ),
+        spawn_service(
+            ServiceConfig(procs_per_replica=1, num_replicas=1),
+            RewardActor,
+            reward_functions=[MathReward(), ThinkingReward()],
         ),
-    )
-
-    trainer = await spawn_service(
-        default_service_cfg,
-        Trainer,
-        learning_rate=1e-5,
-        beta=0.1,
-        model_name=model,
-        device=torch.device("cuda:1"),
-    )
-
-    replay_buffer = await spawn_service(
-        default_service_cfg,
-        ReplayBuffer,
-        batch_size=4,
-        max_policy_age=1,
-    )
-
-    dataloader = await spawn_service(
-        default_service_cfg,
-        DatasetActor,
-        "openai/gsm8k",
-        "main",
-        split="train",
-        streaming=True,
-    )
-
-    compute_advantages = await spawn_service(
-        default_service_cfg,
-        ComputeAdvantages,
-        gamma=0.99,
-        lambda_=0.95,
-    )
-
-    ref_model = await spawn_service(
-        default_service_cfg,
-        RefModel,
-        model_name=model,
-        device=torch.device("cuda:2"),
-    )
-
-    reward_actor = await spawn_service(
-        default_service_cfg,
-        RewardActor,
-        reward_functions=[MathReward(), ThinkingReward()],
     )

     print("All services initialized successfully!")

     # ---- Core RL loops ---- #
     async def continuous_rollouts():
         rollout_count = 0
-        # TODO: Move this into setup
-        asyncio.create_task(policy.run_processing.call())
         while True:
             sample = await dataloader.__next__.choose()
             if sample is None:
@@ -432,9 +361,14 @@ async def continuous_rollouts():
                 target=target,
                 policy_version=version,
             )
-            actions = await policy.generate.choose(prompt)
+            responses = await policy.generate.choose(prompt)
+            actions = responses.outputs
             for action in actions:
-                ref_logprobs = await ref_model.forward.choose(action.token_ids)
+                request_tokens = responses.prompt_token_ids
+                response_tokens = action.token_ids
+                ref_logprobs = await ref_model.forward.choose(
+                    request=request_tokens, response=response_tokens
+                )
                 reward = await reward_actor.evaluate_response.choose(
                     prompt=prompt, response=action.text, target=target
                 )
@@ -489,6 +423,17 @@ async def continuous_training():
         print("Training interrupted by user")
         rollout_task.cancel()
         training_task.cancel()
+    finally:
+        print("Shutting down...")
+        await asyncio.gather(
+            shutdown_service(policy),
+            shutdown_service(trainer),
+            shutdown_service(replay_buffer),
+            shutdown_service(dataloader),
+            shutdown_service(compute_advantages),
+            shutdown_service(ref_model),
+            shutdown_service(reward_actor),
+        )


 if __name__ == "__main__":