
Commit ed225df

merge
2 parents: 061dbdc + 3303af5


41 files changed: +851 −565 lines

.github/packaging/pre_build_gpu.sh

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ echo "build dir is $BUILD_DIR"
 echo "wheel dir is $WHL_DIR"
 
 build_monarch() {
+  export MONARCH_PACKAGE_NAME="torchmonarch"
   # Get Rust build related pieces
   if ! command -v rustup &> /dev/null; then
     echo "getting rustup"

.github/workflows/build_vllm.yaml

Lines changed: 6 additions & 6 deletions
@@ -12,7 +12,7 @@ permissions:
 
 jobs:
   build:
-    name: forge-cu126-nightly
+    name: forge-cu129-nightly
     uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
     strategy:
       fail-fast: false
@@ -31,13 +31,13 @@ jobs:
           {
             "python_version": "3.10",
             "gpu_arch_type": "cpu",
-            "gpu_arch_version": "12.6",
-            "desired_cuda": "cu126",
-            "container_image": "pytorch/manylinux2_28-builder:cuda12.6",
+            "gpu_arch_version": "12.9",
+            "desired_cuda": "cu129",
+            "container_image": "pytorch/manylinux2_28-builder:cuda12.9",
             "package_type": "manywheel",
-            "build_name": "manywheel-py3_10-cuda12_6",
+            "build_name": "manywheel-py3_10-cuda12_9",
             "validation_runner": "linux.12xlarge.memory",
-            "installation": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126",
+            "installation": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129",
             "channel": "nightly",
             "upload_to_base_bucket": "no",
             "stable_version": "2.8.0",

.github/workflows/build_wheels.yaml

Lines changed: 6 additions & 6 deletions
@@ -12,7 +12,7 @@ permissions:
 
 jobs:
   build:
-    name: forge-cu126-nightly
+    name: forge-cu129-nightly
     uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
     strategy:
       fail-fast: false
@@ -31,13 +31,13 @@ jobs:
           {
             "python_version": "3.10",
             "gpu_arch_type": "cuda",
-            "gpu_arch_version": "12.6",
-            "desired_cuda": "cu126",
-            "container_image": "pytorch/manylinux2_28-builder:cuda12.6",
+            "gpu_arch_version": "12.9",
+            "desired_cuda": "cu129",
+            "container_image": "pytorch/manylinux2_28-builder:cuda12.9",
             "package_type": "manywheel",
-            "build_name": "manywheel-py3_10-cuda12_6",
+            "build_name": "manywheel-py3_10-cuda12_9",
             "validation_runner": "linux.4xlarge.nvidia.gpu",
-            "installation": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126",
+            "installation": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129",
             "channel": "nightly",
             "upload_to_base_bucket": "no",
             "stable_version": "2.8.0",

.github/workflows/docs.yml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ on:
 
 jobs:
   build-docs:
+    if: github.repository_owner == 'meta-pytorch'
     name: Build Documentation
     runs-on: linux.g5.4xlarge.nvidia.gpu
     timeout-minutes: 30

apps/grpo/qwen3_1_7b.yaml

Lines changed: 3 additions & 3 deletions
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-1.7B"
@@ -56,7 +56,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -85,7 +85,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
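
The rename from batch_size to local_batch_size makes explicit that the global value is a per-device batch size, and the ${...} references below pick it up; the same rename appears in the 32B and 8B configs that follow. A minimal sketch of how such references resolve, assuming an OmegaConf-style loader (the project's actual config machinery may differ):

    from omegaconf import OmegaConf

    # Assumes OmegaConf-style interpolation, as suggested by the ${...} syntax.
    cfg = OmegaConf.create(
        """
        local_batch_size: 16
        trainer:
          training:
            local_batch_size: ${local_batch_size}
        replay_buffer:
          batch_size: ${local_batch_size}
        """
    )
    assert cfg.trainer.training.local_batch_size == 16
    assert cfg.replay_buffer.batch_size == 16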

apps/grpo/qwen3_32b.yaml

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@
 
 # Global configuration
 group_size: 16
-batch_size: 32
+local_batch_size: 32 # per-device batch size
 max_req_tokens: 1024
 max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
@@ -59,7 +59,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -87,7 +87,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
   dp_size: 1

apps/grpo/qwen3_8b.yaml

Lines changed: 3 additions & 5 deletions
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-8B"
@@ -28,7 +28,6 @@ dataset:
 
 # Policy configuration
 policy:
-  use_vllm_builtin_load: true
   engine_config:
     model: ${model}
     tensor_parallel_size: 2
@@ -43,7 +42,6 @@ policy:
 # Trainer configuration
 trainer:
   use_dcp: true
-  use_vllm_builtin_load: true
   model:
     name: qwen3
     flavor: 8B
@@ -55,7 +53,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -84,7 +82,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # This should match the dp_size of TorchTitan
   # Here it's set explicitly to 2, because we've set

src/forge/actors/policy.py

Lines changed: 8 additions & 33 deletions
@@ -10,7 +10,6 @@
 import logging
 import os
 import sys
-import time
 from collections.abc import Mapping
 from copy import copy
 from dataclasses import asdict, dataclass, field, fields
@@ -140,7 +139,6 @@ def create_vllm_config(self) -> VllmConfig:
 class Policy(PolicyInterface):
     engine_config: EngineConfig | Mapping = field(default_factory=EngineConfig)
     sampling_config: SamplingConfig | Mapping = field(default_factory=SamplingConfig)
-    use_vllm_builtin_load: bool = True
     available_devices: str | None = None
     use_dcp: bool = True
     # Gets set up by setup
@@ -246,7 +244,7 @@ async def setup(self):
 
         self.request_id = 0
         self.policy_version = 0
-        self.requests: dict[str, tuple[None | ParentRequest, asyncio.Future]] = {}
+        self.requests: dict[str, tuple[ParentRequest | None, asyncio.Future]] = {}
 
         # TODO: Investigate whether this can be combined with `policy.running`
         # Whether this policy is accepting requests.
@@ -484,14 +482,11 @@ async def update_weights(self, policy_version: int):
         record_metric("policy/update_weights/count_weight_updates", 1, Reduce.SUM)
 
         logger.debug(f"Starting weight update on {self.__class__.__name__}")
-        if self.use_vllm_builtin_load:
-            await self.policy_worker.update.call(version=policy_version)
-        else:
-            await self.policy_worker.update_DEPRECATED.call(version=policy_version)
+        await self.policy_worker.update.call(version=policy_version)
         self.policy_version = policy_version
 
         # After updating the weights, we need to reset the KV cache
-        self.scheduler.kv_cache_manager.reset_prefix_cache()
+        self.scheduler.reset_prefix_cache()
 
         # Resume accepting requests and wake up any waiting generate() calls
         async with self.request_lock:
@@ -501,16 +496,8 @@ async def update_weights(self, policy_version: int):
         logger.info(f"Weight update completed (now v{self.policy_version})")
 
     @endpoint
-    async def update_weights_DEPRECATED(self, policy_version: int):  # noqa: N802
-        # TODO: If generating long sequences, this might be long and will block policy weight updates
-        curr_requests = [fut for _, fut in self.requests.values()]
-        if curr_requests:
-            logger.debug(f"Waiting for {len(curr_requests)} pending requests")
-            await asyncio.gather(*curr_requests)
-
-        await self.policy_worker.update_DEPRECATED.call(version=policy_version)
-        self.policy_version = policy_version
-        logger.info(f"Weight update completed (now v{self.policy_version})")
+    async def _reset_prefix_cache(self):
+        self.scheduler.reset_prefix_cache()
 
     @endpoint
     async def get_version(self) -> int:
@@ -550,6 +537,7 @@ def _to_completions(self, request_output: RequestOutput) -> list[Completion]:
                     token_ids=torch.tensor(output.token_ids),
                     logprobs=self._extract_logprobs(output),
                     generator_version=self.policy_version,
+                    metadata={"num_cached_tokens": request_output.num_cached_tokens},
                 )
             )
 
@@ -587,8 +575,8 @@ def __post_init__(self):
 
     @endpoint
     async def setup(self):
-        # TODO: remove ["gpus"] when monarch implements a flat rank
-        self.rank = current_rank()["gpus"]
+        self.rank = current_rank().rank
+        os.environ["RANK"] = str(self.rank)
         self.worker = self.setup_worker()
 
     @endpoint
@@ -631,19 +619,6 @@ async def _load_tensor_parallel_state_dict(
             current_tensor,
         )
 
-    @endpoint
-    async def update_DEPRECATED(self, version: int):  # noqa: N802
-        """Update model weights by reading state dict from torchstore.
-        Deprecated. This uses manual sharding logic which is buggy."""
-        key = f"{self.state_dict_key}{DELIM}{version}"
-        model = self.worker.model_runner.model
-        current_state_dict = model.state_dict()
-        start = time.perf_counter()
-        await self._load_tensor_parallel_state_dict(current_state_dict, version)
-        logger.info(
-            f"Loaded state dict from {key} in {time.perf_counter() - start} seconds"
-        )
-
     @endpoint
     async def update(self, version: int):
         """Update model weights by reading state dict from torchstore"""

src/forge/actors/trainer.py

Lines changed: 2 additions & 69 deletions
@@ -21,7 +21,6 @@
 from monarch.actor import current_rank, current_size, endpoint
 from torch import Tensor
 from torch.distributed.checkpoint._nested_dict import flatten_state_dict
-from torchstore.state_dict_utils import DELIM
 from torchtitan.config.job_config import (
     ActivationCheckpoint,
     Checkpoint,
@@ -114,8 +113,6 @@ class RLTrainer(ForgeActor):
     state_dict_key: str = "model_state_dict"
     use_dcp: bool = True
     dcp_path: str = "forge_dcp_tmp"
-    vllm_tp_DEPRECATED: int = 1  # noqa: N815
-    use_vllm_builtin_load: bool = True
 
     def __post_init__(self):
         """Initializes config types and env variables.
@@ -159,6 +156,8 @@ def __post_init__(self):
             "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
         }
         os.environ.update(env)
+        logger.info("Compiling loss")
+        self.loss = torch.compile(self.loss)
 
     @endpoint
     async def setup(self):
@@ -168,9 +167,7 @@ async def setup(self):
             "loss",
             "state_dict_key",
             "use_dcp",
-            "use_vllm_builtin_load",
             "dcp_path",
-            "vllm_tp_DEPRECATED",
         }:
             engine_config.pop(key)  # Not part of job config
         self.engine = ForgeEngine(ForgeJobConfig(**engine_config))
@@ -302,76 +299,12 @@ async def train_step(
         t.stop()
         return loss
 
-    @endpoint
-    async def push_weights_DEPRECATED(  # noqa: N802
-        self, policy_version: int, vllm_tp_DEPRECATED: int = 1
-    ) -> None:  # noqa: N802
-        """[Deprecated] This method pushes weights to torchstore in the vllm format,
-        which is buggy and not scalable to other models.
-        Deprecated in favor of push_weights."""
-        return await self._push_weights_DEPRECATED(policy_version, vllm_tp_DEPRECATED)
-
-    async def _push_weights_DEPRECATED(  # noqa: N802
-        self, policy_version: int, vllm_tp_DEPRECATED: int
-    ) -> None:  # noqa: N802
-        # Save to torchstore. Hacking in to the Checkpointer's prepped state-dict for now.
-        # TODO:
-        # 1. Checkpoint invokes state-dict flattening during dcp_save for [MODEL].
-        #    May need to replicate the same in this code path.
-        # 2. Unify CheckpointManager and TorchStore weights save control path.
-        if "model" not in self.engine.checkpointer.states:
-            raise RuntimeError("Model state not found in checkpointer state")
-
-        sd = self.engine.checkpointer.states["model"].state_dict()
-        flattened_state_dict, _ = flatten_state_dict(sd)
-
-        if self.engine.checkpointer.sd_adapter is None:
-            raise RuntimeError(
-                "Trying to save checkpoint in HF safetensors format, but sd_adapter is not provided."
-            )
-        hf_state_dict = self.engine.checkpointer.sd_adapter.to_hf(flattened_state_dict)
-
-        # TODO: Figure out how to gracefully handle which model to-vLLM conversion is needed
-        vllm_ready_hf_sd = _qwen3_hf_to_vllm(
-            sd=hf_state_dict,
-            num_layers=self.engine.model_args.n_layers,
-            vllm_tp=vllm_tp_DEPRECATED,
-        )
-
-        key = f"{self.state_dict_key}{DELIM}{policy_version}"
-        if self.use_dcp:
-            # TODO - DCP should probably be being saved to NFS explicitly?
-            # Right now it will only save everything locally
-            storage_writer = torch.distributed.checkpoint.FileSystemWriter(
-                key, single_file_per_rank=False, thread_count=8
-            )
-            metadata = dcp.save(
-                storage_writer=storage_writer, state_dict=vllm_ready_hf_sd
-            )
-            await ts.put(key, metadata)
-
-            # Delete old weight versions if they exist
-            if self.rank == 0:
-                cleanup_old_weight_versions(
-                    state_dict_key=self.state_dict_key,
-                    delim=DELIM,
-                    current_policy_version=policy_version,
-                )
-        else:
-            await ts.put_state_dict(vllm_ready_hf_sd, key)
-
     @endpoint
     async def push_weights(self, policy_version: int) -> None:
         """Push weights to torchstore in HF format."""
         t = Tracer("rl_trainer_perf/push_weights", timer="gpu", track_memory=True)
         t.start()
         logger.info(f"Pushing weights for policy version {policy_version}")
-        if not self.use_vllm_builtin_load:
-            result = await self._push_weights_DEPRECATED(
-                policy_version, self.vllm_tp_DEPRECATED
-            )
-            t.step("push_weights_DEPRECATED")
-            return result
 
         start_time = time.perf_counter()
         if "model" not in self.engine.checkpointer.states:
