Merge branch 'main' of https://github.com/pytorch-labs/forge into metric_logging

Felipe Mello · Felipe Mello · commit 11ea544113a0 · 2025-09-23T09:37:38.000-07:00
diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml
@@ -32,6 +32,10 @@ jobs:
           eval "$(ssh-agent -s)"
           ssh-add - <<< '${{ secrets.FORGE_GITHUB_CI_FOR_TORCHSTORE }}'
           python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git
+      - name: Install torchtitan
+        run: |
+          pip install --pre torchtitan==0.1.0.dev20250826+cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install tyro
       - name: Install dependencies
         run: python -m pip install --no-build-isolation -e ".[dev]"
       - name: Run unit tests with coverage
diff --git a/.gitignore b/.gitignore
@@ -193,3 +193,6 @@ cover/
 wandb/
 
 assets/wheels/vllm*.whl
+
+# DCP artifacts
+model_state_dict/
diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
@@ -0,0 +1,131 @@
+# Group Relative Policy Optimization (GRPO)
+# >>> python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml
+
+# Global configuration
+group_size: 8
+batch_size: 16
+max_req_tokens: 512
+max_res_tokens: 512
+model: "Qwen/Qwen3-8B"
+off_by_n: 1 # Off by one by default
+
+# Dataset configuration
+dataset:
+  path: "openai/gsm8k"
+  revision: "main"
+  data_split: "train"
+  streaming: true
+  model: ${model}
+
+# Policy configuration
+policy:
+  engine_config:
+    model: ${model}
+    tensor_parallel_size: 2
+    pipeline_parallel_size: 1
+    enforce_eager: false
+  sampling_config:
+    n: ${group_size}
+    max_tokens: ${max_res_tokens}
+    temperature: 1.0
+    top_p: 1.0
+
+# Trainer configuration
+trainer:
+  model:
+    name: qwen3
+    flavor: 8B
+    hf_assets_path: hf://${model}
+  optimizer:
+    name: AdamW
+    lr: 1e-5
+    eps: 1e-8
+  lr_scheduler:
+    warmup_steps: 1
+  training:
+    local_batch_size: ${batch_size}
+    seq_len: 2048
+    max_norm: 1.0
+    steps: 1000000
+    dtype: bfloat16
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: -1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+    disable_loss_parallel: true
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+    last_save_in_hf: true
+    interval: 500
+    async_mode: "disabled"
+  activation_checkpoint:
+    mode: selective
+    selective_ac_option: op
+
+# Replay buffer configuration
+replay_buffer:
+  batch_size: ${batch_size}
+  max_policy_age: ${off_by_n}
+  # This should match the dp_size of TorchTitan
+  # Here it's set explicitly to 2, because we've set
+  # 2 GPUs for the trainer and we're using full FSDP.
+  dp_size: 2
+
+# Reference model configuration
+ref_model:
+  model:
+    name: qwen3
+    flavor: 8B
+    hf_assets_path: hf://${model}
+  training:
+    dtype: bfloat16
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 1
+  checkpoint:
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+
+# All resource allocations
+services:
+  dataset:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
+  policy:
+    procs: ${policy.engine_config.tensor_parallel_size}
+    num_replicas: 1
+    with_gpus: true
+  trainer:
+    procs: 2
+    num_replicas: 1
+    with_gpus: true
+  replay_buffer:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
+  ref_model:
+    procs: 1
+    num_replicas: 1
+    with_gpus: true
+  compute_advantages:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
+  reward_actor:
+    procs: 1
+    num_replicas: 1
+    with_gpus: false
diff --git a/src/forge/actors/trainer.py b/src/forge/actors/trainer.py
@@ -7,6 +7,7 @@
 import logging
 import math
 import os
+import shutil
 import time
 from collections.abc import Mapping
 from dataclasses import dataclass, field, fields
@@ -39,7 +40,46 @@
 from forge.data.utils import batch_to_device
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+logger.setLevel(logging.DEBUG)
+
+
+def cleanup_old_weight_versions(
+    state_dict_key: str,
+    delim: str,
+    current_policy_version: int,
+) -> None:
+    """Delete old weight versions, keeping only the current and previous ones.
+
+    Best-effort: a directory that fails to delete is logged and skipped.
+    TODO - issues/194: provide a more robust way to handle eviction.
+
+    Args:
+        state_dict_key: The base key for state dict storage
+        delim: The delimiter used between key and version
+        current_policy_version: The current policy version to keep
+    """
+    if current_policy_version <= 1:
+        return  # No cleanup needed for versions 0 or 1
+
+    prefix = f"{state_dict_key}{delim}"
+    base = os.path.basename(prefix)
+    keep = {f"{base}{current_policy_version}", f"{base}{current_policy_version - 1}"}
+
+    # Find all weight directories that share the versioned-key prefix
+    parent_dir = os.path.dirname(prefix) or "."
+    if not os.path.exists(parent_dir):
+        return
+    for item in os.listdir(parent_dir):
+        item_path = os.path.join(parent_dir, item)
+        if not item.startswith(base) or item in keep or not os.path.isdir(item_path):
+            continue
+        try:
+            # No ignore_errors: a failed delete must reach the except branch
+            # instead of being logged as a successful removal.
+            shutil.rmtree(item_path)
+            logger.debug(f"Removed old weights at {item_path}")
+        except OSError as e:
+            logger.debug(f"Error deleting {item_path}: {e}")
 
 
 @dataclass
@@ -67,6 +107,7 @@ def __post_init__(self):
         in monarch for now.
 
         """
+        super().__init__()
         # Instantiate dict fields
         for f in fields(self):
             attr = getattr(self, f.name)
@@ -223,13 +264,26 @@ async def push_weights(self, policy_version: int) -> None:
             )
         hf_state_dict = self.engine.checkpointer.sd_adapter.to_hf(flattened_state_dict)
         # TODO: Figure out how to gracefully handle which model to-vLLM conversion is needed
-        vllm_ready_hf_sd = _qwen3_hf_to_vllm(sd=hf_state_dict, num_layers=28)
+        vllm_ready_hf_sd = _qwen3_hf_to_vllm(
+            sd=hf_state_dict, num_layers=self.engine.model_args.n_layers
+        )
 
         key = f"{self.state_dict_key}{DELIM}{policy_version}"
         start_time = time.time()
         if self.use_dcp:
+
+            # TODO - DCP should probably save to NFS explicitly.
+            # Right now it only saves everything locally.
             metadata = dcp.save(checkpoint_id=key, state_dict=vllm_ready_hf_sd)
             await ts.put(key, metadata)
+
+            # Delete old weight versions if they exist
+            if self.rank == 0:
+                cleanup_old_weight_versions(
+                    state_dict_key=self.state_dict_key,
+                    delim=DELIM,
+                    current_policy_version=policy_version,
+                )
         else:
             await ts.put_state_dict(vllm_ready_hf_sd, key)
         end_time = time.time()
diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py
@@ -202,15 +202,18 @@ async def get_proc_mesh(
                 # We can't currently do this because HostMesh only supports single
                 # proc_mesh creation at the moment. This will be possible once
                 # we have "proper HostMesh support".
-                def bootstrap(gpu_ids: int):
+                def bootstrap(gpu_ids: list[str]):
                     # This works for single host, needed for vLLM currently.
                     import os
 
                     os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids)
                     os.environ["MASTER_ADDR"] = socket.gethostname()
                     # Multiple actors trying to call _get_port doesn't work
                     # os.environ["MASTER_PORT"] = _get_port()
-                    os.environ["MASTER_PORT"] = "12345"
+
+                    # Setting the last digit to the first GPU id allows us to, e.g.,
+                    # create multiple vLLM instances on the same local host.
+                    os.environ["MASTER_PORT"] = f"1234{gpu_ids[0]}"
                     os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600"
                     os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824"
 
diff --git a/tests/unit_tests/test_reference_actor.py b/tests/unit_tests/test_reference_actor.py
diff --git a/tests/unit_tests/test_trainer.py b/tests/unit_tests/test_trainer.py