From 6aebb0989df46010bbafddbff23a47d5cdd9b2fc Mon Sep 17 00:00:00 2001 From: rithesh Date: Mon, 29 Sep 2025 15:03:03 -0700 Subject: [PATCH 01/17] inital changes --- apps/grpo/__init__.py | 5 + apps/mast/__init__.py | 0 apps/mast/env_setup.sh | 309 +++++++++++++++++++ apps/mast/main.py | 38 +++ apps/mast/qwen3_14b_mast.yaml | 153 ++++++++++ apps/mast/qwen3_1_7b_mast.yaml | 129 ++++++++ apps/mast/qwen3_32b_mast.yaml | 153 ++++++++++ apps/mast/qwen3_4b_mast.yaml | 152 ++++++++++ apps/mast/qwen3_8b_mast.yaml | 152 ++++++++++ src/forge/actors/policy.py | 35 +-- src/forge/controller/actor.py | 8 +- src/forge/controller/launcher/__init__.py | 5 + src/forge/controller/launcher/mast.py | 350 ++++++++++++++++++++++ src/forge/controller/provisioner.py | 102 ++++++- src/forge/controller/service/replica.py | 5 +- src/forge/types.py | 10 + 16 files changed, 1574 insertions(+), 32 deletions(-) create mode 100644 apps/grpo/__init__.py create mode 100644 apps/mast/__init__.py create mode 100755 apps/mast/env_setup.sh create mode 100644 apps/mast/main.py create mode 100644 apps/mast/qwen3_14b_mast.yaml create mode 100644 apps/mast/qwen3_1_7b_mast.yaml create mode 100644 apps/mast/qwen3_32b_mast.yaml create mode 100644 apps/mast/qwen3_4b_mast.yaml create mode 100644 apps/mast/qwen3_8b_mast.yaml create mode 100644 src/forge/controller/launcher/__init__.py create mode 100644 src/forge/controller/launcher/mast.py diff --git a/apps/grpo/__init__.py b/apps/grpo/__init__.py new file mode 100644 index 000000000..2e41cd717 --- /dev/null +++ b/apps/grpo/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/apps/mast/__init__.py b/apps/mast/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh new file mode 100755 index 000000000..6728c87db --- /dev/null +++ b/apps/mast/env_setup.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# setup_forge_env.sh - Setup conda environment and install forge with mounting +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to mount a single workspace to /mnt/wsfuse +mount_workspace() { + local workspace_url="$1" + local mount_dir="/mnt/wsfuse" + + if [ -z "$workspace_url" ]; then + log_error "No workspace URL provided for mounting" + return 1 + fi + + log_info "Setting up mount directory: $mount_dir" + + # Create the directory if it doesn't exist + if [ ! -d "$mount_dir" ]; then + log_info "Creating mount directory: $mount_dir" + sudo mkdir -p "$mount_dir" || { + log_error "Failed to create mount directory (may need sudo privileges)" + return 1 + } + fi + + # Check if the directory is already mounted + if mountpoint -q "$mount_dir" 2>/dev/null; then + log_warn "Directory $mount_dir is already mounted, skipping mount" + return 0 + fi + + # Check if oilfs command exists + if ! 
command -v oilfs >/dev/null 2>&1; then + log_error "oilfs command not found. Please ensure it's installed and in PATH" + return 1 + fi + + log_info "Mounting workspace $workspace_url to $mount_dir" + + # Store original LD_LIBRARY_PATH to restore after mounting (similar to Python code) + original_ld_library_path="${LD_LIBRARY_PATH:-}" + + # Temporarily unset LD_LIBRARY_PATH for mounting + unset LD_LIBRARY_PATH + + # Mount the workspace + if oilfs "$workspace_url" "$mount_dir"; then + log_info "Successfully mounted $workspace_url to $mount_dir" + else + log_error "Failed to mount $workspace_url to $mount_dir" + # Restore original LD_LIBRARY_PATH + if [ -n "$original_ld_library_path" ]; then + export LD_LIBRARY_PATH="$original_ld_library_path" + fi + return 1 + fi + + # Restore original LD_LIBRARY_PATH + if [ -n "$original_ld_library_path" ]; then + export LD_LIBRARY_PATH="$original_ld_library_path" + fi + + # Verify mount was successful + if [ -d "$mount_dir/huggingface_models" ]; then + log_info "Mount verification successful - found expected directory structure" + else + log_warn "Mount verification: Expected directory structure not found, but mount appears successful" + fi + + return 0 +} + +# Function to safely deactivate conda +safe_conda_deactivate() { + if command -v conda >/dev/null 2>&1; then + if conda info --envs >/dev/null 2>&1; then + conda deactivate 2>/dev/null || log_warn "Could not deactivate conda (might not be in an environment)" + else + log_warn "Conda not properly initialized, skipping deactivate" + fi + else + log_warn "Conda command not found, skipping deactivate" + fi +} + +# Function to safely activate conda environment +safe_conda_activate() { + local env_name="$1" + + if command -v conda >/dev/null 2>&1; then + if conda info --envs >/dev/null 2>&1; then + conda activate "$env_name" + else + log_warn "Conda not properly initialized" + log_info "Attempting to use xl_conda.sh activation instead..." + source "$CONDA_SCRIPT_PATH" activate "$env_name" + fi + else + log_warn "Conda command not found" + log_info "Attempting to use xl_conda.sh activation instead..." + source "$CONDA_SCRIPT_PATH" activate "$env_name" + fi +} + +# Check if required environment variables are set +if [ -z "$USER" ]; then + log_error "USER environment variable is not set" + exit 1 +fi + +# Define paths +FBSOURCE_PATH="/data/users/$USER/fbsource" +CONDA_SCRIPT_PATH="$FBSOURCE_PATH/genai/xlformers/dev/xl_conda.sh" +FORGE_BASE_DIR="/data/users/$USER" +FORGE_REPO_DIR="$FORGE_BASE_DIR/forge" +MONARCH_DIR="$HOME/monarch_no_torch_latest" + +# Workspace URL for mounting +WORKSPACE_URL="ws://ws.ai.pci0ai/genai_fair_llm" + +log_info "Starting forge environment setup for user: $USER" + +# Step 1: Mount workspace (do this early in case other steps need the mounted files) +log_info "Step 1: Mounting workspace..." +mount_workspace "$WORKSPACE_URL" +if [ $? -ne 0 ]; then + log_warn "Failed to mount workspace, continuing with setup..." + log_warn "Some functionality may not be available without the mounted workspace" +fi + +# Step 2: Check if conda script exists and source it +log_info "Step 2: Activating conda environment..." +if [ ! -f "$CONDA_SCRIPT_PATH" ]; then + log_error "Conda script not found at: $CONDA_SCRIPT_PATH" + log_error "Please ensure fbsource is properly set up" + exit 1 +fi + +log_info "Sourcing conda script: $CONDA_SCRIPT_PATH" +source "$CONDA_SCRIPT_PATH" activate forge:8448524 + +if [ $? 
-ne 0 ]; then + log_error "Failed to activate conda environment forge-8448524" + exit 1 +fi + +log_info "Conda environment activated successfully" + +# Step 3: Create and navigate to forge base directory +log_info "Step 3: Setting up forge directory..." +if [ ! -d "$FORGE_BASE_DIR" ]; then + log_info "Creating forge base directory: $FORGE_BASE_DIR" + mkdir -p "$FORGE_BASE_DIR" +fi + +cd "$FORGE_BASE_DIR" +log_info "Changed to directory: $(pwd)" + +# Step 4: Clone or update forge repository +log_info "Step 4: Setting up forge git repository..." +if [ -d "$FORGE_REPO_DIR" ]; then + log_warn "Forge repository already exists at: $FORGE_REPO_DIR" + cd "$FORGE_REPO_DIR" + + if [ -d ".git" ]; then + log_info "Updating existing repository..." + git fetch origin + if [ $? -eq 0 ]; then + log_info "Repository updated successfully" + else + log_warn "Failed to fetch updates, continuing with existing code" + fi + else + log_error "Directory exists but is not a git repository" + log_info "Removing directory and cloning fresh..." + cd "$FORGE_BASE_DIR" + rm -rf "$FORGE_REPO_DIR" + git clone git@github.com:meta-pytorch/forge.git + if [ $? -ne 0 ]; then + log_error "Failed to clone forge repository" + exit 1 + fi + cd "$FORGE_REPO_DIR" + fi +else + log_info "Cloning forge repository..." + git clone git@github.com:meta-pytorch/forge.git + if [ $? -ne 0 ]; then + log_error "Failed to clone forge repository" + log_error "Please ensure:" + log_error "1. You have SSH access to github.com" + log_error "2. Your SSH key is added to GitHub" + log_error "3. You have access to meta-pytorch/forge repository" + exit 1 + fi + cd "$FORGE_REPO_DIR" +fi + +log_info "Current directory: $(pwd)" + +# Step 5: Install forge package +log_info "Step 5: Installing forge package..." +pip install --no-deps --force-reinstall . +if [ $? -ne 0 ]; then + log_error "Failed to install forge package" + exit 1 +fi +log_info "Forge package installed successfully" + +# Step 6: Navigate to monarch directory +log_info "Step 6: Setting up monarch directory..." +if [ ! -d "$MONARCH_DIR" ]; then + log_info "Creating monarch directory: $MONARCH_DIR" + mkdir -p "$MONARCH_DIR" +fi + +cd "$MONARCH_DIR" +log_info "Changed to directory: $(pwd)" + +# Step 7: Fetch monarch package +log_info "Step 7: Fetching monarch package..." +# TODO: Remove hardcodedm version +fbpkg fetch monarch_no_torch:23 +if [ $? -ne 0 ]; then + log_error "Failed to fetch monarch_no_torch:23" + log_error "Please ensure fbpkg is properly configured" + exit 1 +fi +log_info "Monarch package fetched successfully" + +# Step 8: Install monarch wheel +log_info "Step 8: Installing monarch wheel..." +WHEEL_FILE="monarch-0.0.0-py3.10-none-any.whl" +if [ ! -f "$WHEEL_FILE" ]; then + log_error "Wheel file not found: $WHEEL_FILE" + log_error "Available files in directory:" + ls -la *.whl 2>/dev/null || log_error "No wheel files found" + exit 1 +fi + +pip install --force-reinstall "$WHEEL_FILE" +if [ $? -ne 0 ]; then + log_error "Failed to install monarch wheel" + exit 1 +fi +log_info "Monarch wheel installed successfully" + +log_info "Environment activation completed" + +# Final verification +log_info "Setup completed successfully!" 
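+
+# NOTE: the checks below are informational only; failures are logged as
+# warnings and do not abort the setup.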
+ +# Check mount status +if mountpoint -q "/mnt/wsfuse" 2>/dev/null; then + log_info "Workspace mount: ✓ Active at /mnt/wsfuse" +else + log_warn "Workspace mount: ✗ Not mounted" +fi + +# Check current environment +if command -v conda >/dev/null 2>&1 && conda info --envs >/dev/null 2>&1; then + CURRENT_ENV=$(conda info --show-active-prefix 2>/dev/null | sed 's/.*\///' || echo "unknown") + log_info "Current conda environment: $CURRENT_ENV" +else + log_info "Current environment: Using xl_conda.sh managed environment" +fi + +log_info "Current directory: $(pwd)" +log_info "Python location: $(which python)" + +# Show installed packages +log_info "Key installed packages:" +pip list | grep -E "(forge|monarch)" || log_warn "No forge/monarch packages found in pip list" + +log_info "Environment setup complete! You can now run your scripts." +log_info "Mounted workspace available at: /mnt/wsfuse" + +# Step 9: Ask user to deactivate and activate conda env conda environment +echo "" +log_info "Installation completed successfully!" +echo "" +log_info "Re-activate the conda environment to make the changes take effect:" +log_info "conda deactivate && conda activate forge-8448524" diff --git a/apps/mast/main.py b/apps/mast/main.py new file mode 100644 index 000000000..fd1819ed2 --- /dev/null +++ b/apps/mast/main.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import asyncio +import getpass + +from apps.grpo.main import main as grpo_main +from forge.cli.config import parse +from forge.controller.provisioner import init_provisioner, JOB_NAME_KEY, SCHEDULER_KEY + +from forge.types import Scheduler +from omegaconf import DictConfig + + +async def main(cfg: DictConfig): + """Main module for launching mast jobs for GRPO training.""" + if cfg.get(SCHEDULER_KEY, Scheduler.MAST.value) != Scheduler.MAST.value: + raise ValueError("Schuduler must be MAST.") + + if cfg.get(JOB_NAME_KEY, None) is not None: + # prepend user name to the job to avoid name collision + cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" + + # init mast provisioner + await init_provisioner(cfg) + await grpo_main(cfg) + + +if __name__ == "__main__": + + @parse + def _main(cfg): + asyncio.run(main(cfg)) + + _main() # @parse grabs the cfg from CLI diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml new file mode 100644 index 000000000..2429077fc --- /dev/null +++ b/apps/mast/qwen3_14b_mast.yaml @@ -0,0 +1,153 @@ + +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-14B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-14B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. 
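+    # (Disabling the custom all-reduce makes vLLM fall back to NCCL for
+    # tensor-parallel all-reduce.)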
+ disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 14B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 14B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 14 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 14 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml new file mode 100644 index 000000000..21a58df0b --- /dev/null +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -0,0 +1,129 @@ +# Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-1.7B" +off_by_n: 1 # Off by one by default + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + 
streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: ${model} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + enforce_eager: false + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 1.7B + hf_assets_path: hf://${model} + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: false + initial_load_path: hf://${model} + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + activation_checkpoint: + mode: selective + selective_ac_option: op + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 1.7B + hf_assets_path: hf://${model} + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + enable: true + initial_load_path: hf://${model} + initial_load_in_hf: true + +# All resource allocations +services: + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + ref_model: + procs: 2 + num_replicas: 1 + with_gpus: true + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + +actors: + dataset: + procs: 1 + with_gpus: false + trainer: + procs: 1 + with_gpus: true + replay_buffer: + procs: 1 + with_gpus: false + compute_advantages: + procs: 1 + with_gpus: false diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml new file mode 100644 index 000000000..3c77adfa3 --- /dev/null +++ b/apps/mast/qwen3_32b_mast.yaml @@ -0,0 +1,153 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-32B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-32B +checkpoint_folder: /mnt/wsfuse/$user$/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. 
+ disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 32B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 32B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + # procs: ${trainer.parallelism.data_parallel_shard_degree} + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml new file mode 100644 index 000000000..1690494e8 --- /dev/null +++ b/apps/mast/qwen3_4b_mast.yaml @@ -0,0 +1,152 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-4B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-4B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset 
configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. + disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 4B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 4B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml new file mode 100644 index 000000000..d9ed947ff 
--- /dev/null +++ b/apps/mast/qwen3_8b_mast.yaml @@ -0,0 +1,152 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-8B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-8B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. + disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0= + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 8B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 8B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/src/forge/actors/policy.py b/src/forge/actors/policy.py index 464674f2c..00d788bf4 100644 --- a/src/forge/actors/policy.py +++ b/src/forge/actors/policy.py @@ -19,6 +19,23 @@ import torch.distributed.checkpoint as dcp import torchstore as ts +from forge.actors._torchstore_utils import ( + DcpHandle, + extract_param_name, + get_dcp_whole_state_dict_key, + get_param_key, + get_param_prefix, + load_tensor_from_dcp, +) + +from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh +from forge.data.sharding import VLLMSharding +from forge.data_models.completion import Completion +from forge.data_models.prompt import to_prompt +from forge.interfaces import Policy as PolicyInterface +from forge.observability.metrics import record_metric, Reduce +from forge.observability.perf_tracker import Tracer +from forge.types import ProcessConfig from monarch.actor import current_rank, endpoint, ProcMesh from torchstore.state_dict_utils import DELIM from vllm.config import VllmConfig @@ -43,23 +60,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm.worker.worker_base import WorkerWrapperBase -from forge.actors._torchstore_utils import ( - extract_param_name, - get_dcp_whole_state_dict_key, - get_param_key, - get_param_prefix, - load_tensor_from_dcp, -) - -from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh -from forge.data.sharding import VLLMSharding -from forge.data_models.completion import Completion -from forge.data_models.prompt import to_prompt -from forge.interfaces import Policy as PolicyInterface -from forge.observability.metrics import record_metric, Reduce -from forge.observability.perf_tracker import Tracer -from forge.types import ProcessConfig - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -173,6 +173,7 @@ async def launch( # pyright: 
ignore[reportIncompatibleMethodOverride] procs=cls.procs, hosts=cls.hosts, with_gpus=cls.with_gpus, + mesh_name=cls.mesh_name, ) worker_procs = await get_proc_mesh(process_config=process_config) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index f9790ffd3..2c1cf3655 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -10,12 +10,12 @@ import sys from typing import Any, Type, TypeVar -from monarch.actor import Actor, current_rank, current_size, endpoint - from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh from forge.types import ProcessConfig, ServiceConfig +from monarch.actor import Actor, current_rank, current_size, endpoint + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) T = TypeVar("T", bound="ForgeActor") @@ -58,6 +58,7 @@ def options( hosts: int | None = None, with_gpus: bool = False, num_replicas: int = 1, + mesh_name: str | None = None, **kwargs, ) -> Type[T]: """ @@ -91,6 +92,7 @@ def options( "hosts": hosts, "with_gpus": with_gpus, "num_replicas": num_replicas, + "mesh_name": mesh_name, "_extra_config": kwargs, } @@ -116,6 +118,7 @@ async def as_service( "hosts": cls.hosts, "with_gpus": cls.with_gpus, "num_replicas": cls.num_replicas, + "mesh_name": cls.mesh_name, **cls._extra_config, # all extra fields } cfg = ServiceConfig(**cfg_kwargs) @@ -181,6 +184,7 @@ async def launch(cls, *args, **kwargs) -> "ForgeActor": procs=cls.procs, hosts=cls.hosts, with_gpus=cls.with_gpus, + mesh_name=cls.mesh_name, ) proc_mesh = await get_proc_mesh(process_config=cfg) diff --git a/src/forge/controller/launcher/__init__.py b/src/forge/controller/launcher/__init__.py new file mode 100644 index 000000000..2e41cd717 --- /dev/null +++ b/src/forge/controller/launcher/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py new file mode 100644 index 000000000..8cc3ac323 --- /dev/null +++ b/src/forge/controller/launcher/mast.py @@ -0,0 +1,350 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
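+#
+# MAST launcher for Forge. MastProvisioner launches (or reuses) a MAST job with
+# one task group per GPU mesh, allocates HostMeshes on those task groups via
+# MastAllocator, and mounts the shared oilfs workspace on each remote host
+# (via MastSetupActor) before spawning worker processes.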
+ +import asyncio +import functools +import getpass +import logging +import os +import socket +import subprocess +import uuid +from typing import Optional + +import torchx.specs as specs +from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints + +try: + from monarch._src.actor.actor_mesh import current_rank + from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig + from monarch._src.actor.shape import NDSlice, Shape + from monarch.tools.components.meta import hyperactor + from torchx.specs import AppState + from torchx.specs.fb.component_helpers import Packages +except ImportError as e: + print(f"Warning: Monarch imports failed: {e}") + print("Monarch functionality will be limited") +from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY +from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host +from monarch.tools import commands +from monarch.tools.commands import info +from monarch.tools.config import Config, Workspace +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +SCHEDULER_NAME = "mast_conda" +SKU = "gtt_any" +TIMEOUT_SEC = 1 * 60 * 60 # Kill the job if idle for 1 hour + +USER = getpass.getuser() +WORK_DIR = f"/data/users/{USER}" # on DEVGPU +EDITABLE_WORKSPACES = ["forge"] +REMOTE_WORK_DIR = "/packages/monarch_default_workspace/workspace/" + +EDITABLE_WORKSPACE_PATHS = [ + f"{WORK_DIR}/{workspace}" for workspace in EDITABLE_WORKSPACES +] + + +def _get_port() -> str: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + addr = s.getsockname() + port = addr[1] + return str(port) + + +class MastSetupActor(Actor): + @endpoint + def get_info(self) -> [str, str]: + return socket.gethostname(), _get_port() + + @endpoint + def mount(self, mount_dst: str): + point = current_rank() + # The last dimension is the local proc count. + last_label = point.extent.labels[-1] + proc_count = point.size(last_label) + if current_rank().rank % proc_count != 0: + # Only use one rank per host to mount the directory + return + self.mount_mnt_directory(mount_dst) + + def mount_mnt_directory(self, mount_dst: str) -> None: + # Sanity check of the mounted directory + sanity_path = os.path.join(mount_dst, "huggingface_models/") + if os.path.exists(sanity_path): + print(f"Found directory {sanity_path}; skip mounting.") + return + + # Otherwise, mount the directory + if not os.path.exists(mount_dst): + os.makedirs(mount_dst, exist_ok=True) + + # Store original LD_LIBRARY_PATH to restore after mounting + original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + + try: + clean_env = os.environ.copy() + if "LD_LIBRARY_PATH" in clean_env: + del clean_env["LD_LIBRARY_PATH"] + + subprocess.run( + [ + "/packages/oil.oilfs/oilfs-wrapper", + "ws://ws.ai.pci0ai/genai_fair_llm", + mount_dst, + ], + capture_output=True, + text=True, + check=True, + env=clean_env, + ) + print("Done mounting") + except subprocess.CalledProcessError as e: + print( + f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" + ) + finally: + # Restore original LD_LIBRARY_PATH + if original_ld_library_path: + os.environ["LD_LIBRARY_PATH"] = original_ld_library_path + elif "LD_LIBRARY_PATH" in os.environ: + del os.environ["LD_LIBRARY_PATH"] + + assert os.path.exists( + sanity_path + ), f"Did not find directory {sanity_path}; something wrong with mounting." 
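+
+    # NOTE: MastSetupActor is spawned once per proc mesh in
+    # MastProvisioner.get_proc_mesh below; mount() runs on a single rank per
+    # host, so the oilfs workspace is mounted exactly once per machine.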
+ + +class MastProvisioner(BaseProvisioner): + def __init__(self, cfg: DictConfig | None = None): + self._server_names = [] + self._proc_server_map = {} + self._lock = asyncio.Lock() + self._this_host_id = uuid.uuid1() + available_local_devices = None + cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if cuda_visible_devices is not None and cuda_visible_devices.strip(): + try: + available_local_devices = set( + int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip() + ) + except ValueError as e: + raise ValueError( + f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. " + f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}" + ) from e + self._host_gpu_map = { + self._this_host_id: GpuManager(available_local_devices), + } + assert cfg is not None + self.cfg = cfg + job_name = cfg.get(JOB_NAME_KEY, None) + self.job_name = job_name or self.create_job_name() + + async def initialize(self): + """Call this after creating the instance""" + await self.launch_mast_job() + + async def get_mast_allocator( + self, + job_name: str, + task_group: str, + ): + allocator = MastAllocator( + MastAllocatorConfig( + job_name=job_name, + remote_allocator_port=26600, # This is the default monarch port + ), + ) + alloc_constraints = AllocConstraints( + {MastAllocator.ALLOC_LABEL_TASK_GROUP: task_group} + ) + + return allocator, alloc_constraints + + async def create_host_mesh(self, name: str, num_hosts: int): + """Creates a remote server and a HostMesh on it.""" + logger.debug(f"Creating remote server for mesh: {name}") + server_name = f"{SCHEDULER_NAME}:///{self.job_name}" + alloc, alloc_constraints = await self.get_mast_allocator( + task_group=name, job_name=self.job_name + ) + return ( + HostMesh( + shape=Shape(["hosts"], NDSlice.new_row_major([num_hosts])), + allocator=alloc, + alloc_constraints=alloc_constraints, + ), + server_name, + ) + + async def get_proc_mesh( + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: int | None = None, + mesh_name: Optional[str] = None, + ): + """Gets a proc mesh. + + num_hosts = None implies that you want a local allocation, this may change. + + """ + async with self._lock: + server_name = None + if num_hosts is not None and num_hosts > 0: + assert mesh_name is not None + host_mesh, server_name = await self.create_host_mesh( + name=mesh_name, + num_hosts=num_hosts, + ) + host_id = uuid.uuid1() + gpu_manager = GpuManager() + self._host_gpu_map[host_id] = gpu_manager + else: + host_mesh = this_host() + gpu_manager = self._host_gpu_map[self._this_host_id] + host_mesh._host_id = self._this_host_id + + if with_gpus: + + def bootstrap(gpu_ids: list[str]): + # This works for single host, needed for vLLM currently. 
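+                # Pin the spawned procs to their allocated GPUs and set the
+                # torch.distributed rendezvous address/port (derived from the
+                # first GPU id) plus hyperactor timeout/frame-size limits.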
+ import os + + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids) + os.environ["MASTER_ADDR"] = socket.gethostname() + os.environ["MASTER_PORT"] = f"1234{gpu_ids[0]}" + os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" + os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" + + gpu_ids = gpu_manager.get_gpus(num_procs) + procs = host_mesh.spawn_procs( + per_host={"gpus": num_procs}, + bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), + ) + await procs.initialized + setup = await procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + hostname, port = await setup.get_info.choose() + await setup.mount.call(mount_dst="/mnt/wsfuse") + procs._hostname = hostname + procs._port = port + procs._gpu_ids = gpu_ids + else: + procs = host_mesh.spawn_procs(per_host={"gpus": num_procs}) + + procs._host = host_mesh + + # If we created a server, track so we can tear it down later. + if server_name: + self._server_names.append(server_name) + self._proc_server_map[procs] = server_name + + return procs + + async def stop_proc_mesh(self, proc_mesh: ProcMesh): + """Stops a proc mesh.""" + async with self._lock: + if hasattr(proc_mesh, "_gpu_ids"): + gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] + gpu_manager.release_gpus(proc_mesh._gpu_ids) + await proc_mesh.stop() + if proc_mesh in self._proc_server_map: + server_name = self._proc_server_map[proc_mesh] + commands.kill(server_name) + + async def shutdown(self): + """Tears down all remaining remote allocations.""" + async with self._lock: + for server_name in self._server_names: + commands.kill(server_name) + + async def launch_mast_job(self): + handle = self.create_server_handle() + server_spec = info(handle) + if server_spec and server_spec.state == AppState.RUNNING: + print(f"Job {self.job_name} is already running. 
Skipping launch.") + return server_spec + + config = Config( + scheduler="mast_conda", + scheduler_args={ + # NOTE: TODO: support passing these args from CLI + "hpcIdentity": "genai_llm_pretraining_data", + "hpcJobOncall": "monarch", + "hpcClusterUuid": "MastProdCluster", + "rmAttribution": "pytorch4all_clients_approved", + }, + appdef=self.build_appdef(), + workspace=Workspace( + dirs=[workspace_dir for workspace_dir in EDITABLE_WORKSPACE_PATHS], + ), + ) + + await commands.get_or_create(self.job_name, config) + return server_spec + + def add_additional_packages(self, packages: Packages) -> Packages: + packages.add_package("oil.oilfs:stable") + packages.add_package("manifold.manifoldfs") + return packages + + def build_appdef(self) -> specs.AppDef: + + # create the app definition for the worker + REMOTE_END_PYTHONPATH = ":".join( + [f"{REMOTE_WORK_DIR}{workspace}" for workspace in EDITABLE_WORKSPACE_PATHS] + ) + + default_envs = { + **hyperactor.DEFAULT_NVRT_ENVS, + **hyperactor.DEFAULT_NCCL_ENVS, + **hyperactor.DEFAULT_TORCH_ENVS, + **{"TORCHX_RUN_PYTHONPATH": f"{REMOTE_END_PYTHONPATH}:{REMOTE_WORK_DIR}"}, + **{ + "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", + "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + }, + } + + packages = Packages() + meshes = [] + for mesh_name, config in self.cfg["services"].items(): + num_replicas = config["num_replicas"] + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # Create list of mesh names with indices and num_hosts + if with_gpus and num_hosts > 0: + mesh_list = [ + f"{mesh_name}_{i}:{num_hosts}:{SKU}" for i in range(num_replicas) + ] + meshes.extend(mesh_list) + + appdef = hyperactor.host_mesh_conda( + meshes=meshes, + additional_packages=self.add_additional_packages(packages), + timeout_sec=TIMEOUT_SEC, + env=default_envs, + ) + + for role in appdef.roles: + role.resource.capabilities["server_sub_types"] = [ + # role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE + role.resource.capabilities["server_sub_types"][1] # GTT + ] + + return appdef + + def create_job_name(self): + return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + + def create_server_handle(self) -> str: + return f"{SCHEDULER_NAME}:///{self.job_name}" diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 1951eab76..d8b3b5300 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -12,8 +12,17 @@ import os import socket import uuid +from abc import ABC, abstractmethod +from typing import Optional import monarch + +from forge.observability.metric_actors import ( + get_or_create_metric_logger, + setup_metric_logger, +) + +from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -21,13 +30,14 @@ from monarch.tools.components import hyperactor from monarch.tools.config import Config -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig +from omegaconf import DictConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) +JOB_NAME_KEY = "job_name" +SCHEDULER_KEY = "scheduler" + def _get_port() -> str: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -76,7 +86,54 @@ def release_gpus(self, gpu_ids: list[str]) -> None: self.available_gpus.add(int(gpu_id)) -class 
Provisioner: +class BaseProvisioner(ABC): + """Abstract base class for resource provisioners.""" + + @abstractmethod + async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: + """Creates a remote server and a HostMesh on it. + Args: + name: Name identifier for the host mesh + num_hosts: Number of hosts to create + Returns: + HostMesh: The created host mesh + """ + pass + + @abstractmethod + async def get_proc_mesh( + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: Optional[int] = None, + mesh_name: Optional[str] = None, + ) -> ProcMesh: + """Gets a proc mesh. + Args: + num_procs: Number of processes needed + with_gpus: Whether GPU support is required + num_hosts: Number of hosts (None implies local allocation) + mesh_name: Name identifier for the proc mesh + Returns: + ProcMesh: The allocated process mesh + """ + pass + + @abstractmethod + async def stop_proc_mesh(self, proc_mesh: ProcMesh) -> None: + """Stops a proc mesh. + Args: + proc_mesh: The process mesh to stop + """ + pass + + @abstractmethod + async def shutdown(self) -> None: + """Tears down all remaining remote allocations.""" + pass + + +class Provisioner(BaseProvisioner): """A global resource provisioner.""" def __init__(self): @@ -145,7 +202,11 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: ) async def get_proc_mesh( - self, num_procs: int, with_gpus: bool = False, num_hosts: int | None = None + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: int | None = None, + mesh_name: Optional[str] = None, ): """Gets a proc mesh. @@ -245,28 +306,47 @@ async def shutdown(self): commands.kill(server_name) -_provisioner: Provisioner | None = None +_provisioner: BaseProvisioner | None = None -def _get_provisioner(): +async def init_provisioner(cfg: DictConfig | None = None): global _provisioner if not _provisioner: - _provisioner = Provisioner() + scheduler = Scheduler.LOCAL + if cfg is not None: + scheduler = cfg.get(SCHEDULER_KEY, Scheduler.LOCAL.value) + if scheduler == Scheduler.MAST.value: + from forge.controller.launcher.mast import MastProvisioner + + _provisioner = MastProvisioner(cfg=cfg) + await _provisioner.initialize() + else: + _provisioner = Provisioner() + return _provisioner + + +async def _get_provisioner(): + if not _provisioner: + await init_provisioner() return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - return await _get_provisioner().get_proc_mesh( + provisioner = await _get_provisioner() + return await provisioner.get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, + mesh_name=config.mesh_name, ) async def stop_proc_mesh(proc_mesh: ProcMesh): - return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) + provisioner = await _get_provisioner() + return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - await _get_provisioner().shutdown() + provisioner = await _get_provisioner() + return await provisioner.shutdown() diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index 09b0a2ce6..7331fa401 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -13,11 +13,11 @@ from enum import Enum from typing import Optional -from monarch.actor import ActorError - from forge.controller import ForgeActor from forge.types import ProcessConfig +from monarch.actor import ActorError + logger = logging.getLogger(__name__) 
logger.setLevel(logging.DEBUG) @@ -159,6 +159,7 @@ async def initialize(self): # Deploy the actor and its underlying resources logger.debug(f"Launching actor for replica {self.idx}") + self.proc_config.mesh_name = f"{self.proc_config.mesh_name}_{self.idx}" self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, diff --git a/src/forge/types.py b/src/forge/types.py index cc41d2185..271797d95 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from dataclasses import dataclass, field +from enum import Enum from typing import Any, TypedDict, Union @@ -87,6 +88,12 @@ class State: metadata: dict[str, Any] = field(default_factory=dict) +class Scheduler(Enum): + MAST = "mast" + SLURM = "slurm" + LOCAL = "local" + + @dataclass class ProcessConfig: """A proc_mesh config for the torchx scheduler.""" @@ -94,6 +101,7 @@ class ProcessConfig: procs: int = 1 with_gpus: bool = False hosts: int | None = None + mesh_name: str | None = None @dataclass @@ -118,6 +126,7 @@ class ServiceConfig: health_poll_rate: float = 0.2 replica_max_concurrent_requests: int = 10 return_first_rank_result: bool = True + mesh_name: str | None = None def to_process_config(self) -> ProcessConfig: """Extract ProcessConfig from this ServiceConfig. @@ -127,6 +136,7 @@ def to_process_config(self) -> ProcessConfig: procs=self.procs, with_gpus=self.with_gpus, hosts=self.hosts, + mesh_name=self.mesh_name, ) From 721c7055da75e9b60a9c9d0a5dba296a50710e54 Mon Sep 17 00:00:00 2001 From: rithesh Date: Mon, 29 Sep 2025 15:52:28 -0700 Subject: [PATCH 02/17] minor change --- apps/mast/qwen3_1_7b_mast.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 21a58df0b..597be0ff2 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -106,8 +106,8 @@ services: num_replicas: 2 with_gpus: true ref_model: - procs: 2 - num_replicas: 1 + procs: 1 + num_replicas: 2 with_gpus: true reward_actor: procs: 1 From db5db98ac958b90a1ec23ed021fb76eac961c4a9 Mon Sep 17 00:00:00 2001 From: rithesh Date: Tue, 30 Sep 2025 13:42:12 -0700 Subject: [PATCH 03/17] interim changes --- apps/mast/qwen3_1_7b_mast.yaml | 34 +++++++++++++++++++++++---- src/forge/controller/launcher/mast.py | 9 +++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 597be0ff2..2c266c2bc 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -8,6 +8,20 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen3-1_7b-2a48e + +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -20,7 +34,7 @@ dataset: # Policy configuration policy: engine_config: - model: ${model} + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 tensor_parallel_size: 1 pipeline_parallel_size: 1 enforce_eager: false @@ -35,7 +49,7 @@ trainer: model: name: qwen3 flavor: 1.7B - hf_assets_path: hf://${model} + hf_assets_path: 
/mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 optimizer: name: AdamW lr: 1e-5 @@ -61,7 +75,7 @@ trainer: disable_loss_parallel: true checkpoint: enable: false - initial_load_path: hf://${model} + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true last_save_in_hf: true interval: 500 @@ -81,7 +95,7 @@ ref_model: model: name: qwen3 flavor: 1.7B - hf_assets_path: hf://${model} + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 training: dtype: bfloat16 gc_freq: 1 @@ -96,7 +110,7 @@ ref_model: expert_parallel_degree: 1 checkpoint: enable: true - initial_load_path: hf://${model} + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true # All resource allocations @@ -105,25 +119,35 @@ services: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true + mesh_name: policy + hosts: 1 ref_model: procs: 1 num_replicas: 2 with_gpus: true + mesh_name: ref_model + hosts: 1 reward_actor: procs: 1 num_replicas: 1 with_gpus: false + mesh_name: reward_actor actors: dataset: procs: 1 with_gpus: false + mesh_name: dataset trainer: procs: 1 with_gpus: true + mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 with_gpus: false + mesh_name: replay_buffer compute_advantages: procs: 1 with_gpus: false + mesh_name: compute_advantages diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 8cc3ac323..7bda56c6b 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -317,6 +317,7 @@ def build_appdef(self) -> specs.AppDef: packages = Packages() meshes = [] + # Process both services and actors configurations for mesh_name, config in self.cfg["services"].items(): num_replicas = config["num_replicas"] with_gpus = bool(config["with_gpus"]) @@ -328,6 +329,14 @@ def build_appdef(self) -> specs.AppDef: ] meshes.extend(mesh_list) + for mesh_name, config in self.cfg["actors"].items(): + num_replicas = 1 + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # single actors with GPUs + if with_gpus: + meshes.append(f"{mesh_name}:{num_replicas}:{SKU}") + appdef = hyperactor.host_mesh_conda( meshes=meshes, additional_packages=self.add_additional_packages(packages), From 75aa422a727deff4487bbc24bc254ed6ff677bed Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 08:24:21 -0700 Subject: [PATCH 04/17] fix the bug --- src/forge/controller/provisioner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index d8b3b5300..05b52e8af 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -17,10 +17,7 @@ import monarch -from forge.observability.metric_actors import ( - get_or_create_metric_logger, - setup_metric_logger, -) +from forge.observability.metric_actors import get_or_create_metric_logger from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer From 5be96991996c22f38417c21d64a8dd8449237c35 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 08:44:15 -0700 Subject: [PATCH 05/17] some more fixes --- apps/mast/README.md | 31 ++++++++++++++++ 
apps/mast/env_setup.sh | 47 +++---------------------- apps/mast/qwen3_1_7b_mast.yaml | 2 +- src/forge/controller/launcher/mast.py | 2 +- src/forge/controller/service/replica.py | 5 ++- 5 files changed, 41 insertions(+), 46 deletions(-) create mode 100644 apps/mast/README.md diff --git a/apps/mast/README.md b/apps/mast/README.md new file mode 100644 index 000000000..6cd48d32d --- /dev/null +++ b/apps/mast/README.md @@ -0,0 +1,31 @@ +# Forge MAST Environment Setup + +A simple setup script to automatically configure your environment for running Forge with MAST jobs. + +## Quick Start + +### 1. Run the Setup Script + +The `env_setup.sh` script will automatically: +- ✅ Activate the required conda environment (`forge-8448524`) +- ✅ Clone/update the Forge repository +- ✅ Install Forge package dependencies +- ✅ Mount the required oilfs workspace to `/mnt/wsfuse` +- ✅ Configure your environment for MAST job submission + +```bash +# Make the script executable +chmod +x env_setup.sh + +# Run the setup +./apps/mast/env_setup.sh + +``` + +### 2. Submit MAST job + +``` +pip install --force-reinstall --no-deps . && python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml +``` + +⚠️ Important Note: `pip install --force-reinstall --no-deps .` is required every time you make a change to the local codebase. This ensures your latest changes are installed before job submission. diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh index 6728c87db..88d2edf6f 100755 --- a/apps/mast/env_setup.sh +++ b/apps/mast/env_setup.sh @@ -139,7 +139,6 @@ FBSOURCE_PATH="/data/users/$USER/fbsource" CONDA_SCRIPT_PATH="$FBSOURCE_PATH/genai/xlformers/dev/xl_conda.sh" FORGE_BASE_DIR="/data/users/$USER" FORGE_REPO_DIR="$FORGE_BASE_DIR/forge" -MONARCH_DIR="$HOME/monarch_no_torch_latest" # Workspace URL for mounting WORKSPACE_URL="ws://ws.ai.pci0ai/genai_fair_llm" @@ -163,10 +162,10 @@ if [ ! -f "$CONDA_SCRIPT_PATH" ]; then fi log_info "Sourcing conda script: $CONDA_SCRIPT_PATH" -source "$CONDA_SCRIPT_PATH" activate forge:8448524 +source "$CONDA_SCRIPT_PATH" activate forge:e146614 if [ $? -ne 0 ]; then - log_error "Failed to activate conda environment forge-8448524" + log_error "Failed to activate conda environment forge-e146614" exit 1 fi @@ -233,44 +232,6 @@ if [ $? -ne 0 ]; then fi log_info "Forge package installed successfully" -# Step 6: Navigate to monarch directory -log_info "Step 6: Setting up monarch directory..." -if [ ! -d "$MONARCH_DIR" ]; then - log_info "Creating monarch directory: $MONARCH_DIR" - mkdir -p "$MONARCH_DIR" -fi - -cd "$MONARCH_DIR" -log_info "Changed to directory: $(pwd)" - -# Step 7: Fetch monarch package -log_info "Step 7: Fetching monarch package..." -# TODO: Remove hardcodedm version -fbpkg fetch monarch_no_torch:23 -if [ $? -ne 0 ]; then - log_error "Failed to fetch monarch_no_torch:23" - log_error "Please ensure fbpkg is properly configured" - exit 1 -fi -log_info "Monarch package fetched successfully" - -# Step 8: Install monarch wheel -log_info "Step 8: Installing monarch wheel..." -WHEEL_FILE="monarch-0.0.0-py3.10-none-any.whl" -if [ ! -f "$WHEEL_FILE" ]; then - log_error "Wheel file not found: $WHEEL_FILE" - log_error "Available files in directory:" - ls -la *.whl 2>/dev/null || log_error "No wheel files found" - exit 1 -fi - -pip install --force-reinstall "$WHEEL_FILE" -if [ $? 
-ne 0 ]; then - log_error "Failed to install monarch wheel" - exit 1 -fi -log_info "Monarch wheel installed successfully" - log_info "Environment activation completed" # Final verification @@ -301,9 +262,9 @@ pip list | grep -E "(forge|monarch)" || log_warn "No forge/monarch packages foun log_info "Environment setup complete! You can now run your scripts." log_info "Mounted workspace available at: /mnt/wsfuse" -# Step 9: Ask user to deactivate and activate conda env conda environment +# Step 6: Ask user to deactivate and activate conda env conda environment echo "" log_info "Installation completed successfully!" echo "" log_info "Re-activate the conda environment to make the changes take effect:" -log_info "conda deactivate && conda activate forge-8448524" +log_info "conda deactivate && conda activate forge-e146614" diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 2c266c2bc..a1804e68f 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,7 +9,7 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b-2a48e +job_name: forge-qwen3-1_7b # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 7bda56c6b..856ca622e 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -232,7 +232,7 @@ def bootstrap(gpu_ids: list[str]): bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), ) await procs.initialized - setup = await procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) hostname, port = await setup.get_info.choose() await setup.mount.call(mount_dst="/mnt/wsfuse") procs._hostname = hostname diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index 7331fa401..b0804cba6 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -159,7 +159,10 @@ async def initialize(self): # Deploy the actor and its underlying resources logger.debug(f"Launching actor for replica {self.idx}") - self.proc_config.mesh_name = f"{self.proc_config.mesh_name}_{self.idx}" + mesh_name_with_replica = f"{self.proc_config.mesh_name}_{self.idx}" + self.proc_config.mesh_name = mesh_name_with_replica + if hasattr(self.actor_def, "mesh_name"): + setattr(self.actor_def, "mesh_name", mesh_name_with_replica) self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, From 7e69a6b6d85ee2b558e8c7efb6202651ab3f91d6 Mon Sep 17 00:00:00 2001 From: Allen Wang <9057208+allenwang28@users.noreply.github.com> Date: Wed, 1 Oct 2025 09:57:19 -0700 Subject: [PATCH 06/17] park --- src/forge/controller/launcher/mast.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 856ca622e..cec5e87a6 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -27,13 +27,14 @@ except ImportError as e: print(f"Warning: Monarch imports failed: {e}") print("Monarch functionality will be limited") -from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host from monarch.tools import commands from monarch.tools.commands 
import info from monarch.tools.config import Config, Workspace from omegaconf import DictConfig +from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -278,7 +279,9 @@ async def launch_mast_job(self): scheduler="mast_conda", scheduler_args={ # NOTE: TODO: support passing these args from CLI - "hpcIdentity": "genai_llm_pretraining_data", + "hpcIdentity": "hyper_monarch", + # "hpcIdentity": "genai_llm_pretraining_data", + # "hpcIdentity": "pytorch_distributed", "hpcJobOncall": "monarch", "hpcClusterUuid": "MastProdCluster", "rmAttribution": "pytorch4all_clients_approved", From 649fe83d71fae85a9937496fa46dbb07b086bd36 Mon Sep 17 00:00:00 2001 From: Allen Wang <9057208+allenwang28@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:07:36 -0700 Subject: [PATCH 07/17] parking changes --- apps/grpo/main.py | 1 + src/forge/controller/launcher/mast.py | 16 ++++++++++++++ src/forge/controller/provisioner.py | 31 +++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 7545aa561..42865334e 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -316,6 +316,7 @@ async def main(cfg: DictConfig): metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() await mlogger.init_backends.call_one(metric_logging_cfg) + print("SUCCESSFULLY CREATED AND INITIALIZED MLOGGER") # ---- Setup services ---- # await ts.initialize(strategy=ts.ControllerStorageVolumes()) diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index cec5e87a6..1aacd87dd 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -17,6 +17,8 @@ import torchx.specs as specs from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints +from forge.observability.metric_actors import get_or_create_metric_logger + try: from monarch._src.actor.actor_mesh import current_rank from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig @@ -249,11 +251,18 @@ def bootstrap(gpu_ids: list[str]): self._server_names.append(server_name) self._proc_server_map[procs] = server_name + _ = await get_or_create_metric_logger(procs) + return procs async def stop_proc_mesh(self, proc_mesh: ProcMesh): """Stops a proc mesh.""" async with self._lock: + # Deregister local logger from global logger + if hasattr(proc_mesh, "_local_fetcher"): + global_logger = await get_or_create_metric_logger(proc_mesh) + await global_logger.deregister_fetcher.call_one(proc_mesh) + if hasattr(proc_mesh, "_gpu_ids"): gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] gpu_manager.release_gpus(proc_mesh._gpu_ids) @@ -315,9 +324,16 @@ def build_appdef(self) -> specs.AppDef: **{ "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + "TORCHINDUCTOR_COMPILE_THREADS": "1", + "TORCH_COMPILE_DISABLE": "1", + "TORCHDYNAMO_VERBOSE": "1", + "VLLM_TORCH_COMPILE_LEVEL": "0", + "VLLM_USE_TRITON_FLASH_ATTN": "0", }, } + print("DEFAULT ENVS: ", default_envs) + packages = Packages() meshes = [] # Process both services and actors configurations diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 05b52e8af..5b0d23783 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,10 +16,6 @@ from typing import Optional 
import monarch - -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -29,6 +25,10 @@ from omegaconf import DictConfig +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -252,6 +252,29 @@ def bootstrap(gpu_ids: list[str]): os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "0" + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + os.environ["NVTE_TORCH_COMPILE"] = "0" + os.environ["NVTE_BIAS_GELU_NVFUSION"] = "0" + os.environ["NVTE_CUDA_INCLUDE_DIR"] = "/usr/local/cuda/include" + os.environ["NVTE_DISABLE_NVRTC"] = "1" + os.environ["NVTE_FUSED_ATTN"] = "1" + os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" + os.environ["NCCL_SET_THREAD_NAME"] = "1'" + os.environ[ + "NCCL_DEBUG_SUBSYS" + ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" + os.environ["NCCL_NET_OVERHEAD"] = "2750" + os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" + os.environ["NCCL_IB_QPS_PER_CONNECTION"] = "16" + os.environ["NCCL_CTRAN_ENABLE"] = "0" + os.environ["TORCH_SHOW_CPP_STACKTRACES"] = "1" + os.environ["PYTORCH_JIT"] = "0" + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + os.environ["GLOG_minloglevel"] = "1" + gpu_ids = gpu_manager.get_gpus(num_procs) procs = host_mesh.spawn_procs( per_host={"gpus": num_procs}, From 0d683a454f09259382f51d916dd05dc390abe010 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 14:21:45 -0700 Subject: [PATCH 08/17] config changes --- apps/mast/qwen3_1_7b_mast.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index a1804e68f..993bf0570 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,7 +9,8 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b +job_name: forge-qwen3-1_7b-1190 +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas @@ -74,7 +75,7 @@ trainer: expert_parallel_degree: 1 disable_loss_parallel: true checkpoint: - enable: false + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true last_save_in_hf: true @@ -83,6 +84,7 @@ trainer: activation_checkpoint: mode: selective selective_ac_option: op + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: From 097642d951155850821ca61a1e5a89d5054ad1a6 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:30:40 -0700 Subject: [PATCH 09/17] working changes --- apps/mast/env_setup.sh | 2 +- apps/mast/main.py | 19 +++++++- apps/mast/qwen3_14b_mast.yaml | 71 ++++++++++++++++------------- apps/mast/qwen3_1_7b_mast.yaml | 11 ++++- apps/mast/qwen3_32b_mast.yaml | 69 ++++++++++++++++------------ 
apps/mast/qwen3_4b_mast.yaml | 70 ++++++++++++++++------------ apps/mast/qwen3_8b_mast.yaml | 70 ++++++++++++++++------------ src/forge/controller/provisioner.py | 14 +++--- 8 files changed, 193 insertions(+), 133 deletions(-) diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh index 88d2edf6f..4318e05f0 100755 --- a/apps/mast/env_setup.sh +++ b/apps/mast/env_setup.sh @@ -70,7 +70,7 @@ mount_workspace() { unset LD_LIBRARY_PATH # Mount the workspace - if oilfs "$workspace_url" "$mount_dir"; then + if sudo oilfs "$workspace_url" "$mount_dir"; then log_info "Successfully mounted $workspace_url to $mount_dir" else log_error "Failed to mount $workspace_url to $mount_dir" diff --git a/apps/mast/main.py b/apps/mast/main.py index fd1819ed2..8029f35e1 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -6,6 +6,7 @@ import asyncio import getpass +import uuid from apps.grpo.main import main as grpo_main from forge.cli.config import parse @@ -14,6 +15,9 @@ from forge.types import Scheduler from omegaconf import DictConfig +DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" +DEFAULT_CHECKPOINT_FOLDER = "/mnt/wsfuse/teamforge/forge_runs/" + async def main(cfg: DictConfig): """Main module for launching mast jobs for GRPO training.""" @@ -21,8 +25,19 @@ async def main(cfg: DictConfig): raise ValueError("Schuduler must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: - # prepend user name to the job to avoid name collision - cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" + # prepend user name and append guid to the job to avoid name collision + cfg[JOB_NAME_KEY] = ( + f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" + ) + print(f"Overriding mast job name to {cfg[JOB_NAME_KEY]}") + + if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: + # append job_name to CP folder path to avoid path collision + if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: + cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( + f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}" + ) + print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner await init_provisioner(cfg) diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml index 2429077fc..198ecf8f2 100644 --- a/apps/mast/qwen3_14b_mast.yaml +++ b/apps/mast/qwen3_14b_mast.yaml @@ -1,5 +1,5 @@ - # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -9,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-14B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-14B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-14b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ + +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -29,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. 
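The apps/mast/main.py hunk above derives the MAST job name and the default checkpoint folder before submission. Below is a minimal sketch of that naming scheme, assuming a plain dict in place of the OmegaConf DictConfig and a hypothetical `apply_mast_defaults` helper; a later patch in this series moves the guid from the job name onto the checkpoint path.

```python
# Sketch only, not part of the patch: mirrors the naming applied in apps/mast/main.py.
import getpass
import uuid

JOB_NAME_KEY = "job_name"
CHECKPOINT_FOLDER_KEY = "checkpoint_folder"
DEFAULT_CHECKPOINT_FOLDER = "/mnt/wsfuse/teamforge/forge_runs/"


def apply_mast_defaults(cfg: dict) -> dict:
    # Prepend the user and append a short guid to avoid job-name collisions,
    # then append the job name to the default checkpoint folder.
    if cfg.get(JOB_NAME_KEY) is not None:
        cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}"
    if cfg.get(CHECKPOINT_FOLDER_KEY) == DEFAULT_CHECKPOINT_FOLDER:
        cfg[CHECKPOINT_FOLDER_KEY] = f"{cfg[CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}"
    return cfg


print(apply_mast_defaults({JOB_NAME_KEY: "forge-qwen3-14b",
                           CHECKPOINT_FOLDER_KEY: DEFAULT_CHECKPOINT_FOLDER}))
# e.g. {'job_name': 'alice-forge-qwen3-14b-1a2b3c',
#       'checkpoint_folder': '/mnt/wsfuse/teamforge/forge_runs/alice-forge-qwen3-14b-1a2b3c'}
```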
disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -74,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -108,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} - num_replicas: 14 + num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 14 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 993bf0570..44a3fa906 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,8 +9,8 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b-1190 -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-1_7b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas @@ -39,6 +39,9 @@ policy: tensor_parallel_size: 1 pipeline_parallel_size: 1 enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # needs to revisited. + disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} @@ -84,6 +87,10 @@ trainer: activation_checkpoint: mode: selective selective_ac_option: op + comm: + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 dcp_path: ${checkpoint_folder} # Replay buffer configuration diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml index 3c77adfa3..a6818b41c 100644 --- a/apps/mast/qwen3_32b_mast.yaml +++ b/apps/mast/qwen3_32b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-32B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-32B -checkpoint_folder: /mnt/wsfuse/$user$/forge_runs/${job_name}/20 +job_name: forge-qwen3-32b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,47 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: - # procs: ${trainer.parallelism.data_parallel_shard_degree} procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml index 1690494e8..a2962122b 100644 --- a/apps/mast/qwen3_4b_mast.yaml +++ b/apps/mast/qwen3_4b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-4B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-4B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-4b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy - trainer: - procs: 8 - num_replicas: 1 + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 with_gpus: true + mesh_name: ref_model hosts: 1 - mesh_name: trainer - replay_buffer: + reward_actor: procs: 1 num_replicas: 1 with_gpus: false - mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 + mesh_name: reward_actor + +actors: + dataset: + procs: 8 + with_gpus: false + mesh_name: dataset + trainer: + procs: 1 with_gpus: true + mesh_name: trainer hosts: 1 - mesh_name: ref_model - compute_advantages: + replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false - mesh_name: compute_advantages - reward_actor: + mesh_name: replay_buffer + compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false - mesh_name: reward_actor + mesh_name: compute_advantages diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml index d9ed947ff..e711adbdb 100644 --- a/apps/mast/qwen3_8b_mast.yaml +++ b/apps/mast/qwen3_8b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-8B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-8B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-8b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 - top_p: 1.0= - checkpoint_path: ${checkpoint_folder} + top_p: 1.0 # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 5b0d23783..6a6f7508b 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,6 +16,10 @@ from typing import Optional import monarch + +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -25,10 +29,6 @@ from omegaconf import DictConfig -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler - logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -261,9 +261,9 @@ def bootstrap(gpu_ids: list[str]): os.environ["NVTE_FUSED_ATTN"] = "1" os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ[ - "NCCL_DEBUG_SUBSYS" - ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + os.environ["NCCL_DEBUG_SUBSYS"] = ( + "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + ) os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" os.environ["NCCL_NET_OVERHEAD"] = "2750" os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" From f2416346dee00428622d7c20899e851a9d8ebce6 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:37:21 -0700 Subject: [PATCH 10/17] minor changes --- apps/grpo/main.py | 1 - apps/mast/__init__.py | 5 +++++ apps/mast/main.py | 10 ++++------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 42865334e..7545aa561 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -316,7 +316,6 @@ async def main(cfg: DictConfig): metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() await mlogger.init_backends.call_one(metric_logging_cfg) - print("SUCCESSFULLY CREATED AND INITIALIZED MLOGGER") # ---- Setup 
services ---- # await ts.initialize(strategy=ts.ControllerStorageVolumes()) diff --git a/apps/mast/__init__.py b/apps/mast/__init__.py index e69de29bb..2e41cd717 100644 --- a/apps/mast/__init__.py +++ b/apps/mast/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/apps/mast/main.py b/apps/mast/main.py index 8029f35e1..382b793b2 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -25,17 +25,15 @@ async def main(cfg: DictConfig): raise ValueError("Schuduler must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: - # prepend user name and append guid to the job to avoid name collision - cfg[JOB_NAME_KEY] = ( - f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" - ) + # prepend user name to the job to avoid name collision + cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" print(f"Overriding mast job name to {cfg[JOB_NAME_KEY]}") if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: - # append job_name to CP folder path to avoid path collision + # append job_name and guid to CP folder path to avoid path collision if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( - f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}" + f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" ) print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") From 1c794232b98cb6c32ce82b83660fb14d59105c19 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:46:25 -0700 Subject: [PATCH 11/17] lints --- apps/mast/main.py | 6 ++--- src/forge/actors/policy.py | 35 ++++++++++++------------- src/forge/controller/actor.py | 4 +-- src/forge/controller/provisioner.py | 14 +++++----- src/forge/controller/service/replica.py | 6 ++--- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/apps/mast/main.py b/apps/mast/main.py index 382b793b2..92d81082c 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -32,9 +32,9 @@ async def main(cfg: DictConfig): if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: # append job_name and guid to CP folder path to avoid path collision if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: - cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( - f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" - ) + cfg[ + DEFAULT_CHECKPOINT_FOLDER_KEY + ] = f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner diff --git a/src/forge/actors/policy.py b/src/forge/actors/policy.py index 00d788bf4..4b61f096c 100644 --- a/src/forge/actors/policy.py +++ b/src/forge/actors/policy.py @@ -18,24 +18,6 @@ import torch import torch.distributed.checkpoint as dcp import torchstore as ts - -from forge.actors._torchstore_utils import ( - DcpHandle, - extract_param_name, - get_dcp_whole_state_dict_key, - get_param_key, - get_param_prefix, - load_tensor_from_dcp, -) - -from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh -from forge.data.sharding import VLLMSharding -from forge.data_models.completion import Completion -from forge.data_models.prompt import to_prompt -from forge.interfaces import Policy as PolicyInterface -from 
forge.observability.metrics import record_metric, Reduce -from forge.observability.perf_tracker import Tracer -from forge.types import ProcessConfig from monarch.actor import current_rank, endpoint, ProcMesh from torchstore.state_dict_utils import DELIM from vllm.config import VllmConfig @@ -60,6 +42,23 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm.worker.worker_base import WorkerWrapperBase +from forge.actors._torchstore_utils import ( + extract_param_name, + get_dcp_whole_state_dict_key, + get_param_key, + get_param_prefix, + load_tensor_from_dcp, +) + +from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh +from forge.data.sharding import VLLMSharding +from forge.data_models.completion import Completion +from forge.data_models.prompt import to_prompt +from forge.interfaces import Policy as PolicyInterface +from forge.observability.metrics import record_metric, Reduce +from forge.observability.perf_tracker import Tracer +from forge.types import ProcessConfig + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index 2c1cf3655..8bf1d4765 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -10,12 +10,12 @@ import sys from typing import Any, Type, TypeVar +from monarch.actor import Actor, current_rank, current_size, endpoint + from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh from forge.types import ProcessConfig, ServiceConfig -from monarch.actor import Actor, current_rank, current_size, endpoint - logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) T = TypeVar("T", bound="ForgeActor") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 6a6f7508b..5b0d23783 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,10 +16,6 @@ from typing import Optional import monarch - -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -29,6 +25,10 @@ from omegaconf import DictConfig +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -261,9 +261,9 @@ def bootstrap(gpu_ids: list[str]): os.environ["NVTE_FUSED_ATTN"] = "1" os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ["NCCL_DEBUG_SUBSYS"] = ( - "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" - ) + os.environ[ + "NCCL_DEBUG_SUBSYS" + ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" os.environ["NCCL_NET_OVERHEAD"] = "2750" os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index b0804cba6..dfdb10169 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -13,11 +13,11 @@ from enum import Enum from typing import Optional +from monarch.actor import ActorError + from forge.controller import ForgeActor from forge.types import ProcessConfig -from monarch.actor import ActorError - logger = logging.getLogger(__name__) 
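The replica.py hunk that follows finalizes the per-replica mesh naming introduced earlier in the series: each replica suffixes its configured mesh_name with its index, which lines up with the per-replica task groups emitted by build_appdef. A self-contained sketch of that convention, with a stand-in dataclass for forge.types.ProcessConfig:

```python
# Sketch only, not part of the patch: per-replica mesh names vs. MAST task groups.
from dataclasses import dataclass


@dataclass
class ProcConfigStub:  # stand-in for forge.types.ProcessConfig
    procs: int = 1
    with_gpus: bool = False
    hosts: int | None = None
    mesh_name: str | None = None


def replica_mesh_name(cfg: ProcConfigStub, replica_idx: int) -> str:
    # Mirrors Replica.initialize(): "<mesh_name>_<idx>"
    return f"{cfg.mesh_name}_{replica_idx}"


def service_task_groups(mesh_name: str, num_replicas: int, num_hosts: int, sku: str) -> list[str]:
    # Mirrors the loop over cfg["services"] in build_appdef()
    return [f"{mesh_name}_{i}:{num_hosts}:{sku}" for i in range(num_replicas)]


cfg = ProcConfigStub(with_gpus=True, hosts=1, mesh_name="policy")
assert replica_mesh_name(cfg, 0) == "policy_0"
assert service_task_groups("policy", 2, 1, "gtt_any") == ["policy_0:1:gtt_any", "policy_1:1:gtt_any"]
```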
logger.setLevel(logging.DEBUG) @@ -162,7 +162,7 @@ async def initialize(self): mesh_name_with_replica = f"{self.proc_config.mesh_name}_{self.idx}" self.proc_config.mesh_name = mesh_name_with_replica if hasattr(self.actor_def, "mesh_name"): - setattr(self.actor_def, "mesh_name", mesh_name_with_replica) + self.actor_def.mesh_name = mesh_name_with_replica self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, From cccaf5094625dbec0b8bd465b511b03f3b4370fd Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:55:38 -0700 Subject: [PATCH 12/17] clean up some changes --- src/forge/controller/provisioner.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 5b0d23783..f4344909d 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -252,29 +252,6 @@ def bootstrap(gpu_ids: list[str]): os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "0" - os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - os.environ["NVTE_TORCH_COMPILE"] = "0" - os.environ["NVTE_BIAS_GELU_NVFUSION"] = "0" - os.environ["NVTE_CUDA_INCLUDE_DIR"] = "/usr/local/cuda/include" - os.environ["NVTE_DISABLE_NVRTC"] = "1" - os.environ["NVTE_FUSED_ATTN"] = "1" - os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" - os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ[ - "NCCL_DEBUG_SUBSYS" - ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" - os.environ["NCCL_NET_OVERHEAD"] = "2750" - os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" - os.environ["NCCL_IB_QPS_PER_CONNECTION"] = "16" - os.environ["NCCL_CTRAN_ENABLE"] = "0" - os.environ["TORCH_SHOW_CPP_STACKTRACES"] = "1" - os.environ["PYTORCH_JIT"] = "0" - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - os.environ["GLOG_minloglevel"] = "1" - gpu_ids = gpu_manager.get_gpus(num_procs) procs = host_mesh.spawn_procs( per_host={"gpus": num_procs}, From 91216bafb4907388c5c227b8cf328d8bae85a748 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 16:08:40 -0700 Subject: [PATCH 13/17] failing tests --- src/forge/controller/actor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index 8bf1d4765..bb495b641 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -26,6 +26,7 @@ class ForgeActor(Actor): hosts: int | None = None with_gpus: bool = False num_replicas: int = 1 + mesh_name: str | None = None _extra_config: dict[str, Any] = {} def __init__(self, *args, **kwargs): From ab5a197c050911016795ebeca041f62f77ca9d58 Mon Sep 17 00:00:00 2001 From: rithesh Date: Thu, 2 Oct 2025 14:16:10 -0700 Subject: [PATCH 14/17] some design changes --- apps/grpo/main.py | 7 +- apps/mast/main.py | 9 +- apps/mast/qwen3_14b_mast.yaml | 2 +- apps/mast/qwen3_1_7b_mast.yaml | 2 +- apps/mast/qwen3_32b_mast.yaml | 2 +- apps/mast/qwen3_4b_mast.yaml | 2 +- apps/mast/qwen3_8b_mast.yaml | 2 +- src/forge/controller/launcher.py | 318 ++++++++++++++++++ src/forge/controller/launcher/__init__.py | 5 - src/forge/controller/launcher/mast.py | 378 ---------------------- src/forge/controller/provisioner.py | 150 ++------- src/forge/types.py | 2 +- 12 files changed, 362 insertions(+), 517 deletions(-) create mode 100644 
src/forge/controller/launcher.py delete mode 100644 src/forge/controller/launcher/__init__.py delete mode 100644 src/forge/controller/launcher/mast.py diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 7545aa561..138e406b0 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -7,7 +7,6 @@ # Usage: python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml import asyncio - import time import uuid from dataclasses import dataclass @@ -27,7 +26,8 @@ from forge.actors.trainer import RLTrainer from forge.cli.config import parse from forge.controller.actor import ForgeActor -from forge.controller.provisioner import shutdown + +from forge.controller.provisioner import init_provisioner, shutdown from forge.data.rewards import MathReward, ThinkingReward from forge.observability.metric_actors import get_or_create_metric_logger from forge.observability.metrics import record_metric, Reduce @@ -312,6 +312,9 @@ async def main(cfg: DictConfig): max_req_tokens = cfg.max_req_tokens max_res_tokens = cfg.max_res_tokens + # init provisioner + await init_provisioner(cfg) + # initialize before spawning services metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() diff --git a/apps/mast/main.py b/apps/mast/main.py index 92d81082c..9627bcc24 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -10,9 +10,10 @@ from apps.grpo.main import main as grpo_main from forge.cli.config import parse -from forge.controller.provisioner import init_provisioner, JOB_NAME_KEY, SCHEDULER_KEY +from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY +from forge.controller.provisioner import init_provisioner -from forge.types import Scheduler +from forge.types import Launcher from omegaconf import DictConfig DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" @@ -21,8 +22,8 @@ async def main(cfg: DictConfig): """Main module for launching mast jobs for GRPO training.""" - if cfg.get(SCHEDULER_KEY, Scheduler.MAST.value) != Scheduler.MAST.value: - raise ValueError("Schuduler must be MAST.") + if cfg.get(LAUNCHER_KEY, Launcher.MAST.value) != Launcher.MAST.value: + raise ValueError("Launcher must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: # prepend user name to the job to avoid name collision diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml index 198ecf8f2..83d5b8103 100644 --- a/apps/mast/qwen3_14b_mast.yaml +++ b/apps/mast/qwen3_14b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-14B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-14b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 44a3fa906..58d879579 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-1_7b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml index a6818b41c..0db8f4af3 100644 --- a/apps/mast/qwen3_32b_mast.yaml +++ b/apps/mast/qwen3_32b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-32B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-32b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git 
a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml index a2962122b..92119055a 100644 --- a/apps/mast/qwen3_4b_mast.yaml +++ b/apps/mast/qwen3_4b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-4B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-4b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml index e711adbdb..7f2f99694 100644 --- a/apps/mast/qwen3_8b_mast.yaml +++ b/apps/mast/qwen3_8b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-8B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-8b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py new file mode 100644 index 000000000..2db56b2ee --- /dev/null +++ b/src/forge/controller/launcher.py @@ -0,0 +1,318 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import os +import socket +import subprocess +import uuid +from typing import Any + +import monarch + +import torchx.specs as specs + +from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints +from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer +from monarch.actor import Actor, endpoint, ProcMesh +from monarch.tools import commands +from monarch.tools.commands import info +from monarch.tools.components import hyperactor +from monarch.tools.config import Config, Workspace +from omegaconf import DictConfig + +from forge.types import Launcher + +try: + from monarch._src.actor.actor_mesh import current_rank + from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig + from monarch.tools.components.meta import hyperactor as meta_hyperactor + from torchx.specs import AppState + from torchx.specs.fb.component_helpers import Packages +except ImportError as e: + print(f"Warning: Monarch meta/fb inetrnal imports failed: {e}") + print("Monarch functionality will be limited") + +JOB_NAME_KEY = "job_name" +LAUNCHER_KEY = "launcher" + + +def _get_port() -> str: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + addr = s.getsockname() + port = addr[1] + return str(port) + + +class SetupActor(Actor): + @endpoint + def get_info(self) -> [str, str]: + return socket.gethostname(), _get_port() + + +class MastSetupActor(SetupActor): + @endpoint + def mount(self, mount_dst: str): + point = current_rank() + # The last dimension is the local proc count. 
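The mount gating in MastSetupActor.mount below reduces to "run only on the first local rank of each host", since the last extent dimension is the local proc count. A minimal sketch of the same arithmetic, with hypothetical `global_rank` and `procs_per_host` standing in for `current_rank().rank` and the size of that last dimension:

```python
# Sketch only, not part of the patch: one mount per host.
def should_mount(global_rank: int, procs_per_host: int) -> bool:
    return global_rank % procs_per_host == 0


# Two hosts with 8 procs each: only global ranks 0 and 8 perform the mount.
assert [r for r in range(16) if should_mount(r, 8)] == [0, 8]
```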
+ last_label = point.extent.labels[-1] + proc_count = point.size(last_label) + if current_rank().rank % proc_count != 0: + # Only use one rank per host to mount the directory + return + self.mount_mnt_directory(mount_dst) + + def mount_mnt_directory(self, mount_dst: str) -> None: + # Sanity check of the mounted directory + sanity_path = os.path.join(mount_dst, "huggingface_models/") + if os.path.exists(sanity_path): + print(f"Found directory {sanity_path}; skip mounting.") + return + + # Otherwise, mount the directory + if not os.path.exists(mount_dst): + os.makedirs(mount_dst, exist_ok=True) + + # Store original LD_LIBRARY_PATH to restore after mounting + original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + + try: + clean_env = os.environ.copy() + if "LD_LIBRARY_PATH" in clean_env: + del clean_env["LD_LIBRARY_PATH"] + + subprocess.run( + [ + "/packages/oil.oilfs/oilfs-wrapper", + "ws://ws.ai.pci0ai/genai_fair_llm", + mount_dst, + ], + capture_output=True, + text=True, + check=True, + env=clean_env, + ) + print("Done mounting") + except subprocess.CalledProcessError as e: + print( + f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" + ) + finally: + # Restore original LD_LIBRARY_PATH + if original_ld_library_path: + os.environ["LD_LIBRARY_PATH"] = original_ld_library_path + elif "LD_LIBRARY_PATH" in os.environ: + del os.environ["LD_LIBRARY_PATH"] + + assert os.path.exists( + sanity_path + ), f"Did not find directory {sanity_path}; something wrong with mounting." + + +class BaseLauncher: + async def initialize(self) -> None: + pass + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + pass + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + pass + + +class Slurmlauncher(BaseLauncher): + def __init__(self, cfg: DictConfig | None = None): + self.cfg = cfg + + async def initialize(self) -> None: + pass + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + appdef = hyperactor.host_mesh( + image="test", meshes=[f"{name}:{num_hosts}:gpu.small"] + ) + for role in appdef.roles: + # Note - this is hardcoded to SLURM + # We got this with sinfo + role.resource.memMB = 2062607 + role.resource.cpu = 128 + role.resource.gpu = 8 + + # TODO - multi scheduler support + server_config = Config( + scheduler="slurm", + appdef=appdef, + workspace=monarch.tools.config.workspace.Workspace(dirs=[""]), + ) + server_info = await commands.get_or_create( + "forge_job", + server_config, + force_restart=False, + ) + alloc = RemoteAllocator( + world_id=name, + initializer=TorchXRemoteAllocInitializer(server_info.server_handle), + ) + server_name = f"slurm:///{server_info.name}" + return alloc, None, server_name # (Allocator, AllocConstraints, SeverName) + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + setup = procs.spawn(f"setup-{uuid.uuid1()}", SetupActor) + return await setup.get_info.choose() + + +class Mastlauncher(BaseLauncher): + def __init__(self, cfg: DictConfig | None = None): + assert cfg is not None + self.cfg = cfg + job_name = cfg.get(JOB_NAME_KEY, None) + self.job_name = job_name or self.create_job_name() + self.default_monarch_port = 26600 + self.scheduler_name = "mast_conda" + + # TODO: enabe taking this from config + self.sku = "gtt_any" + self.timeout_sec = 1 * 60 * 60 # Kill the job if idle for 1 hour + self.user = getpass.getuser() + self.work_dir = f"/data/users/{self.user}" + self.edittable_workspaces = ["forge"] + self.remote_work_dir = 
"/packages/monarch_default_workspace/workspace/" + self.editable_workspace_paths = [ + f"{self.work_dir}/{workspace}" for workspace in self.edittable_workspaces + ] + + async def initialize(self) -> None: + await self.launch_mast_job() + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + allocator = MastAllocator( + MastAllocatorConfig( + job_name=self.job_name, + remote_allocator_port=self.default_monarch_port, + ), + ) + alloc_constraints = AllocConstraints( + {MastAllocator.ALLOC_LABEL_TASK_GROUP: name} + ) + + return allocator, alloc_constraints, self.create_server_handle() + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + await setup.mount.call(mount_dst="/mnt/wsfuse") + return await setup.get_info.choose() + + async def launch_mast_job(self): + handle = self.create_server_handle() + server_spec = info(handle) + if server_spec and server_spec.state == AppState.RUNNING: + print(f"Job {self.job_name} is already running. Skipping launch.") + return server_spec + + config = Config( + scheduler="mast_conda", + scheduler_args={ + "hpcIdentity": "hyper_monarch", + "hpcJobOncall": "monarch", + "hpcClusterUuid": "MastProdCluster", + "rmAttribution": "pytorch4all_clients_approved", + }, + appdef=self.build_appdef(), + workspace=Workspace( + dirs=[workspace_dir for workspace_dir in self.editable_workspace_paths], + ), + ) + + await commands.get_or_create(self.job_name, config) + return server_spec + + def add_additional_packages(self, packages: "Packages") -> "Packages": + packages.add_package("oil.oilfs:stable") + packages.add_package("manifold.manifoldfs") + return packages + + def build_appdef(self) -> specs.AppDef: + + # create the app definition for the worker + remote_end_python_path = ":".join( + [ + f"{self.remote_work_dir}{workspace}" + for workspace in self.editable_workspace_paths + ] + ) + + default_envs = { + **meta_hyperactor.DEFAULT_NVRT_ENVS, + **meta_hyperactor.DEFAULT_NCCL_ENVS, + **meta_hyperactor.DEFAULT_TORCH_ENVS, + **{ + "TORCHX_RUN_PYTHONPATH": f"{remote_end_python_path}:{self.remote_work_dir}" + }, + **{ + "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", + "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + "TORCHINDUCTOR_COMPILE_THREADS": "1", + "TORCH_COMPILE_DISABLE": "1", + "TORCHDYNAMO_VERBOSE": "1", + "VLLM_TORCH_COMPILE_LEVEL": "0", + "VLLM_USE_TRITON_FLASH_ATTN": "0", + }, + } + + print("DEFAULT ENVS: ", default_envs) + + packages = Packages() + meshes = [] + # Process both services and actors configurations + for mesh_name, config in self.cfg["services"].items(): + num_replicas = config["num_replicas"] + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # Create list of mesh names with indices and num_hosts + if with_gpus and num_hosts > 0: + mesh_list = [ + f"{mesh_name}_{i}:{num_hosts}:{self.sku}" + for i in range(num_replicas) + ] + meshes.extend(mesh_list) + + for mesh_name, config in self.cfg["actors"].items(): + num_replicas = 1 + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # single actors with GPUs + if with_gpus: + meshes.append(f"{mesh_name}:{num_replicas}:{self.sku}") + + appdef = meta_hyperactor.host_mesh_conda( + meshes=meshes, + additional_packages=self.add_additional_packages(packages), + timeout_sec=self.timeout_sec, + env=default_envs, + ) + + for role in appdef.roles: + role.resource.capabilities["server_sub_types"] = [ + # 
role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE + role.resource.capabilities["server_sub_types"][1] # GTT + ] + + return appdef + + def create_job_name(self): + return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + + def create_server_handle(self) -> str: + return f"{self.scheduler_name}:///{self.job_name}" + + +def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: + launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + if launcher == Launcher.MAST.value: + return Mastlauncher(cfg) + else: + return Slurmlauncher() diff --git a/src/forge/controller/launcher/__init__.py b/src/forge/controller/launcher/__init__.py deleted file mode 100644 index 2e41cd717..000000000 --- a/src/forge/controller/launcher/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py deleted file mode 100644 index 1aacd87dd..000000000 --- a/src/forge/controller/launcher/mast.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import asyncio -import functools -import getpass -import logging -import os -import socket -import subprocess -import uuid -from typing import Optional - -import torchx.specs as specs -from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints - -from forge.observability.metric_actors import get_or_create_metric_logger - -try: - from monarch._src.actor.actor_mesh import current_rank - from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig - from monarch._src.actor.shape import NDSlice, Shape - from monarch.tools.components.meta import hyperactor - from torchx.specs import AppState - from torchx.specs.fb.component_helpers import Packages -except ImportError as e: - print(f"Warning: Monarch imports failed: {e}") - print("Monarch functionality will be limited") -from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host -from monarch.tools import commands -from monarch.tools.commands import info -from monarch.tools.config import Config, Workspace -from omegaconf import DictConfig - -from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -SCHEDULER_NAME = "mast_conda" -SKU = "gtt_any" -TIMEOUT_SEC = 1 * 60 * 60 # Kill the job if idle for 1 hour - -USER = getpass.getuser() -WORK_DIR = f"/data/users/{USER}" # on DEVGPU -EDITABLE_WORKSPACES = ["forge"] -REMOTE_WORK_DIR = "/packages/monarch_default_workspace/workspace/" - -EDITABLE_WORKSPACE_PATHS = [ - f"{WORK_DIR}/{workspace}" for workspace in EDITABLE_WORKSPACES -] - - -def _get_port() -> str: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) - addr = s.getsockname() - port = addr[1] - return str(port) - - -class MastSetupActor(Actor): - @endpoint - def get_info(self) -> [str, str]: - return socket.gethostname(), _get_port() - - @endpoint - def mount(self, mount_dst: str): - point = current_rank() - # The last dimension is the local proc count. 
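Stepping back from the deleted module: the new src/forge/controller/launcher.py above ends with a get_launcher() factory keyed on the `launcher:` entry of the YAML config. A minimal sketch of that selection, assuming the Launcher enum keeps the mast/slurm/local values introduced for Scheduler earlier in the series and using a plain dict for the config:

```python
# Sketch only, not part of the patch: how the launcher kind is chosen from the config.
from enum import Enum


class Launcher(Enum):  # mirrors forge.types.Launcher (formerly Scheduler)
    MAST = "mast"
    SLURM = "slurm"
    LOCAL = "local"


LAUNCHER_KEY = "launcher"


def pick_launcher_kind(cfg: dict) -> Launcher:
    # get_launcher() falls back to LOCAL when the YAML has no `launcher:` entry
    return Launcher(cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value))


assert pick_launcher_kind({"launcher": "mast"}) is Launcher.MAST
assert pick_launcher_kind({}) is Launcher.LOCAL
```

Note that in the patch itself every non-MAST value currently falls through to the Slurm launcher.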
- last_label = point.extent.labels[-1] - proc_count = point.size(last_label) - if current_rank().rank % proc_count != 0: - # Only use one rank per host to mount the directory - return - self.mount_mnt_directory(mount_dst) - - def mount_mnt_directory(self, mount_dst: str) -> None: - # Sanity check of the mounted directory - sanity_path = os.path.join(mount_dst, "huggingface_models/") - if os.path.exists(sanity_path): - print(f"Found directory {sanity_path}; skip mounting.") - return - - # Otherwise, mount the directory - if not os.path.exists(mount_dst): - os.makedirs(mount_dst, exist_ok=True) - - # Store original LD_LIBRARY_PATH to restore after mounting - original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") - - try: - clean_env = os.environ.copy() - if "LD_LIBRARY_PATH" in clean_env: - del clean_env["LD_LIBRARY_PATH"] - - subprocess.run( - [ - "/packages/oil.oilfs/oilfs-wrapper", - "ws://ws.ai.pci0ai/genai_fair_llm", - mount_dst, - ], - capture_output=True, - text=True, - check=True, - env=clean_env, - ) - print("Done mounting") - except subprocess.CalledProcessError as e: - print( - f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" - ) - finally: - # Restore original LD_LIBRARY_PATH - if original_ld_library_path: - os.environ["LD_LIBRARY_PATH"] = original_ld_library_path - elif "LD_LIBRARY_PATH" in os.environ: - del os.environ["LD_LIBRARY_PATH"] - - assert os.path.exists( - sanity_path - ), f"Did not find directory {sanity_path}; something wrong with mounting." - - -class MastProvisioner(BaseProvisioner): - def __init__(self, cfg: DictConfig | None = None): - self._server_names = [] - self._proc_server_map = {} - self._lock = asyncio.Lock() - self._this_host_id = uuid.uuid1() - available_local_devices = None - cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) - if cuda_visible_devices is not None and cuda_visible_devices.strip(): - try: - available_local_devices = set( - int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip() - ) - except ValueError as e: - raise ValueError( - f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. " - f"Expected comma-separated integers (e.g., '0,1,2'). 
Error: {e}" - ) from e - self._host_gpu_map = { - self._this_host_id: GpuManager(available_local_devices), - } - assert cfg is not None - self.cfg = cfg - job_name = cfg.get(JOB_NAME_KEY, None) - self.job_name = job_name or self.create_job_name() - - async def initialize(self): - """Call this after creating the instance""" - await self.launch_mast_job() - - async def get_mast_allocator( - self, - job_name: str, - task_group: str, - ): - allocator = MastAllocator( - MastAllocatorConfig( - job_name=job_name, - remote_allocator_port=26600, # This is the default monarch port - ), - ) - alloc_constraints = AllocConstraints( - {MastAllocator.ALLOC_LABEL_TASK_GROUP: task_group} - ) - - return allocator, alloc_constraints - - async def create_host_mesh(self, name: str, num_hosts: int): - """Creates a remote server and a HostMesh on it.""" - logger.debug(f"Creating remote server for mesh: {name}") - server_name = f"{SCHEDULER_NAME}:///{self.job_name}" - alloc, alloc_constraints = await self.get_mast_allocator( - task_group=name, job_name=self.job_name - ) - return ( - HostMesh( - shape=Shape(["hosts"], NDSlice.new_row_major([num_hosts])), - allocator=alloc, - alloc_constraints=alloc_constraints, - ), - server_name, - ) - - async def get_proc_mesh( - self, - num_procs: int, - with_gpus: bool = False, - num_hosts: int | None = None, - mesh_name: Optional[str] = None, - ): - """Gets a proc mesh. - - num_hosts = None implies that you want a local allocation, this may change. - - """ - async with self._lock: - server_name = None - if num_hosts is not None and num_hosts > 0: - assert mesh_name is not None - host_mesh, server_name = await self.create_host_mesh( - name=mesh_name, - num_hosts=num_hosts, - ) - host_id = uuid.uuid1() - gpu_manager = GpuManager() - self._host_gpu_map[host_id] = gpu_manager - else: - host_mesh = this_host() - gpu_manager = self._host_gpu_map[self._this_host_id] - host_mesh._host_id = self._this_host_id - - if with_gpus: - - def bootstrap(gpu_ids: list[str]): - # This works for single host, needed for vLLM currently. - import os - - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids) - os.environ["MASTER_ADDR"] = socket.gethostname() - os.environ["MASTER_PORT"] = f"1234{gpu_ids[0]}" - os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" - os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" - - gpu_ids = gpu_manager.get_gpus(num_procs) - procs = host_mesh.spawn_procs( - per_host={"gpus": num_procs}, - bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), - ) - await procs.initialized - setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) - hostname, port = await setup.get_info.choose() - await setup.mount.call(mount_dst="/mnt/wsfuse") - procs._hostname = hostname - procs._port = port - procs._gpu_ids = gpu_ids - else: - procs = host_mesh.spawn_procs(per_host={"gpus": num_procs}) - - procs._host = host_mesh - - # If we created a server, track so we can tear it down later. 
- if server_name: - self._server_names.append(server_name) - self._proc_server_map[procs] = server_name - - _ = await get_or_create_metric_logger(procs) - - return procs - - async def stop_proc_mesh(self, proc_mesh: ProcMesh): - """Stops a proc mesh.""" - async with self._lock: - # Deregister local logger from global logger - if hasattr(proc_mesh, "_local_fetcher"): - global_logger = await get_or_create_metric_logger(proc_mesh) - await global_logger.deregister_fetcher.call_one(proc_mesh) - - if hasattr(proc_mesh, "_gpu_ids"): - gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] - gpu_manager.release_gpus(proc_mesh._gpu_ids) - await proc_mesh.stop() - if proc_mesh in self._proc_server_map: - server_name = self._proc_server_map[proc_mesh] - commands.kill(server_name) - - async def shutdown(self): - """Tears down all remaining remote allocations.""" - async with self._lock: - for server_name in self._server_names: - commands.kill(server_name) - - async def launch_mast_job(self): - handle = self.create_server_handle() - server_spec = info(handle) - if server_spec and server_spec.state == AppState.RUNNING: - print(f"Job {self.job_name} is already running. Skipping launch.") - return server_spec - - config = Config( - scheduler="mast_conda", - scheduler_args={ - # NOTE: TODO: support passing these args from CLI - "hpcIdentity": "hyper_monarch", - # "hpcIdentity": "genai_llm_pretraining_data", - # "hpcIdentity": "pytorch_distributed", - "hpcJobOncall": "monarch", - "hpcClusterUuid": "MastProdCluster", - "rmAttribution": "pytorch4all_clients_approved", - }, - appdef=self.build_appdef(), - workspace=Workspace( - dirs=[workspace_dir for workspace_dir in EDITABLE_WORKSPACE_PATHS], - ), - ) - - await commands.get_or_create(self.job_name, config) - return server_spec - - def add_additional_packages(self, packages: Packages) -> Packages: - packages.add_package("oil.oilfs:stable") - packages.add_package("manifold.manifoldfs") - return packages - - def build_appdef(self) -> specs.AppDef: - - # create the app definition for the worker - REMOTE_END_PYTHONPATH = ":".join( - [f"{REMOTE_WORK_DIR}{workspace}" for workspace in EDITABLE_WORKSPACE_PATHS] - ) - - default_envs = { - **hyperactor.DEFAULT_NVRT_ENVS, - **hyperactor.DEFAULT_NCCL_ENVS, - **hyperactor.DEFAULT_TORCH_ENVS, - **{"TORCHX_RUN_PYTHONPATH": f"{REMOTE_END_PYTHONPATH}:{REMOTE_WORK_DIR}"}, - **{ - "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", - "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", - "TORCHINDUCTOR_COMPILE_THREADS": "1", - "TORCH_COMPILE_DISABLE": "1", - "TORCHDYNAMO_VERBOSE": "1", - "VLLM_TORCH_COMPILE_LEVEL": "0", - "VLLM_USE_TRITON_FLASH_ATTN": "0", - }, - } - - print("DEFAULT ENVS: ", default_envs) - - packages = Packages() - meshes = [] - # Process both services and actors configurations - for mesh_name, config in self.cfg["services"].items(): - num_replicas = config["num_replicas"] - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) - # Create list of mesh names with indices and num_hosts - if with_gpus and num_hosts > 0: - mesh_list = [ - f"{mesh_name}_{i}:{num_hosts}:{SKU}" for i in range(num_replicas) - ] - meshes.extend(mesh_list) - - for mesh_name, config in self.cfg["actors"].items(): - num_replicas = 1 - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) - # single actors with GPUs - if with_gpus: - meshes.append(f"{mesh_name}:{num_replicas}:{SKU}") - - appdef = hyperactor.host_mesh_conda( - meshes=meshes, - 
additional_packages=self.add_additional_packages(packages), - timeout_sec=TIMEOUT_SEC, - env=default_envs, - ) - - for role in appdef.roles: - role.resource.capabilities["server_sub_types"] = [ - # role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE - role.resource.capabilities["server_sub_types"][1] # GTT - ] - - return appdef - - def create_job_name(self): - return f"{USER}-forge-{uuid.uuid4().hex[:6]}" - - def create_server_handle(self) -> str: - return f"{SCHEDULER_NAME}:///{self.job_name}" diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index f4344909d..dd3d05efd 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -12,43 +12,22 @@ import os import socket import uuid -from abc import ABC, abstractmethod from typing import Optional -import monarch -from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host +from monarch.actor import HostMesh, ProcMesh, this_host from monarch.tools import commands -from monarch.tools.components import hyperactor -from monarch.tools.config import Config - from omegaconf import DictConfig +from forge.controller.launcher import BaseLauncher, get_launcher + from forge.observability.metric_actors import get_or_create_metric_logger -from forge.types import ProcessConfig, Scheduler +from forge.types import ProcessConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -JOB_NAME_KEY = "job_name" -SCHEDULER_KEY = "scheduler" - - -def _get_port() -> str: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) - addr = s.getsockname() - port = addr[1] - return str(port) - - -class _SetupActor(Actor): - @endpoint - def get_info(self) -> [str, str]: - return socket.gethostname(), _get_port() - class GpuManager: """Tracks and assigns GPU devices on a host. @@ -83,57 +62,10 @@ def release_gpus(self, gpu_ids: list[str]) -> None: self.available_gpus.add(int(gpu_id)) -class BaseProvisioner(ABC): - """Abstract base class for resource provisioners.""" - - @abstractmethod - async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: - """Creates a remote server and a HostMesh on it. - Args: - name: Name identifier for the host mesh - num_hosts: Number of hosts to create - Returns: - HostMesh: The created host mesh - """ - pass - - @abstractmethod - async def get_proc_mesh( - self, - num_procs: int, - with_gpus: bool = False, - num_hosts: Optional[int] = None, - mesh_name: Optional[str] = None, - ) -> ProcMesh: - """Gets a proc mesh. - Args: - num_procs: Number of processes needed - with_gpus: Whether GPU support is required - num_hosts: Number of hosts (None implies local allocation) - mesh_name: Name identifier for the proc mesh - Returns: - ProcMesh: The allocated process mesh - """ - pass - - @abstractmethod - async def stop_proc_mesh(self, proc_mesh: ProcMesh) -> None: - """Stops a proc mesh. 
- Args: - proc_mesh: The process mesh to stop - """ - pass - - @abstractmethod - async def shutdown(self) -> None: - """Tears down all remaining remote allocations.""" - pass - - -class Provisioner(BaseProvisioner): +class Provisioner: """A global resource provisioner.""" - def __init__(self): + def __init__(self, cfg: DictConfig | None = None): self._server_names = [] self._proc_server_map = {} self._lock = asyncio.Lock() @@ -162,39 +94,25 @@ def __init__(self): self._host_gpu_map = { self._this_host_id: GpuManager(available_local_devices), } + self.launcher: BaseLauncher = get_launcher(cfg) + + async def initialize(self): + """Call this after creating the instance""" + await self.launcher.initialize() async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" # no need to lock here because this is already locked behind `get_proc_mesh` logger.debug(f"Creating remote server for alloc {name}") - appdef = hyperactor.host_mesh( - image="test", meshes=[f"{name}:{num_hosts}:gpu.small"] - ) - for role in appdef.roles: - # Note - this is hardcoded to SLURM - # We got this with sinfo - role.resource.memMB = 2062607 - role.resource.cpu = 128 - role.resource.gpu = 8 - - # TODO - multi scheduler support - server_config = Config( - scheduler="slurm", - appdef=appdef, - workspace=monarch.tools.config.workspace.Workspace(dirs=[""]), - ) - server_info = await commands.get_or_create( - "forge_job", - server_config, - force_restart=False, - ) - alloc = RemoteAllocator( - world_id=name, - initializer=TorchXRemoteAllocInitializer(server_info.server_handle), + alloc, alloc_constraints, server_name = await self.launcher.get_allocator( + name, num_hosts ) - server_name = f"slurm:///{server_info.name}" return ( - HostMesh(Shape(["hosts"], NDSlice.new_row_major([num_hosts])), alloc), + HostMesh( + Shape(["hosts"], NDSlice.new_row_major([num_hosts])), + allocator=alloc, + alloc_constraints=alloc_constraints, + ), server_name, ) @@ -215,7 +133,7 @@ async def get_proc_mesh( if num_hosts is not None and num_hosts > 0: created_hosts = len(self._server_names) host_mesh, server_name = await self.create_host_mesh( - name=f"alloc-{created_hosts}", + name=mesh_name, num_hosts=num_hosts, ) host_id = uuid.uuid1() @@ -257,11 +175,10 @@ def bootstrap(gpu_ids: list[str]): per_host={"gpus": num_procs}, bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), ) - setup = procs.spawn(f"setup-{uuid.uuid1()}", _SetupActor) # Pick a random host/port, we'll feed this in afterwards # Once we have true HostMesh support, we can do this on proc 0 of each host # then spin up the proc meshes with the environment afterwards. 
- hostname, port = await setup.get_info.choose() + hostname, port = await self.launcher.remote_setup(procs) procs._hostname = hostname procs._port = port procs._gpu_ids = gpu_ids @@ -303,34 +220,25 @@ async def shutdown(self): commands.kill(server_name) -_provisioner: BaseProvisioner | None = None +_provisioner: Provisioner | None = None async def init_provisioner(cfg: DictConfig | None = None): global _provisioner if not _provisioner: - scheduler = Scheduler.LOCAL - if cfg is not None: - scheduler = cfg.get(SCHEDULER_KEY, Scheduler.LOCAL.value) - if scheduler == Scheduler.MAST.value: - from forge.controller.launcher.mast import MastProvisioner - - _provisioner = MastProvisioner(cfg=cfg) - await _provisioner.initialize() - else: - _provisioner = Provisioner() + _provisioner = Provisioner(cfg) + await _provisioner.initialize() return _provisioner -async def _get_provisioner(): +def _get_provisioner(): if not _provisioner: - await init_provisioner() + raise RuntimeError("Provisioner not initialized") return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - provisioner = await _get_provisioner() - return await provisioner.get_proc_mesh( + return await _get_provisioner().get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, @@ -339,11 +247,9 @@ async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: async def stop_proc_mesh(proc_mesh: ProcMesh): - provisioner = await _get_provisioner() - return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) + return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - provisioner = await _get_provisioner() - return await provisioner.shutdown() + await _get_provisioner().shutdown() diff --git a/src/forge/types.py b/src/forge/types.py index 271797d95..16585922d 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -88,7 +88,7 @@ class State: metadata: dict[str, Any] = field(default_factory=dict) -class Scheduler(Enum): +class Launcher(Enum): MAST = "mast" SLURM = "slurm" LOCAL = "local" From 9d41973497496f70a47cf3ddfa8dfca8799653ef Mon Sep 17 00:00:00 2001 From: rithesh Date: Thu, 2 Oct 2025 17:25:03 -0700 Subject: [PATCH 15/17] unit test issues --- src/forge/controller/launcher.py | 9 ++++++--- src/forge/controller/provisioner.py | 23 +++++++++++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 2db56b2ee..5370b495a 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -15,6 +15,8 @@ import torchx.specs as specs +from forge.types import Launcher + from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch.actor import Actor, endpoint, ProcMesh @@ -24,8 +26,6 @@ from monarch.tools.config import Config, Workspace from omegaconf import DictConfig -from forge.types import Launcher - try: from monarch._src.actor.actor_mesh import current_rank from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig @@ -311,7 +311,10 @@ def create_server_handle(self) -> str: def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: - launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + if cfg is not None: + launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + else: + launcher = Launcher.LOCAL.value if launcher == Launcher.MAST.value: return 
Mastlauncher(cfg) else: diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index dd3d05efd..8dea22d28 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -14,17 +14,17 @@ import uuid from typing import Optional -from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import HostMesh, ProcMesh, this_host -from monarch.tools import commands -from omegaconf import DictConfig - from forge.controller.launcher import BaseLauncher, get_launcher from forge.observability.metric_actors import get_or_create_metric_logger from forge.types import ProcessConfig +from monarch._src.actor.shape import NDSlice, Shape +from monarch.actor import HostMesh, ProcMesh, this_host +from monarch.tools import commands +from omegaconf import DictConfig + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -231,14 +231,15 @@ async def init_provisioner(cfg: DictConfig | None = None): return _provisioner -def _get_provisioner(): +async def _get_provisioner(): if not _provisioner: - raise RuntimeError("Provisioner not initialized") + await init_provisioner() return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - return await _get_provisioner().get_proc_mesh( + provisioner = await _get_provisioner() + return await provisioner.get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, @@ -247,9 +248,11 @@ async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: async def stop_proc_mesh(proc_mesh: ProcMesh): - return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) + provisioner = await _get_provisioner() + return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - await _get_provisioner().shutdown() + provisioner = await _get_provisioner() + return await provisioner.shutdown() From 6dbc5eedb3ab4d9625f4a5a3c7e9acdd837b4230 Mon Sep 17 00:00:00 2001 From: rithesh Date: Fri, 3 Oct 2025 11:23:57 -0700 Subject: [PATCH 16/17] suggested changes --- apps/grpo/main.py | 21 +++++++++++++-- apps/mast/main.py | 19 +++++++++++-- src/forge/controller/launcher.py | 42 ++++++++++++++--------------- src/forge/controller/provisioner.py | 24 ++++++++++------- src/forge/types.py | 18 ++++++++++++- 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 138e406b0..2439100d9 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -26,12 +26,20 @@ from forge.actors.trainer import RLTrainer from forge.cli.config import parse from forge.controller.actor import ForgeActor - +from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY from forge.controller.provisioner import init_provisioner, shutdown from forge.data.rewards import MathReward, ThinkingReward from forge.observability.metric_actors import get_or_create_metric_logger from forge.observability.metrics import record_metric, Reduce from forge.observability.perf_tracker import Tracer + +from forge.types import ( + Launcher, + LauncherConfig, + ProcessConfig, + ProvisionerConfig, + ServiceConfig, +) from forge.util.ops import compute_logprobs from monarch.actor import endpoint from omegaconf import DictConfig @@ -313,7 +321,16 @@ async def main(cfg: DictConfig): max_res_tokens = cfg.max_res_tokens # init provisioner - await init_provisioner(cfg) + await init_provisioner( + ProvisionerConfig( + launcher_config=LauncherConfig( + launcher=Launcher(cfg.get(LAUNCHER_KEY, 
Launcher.SLURM.value)), + job_name=cfg.get(JOB_NAME_KEY, None), + services={k: ServiceConfig(**v) for k, v in cfg.services.items()}, + actors={k: ProcessConfig(**v) for k, v in cfg.actors.items()}, + ) + ) + ) # initialize before spawning services metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) diff --git a/apps/mast/main.py b/apps/mast/main.py index 9627bcc24..cd5de0be9 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -13,7 +13,13 @@ from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY from forge.controller.provisioner import init_provisioner -from forge.types import Launcher +from forge.types import ( + Launcher, + LauncherConfig, + ProcessConfig, + ProvisionerConfig, + ServiceConfig, +) from omegaconf import DictConfig DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" @@ -39,7 +45,16 @@ async def main(cfg: DictConfig): print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner - await init_provisioner(cfg) + await init_provisioner( + ProvisionerConfig( + launcher_config=LauncherConfig( + launcher=Launcher(cfg.get(LAUNCHER_KEY, Launcher.MAST.value)), + job_name=cfg.get(JOB_NAME_KEY, None), + services={k: ServiceConfig(**v) for k, v in cfg.services.items()}, + actors={k: ProcessConfig(**v) for k, v in cfg.actors.items()}, + ) + ) + ) await grpo_main(cfg) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 5370b495a..00493e889 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -15,8 +15,6 @@ import torchx.specs as specs -from forge.types import Launcher - from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch.actor import Actor, endpoint, ProcMesh @@ -24,7 +22,8 @@ from monarch.tools.commands import info from monarch.tools.components import hyperactor from monarch.tools.config import Config, Workspace -from omegaconf import DictConfig + +from forge.types import Launcher, LauncherConfig try: from monarch._src.actor.actor_mesh import current_rank @@ -125,7 +124,7 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Slurmlauncher(BaseLauncher): - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: LauncherConfig | None = None): self.cfg = cfg async def initialize(self) -> None: @@ -166,11 +165,9 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Mastlauncher(BaseLauncher): - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: LauncherConfig | None = None): assert cfg is not None self.cfg = cfg - job_name = cfg.get(JOB_NAME_KEY, None) - self.job_name = job_name or self.create_job_name() self.default_monarch_port = 26600 self.scheduler_name = "mast_conda" @@ -184,6 +181,7 @@ def __init__(self, cfg: DictConfig | None = None): self.editable_workspace_paths = [ f"{self.work_dir}/{workspace}" for workspace in self.edittable_workspaces ] + self.job_name = self.cfg.job_name or self.create_job_name() async def initialize(self) -> None: await self.launch_mast_job() @@ -268,10 +266,10 @@ def build_appdef(self) -> specs.AppDef: packages = Packages() meshes = [] # Process both services and actors configurations - for mesh_name, config in self.cfg["services"].items(): - num_replicas = config["num_replicas"] - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) + for mesh_name, service in 
self.cfg.services.items(): + num_replicas = service.num_replicas + with_gpus = bool(service.with_gpus) + num_hosts = int(service.hosts or 0) # Create list of mesh names with indices and num_hosts if with_gpus and num_hosts > 0: mesh_list = [ @@ -280,10 +278,10 @@ def build_appdef(self) -> specs.AppDef: ] meshes.extend(mesh_list) - for mesh_name, config in self.cfg["actors"].items(): + for mesh_name, actor in self.cfg.actors.items(): num_replicas = 1 - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) + with_gpus = bool(actor.with_gpus) + num_hosts = int(actor.hosts or 0) # single actors with GPUs if with_gpus: meshes.append(f"{mesh_name}:{num_replicas}:{self.sku}") @@ -304,18 +302,18 @@ def build_appdef(self) -> specs.AppDef: return appdef def create_job_name(self): - return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + return f"{self.user}-forge-{uuid.uuid4().hex[:6]}" def create_server_handle(self) -> str: return f"{self.scheduler_name}:///{self.job_name}" -def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: - if cfg is not None: - launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) - else: - launcher = Launcher.LOCAL.value - if launcher == Launcher.MAST.value: +def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None: + if not cfg: + return None + if cfg.launcher == Launcher.MAST: return Mastlauncher(cfg) + elif cfg.launcher == Launcher.SLURM: + return Slurmlauncher(cfg) else: - return Slurmlauncher() + raise ValueError(f"Unsupported config provided, got {cfg}") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 8dea22d28..7d55b1c44 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -14,16 +14,15 @@ import uuid from typing import Optional +from monarch._src.actor.shape import NDSlice, Shape +from monarch.actor import HostMesh, ProcMesh, this_host +from monarch.tools import commands + from forge.controller.launcher import BaseLauncher, get_launcher from forge.observability.metric_actors import get_or_create_metric_logger -from forge.types import ProcessConfig - -from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import HostMesh, ProcMesh, this_host -from monarch.tools import commands -from omegaconf import DictConfig +from forge.types import ProcessConfig, ProvisionerConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -65,7 +64,7 @@ def release_gpus(self, gpu_ids: list[str]) -> None: class Provisioner: """A global resource provisioner.""" - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: ProvisionerConfig | None = None): self._server_names = [] self._proc_server_map = {} self._lock = asyncio.Lock() @@ -94,11 +93,16 @@ def __init__(self, cfg: DictConfig | None = None): self._host_gpu_map = { self._this_host_id: GpuManager(available_local_devices), } - self.launcher: BaseLauncher = get_launcher(cfg) + self.launcher: BaseLauncher | None = get_launcher( + cfg.launcher_config if cfg is not None else None + ) + if not self.launcher: + logger.warning("Launcher not provided, remote allocations will not work.") async def initialize(self): """Call this after creating the instance""" - await self.launcher.initialize() + if self.launcher is not None: + await self.launcher.initialize() async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" @@ -223,7 +227,7 @@ async def shutdown(self): _provisioner: Provisioner | 
None = None -async def init_provisioner(cfg: DictConfig | None = None): +async def init_provisioner(cfg: ProvisionerConfig | None = None): global _provisioner if not _provisioner: _provisioner = Provisioner(cfg) diff --git a/src/forge/types.py b/src/forge/types.py index 16585922d..f79e3ef2c 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -91,7 +91,6 @@ class State: class Launcher(Enum): MAST = "mast" SLURM = "slurm" - LOCAL = "local" @dataclass @@ -141,3 +140,20 @@ def to_process_config(self) -> ProcessConfig: Scalar = Union[int, float] + + +@dataclass +class LauncherConfig: + """A launcher config for the scheduler.""" + + launcher: Launcher + job_name: str + services: dict[str, ServiceConfig] + actors: dict[str, ProcessConfig] + + +@dataclass +class ProvisionerConfig: + """A config for the forge provisioner.""" + + launcher_config: LauncherConfig From d313c59269d2c2b79014f92136257e82b82193d0 Mon Sep 17 00:00:00 2001 From: rithesh Date: Fri, 3 Oct 2025 11:51:12 -0700 Subject: [PATCH 17/17] failing tests --- src/forge/controller/launcher.py | 11 +++-------- src/forge/controller/provisioner.py | 5 +++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 00493e889..cd54c00b0 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -124,9 +124,6 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Slurmlauncher(BaseLauncher): - def __init__(self, cfg: LauncherConfig | None = None): - self.cfg = cfg - async def initialize(self) -> None: pass @@ -309,11 +306,9 @@ def create_server_handle(self) -> str: def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None: - if not cfg: - return None - if cfg.launcher == Launcher.MAST: + if not cfg or cfg.launcher == Launcher.SLURM: + return Slurmlauncher() + elif cfg.launcher == Launcher.MAST: return Mastlauncher(cfg) - elif cfg.launcher == Launcher.SLURM: - return Slurmlauncher(cfg) else: raise ValueError(f"Unsupported config provided, got {cfg}") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 7d55b1c44..d66504707 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -107,6 +107,11 @@ async def initialize(self): async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" # no need to lock here because this is already locked behind `get_proc_mesh` + if not self.launcher: + raise RuntimeError( + "You tried to create a remote allocation by specifying the number of hosts on an actor or service, " + "but no launcher was specified." + ) logger.debug(f"Creating remote server for alloc {name}") alloc, alloc_constraints, server_name = await self.launcher.get_allocator( name, num_hosts