From 6aebb0989df46010bbafddbff23a47d5cdd9b2fc Mon Sep 17 00:00:00 2001 From: rithesh Date: Mon, 29 Sep 2025 15:03:03 -0700 Subject: [PATCH 01/17] inital changes --- apps/grpo/__init__.py | 5 + apps/mast/__init__.py | 0 apps/mast/env_setup.sh | 309 +++++++++++++++++++ apps/mast/main.py | 38 +++ apps/mast/qwen3_14b_mast.yaml | 153 ++++++++++ apps/mast/qwen3_1_7b_mast.yaml | 129 ++++++++ apps/mast/qwen3_32b_mast.yaml | 153 ++++++++++ apps/mast/qwen3_4b_mast.yaml | 152 ++++++++++ apps/mast/qwen3_8b_mast.yaml | 152 ++++++++++ src/forge/actors/policy.py | 35 +-- src/forge/controller/actor.py | 8 +- src/forge/controller/launcher/__init__.py | 5 + src/forge/controller/launcher/mast.py | 350 ++++++++++++++++++++++ src/forge/controller/provisioner.py | 102 ++++++- src/forge/controller/service/replica.py | 5 +- src/forge/types.py | 10 + 16 files changed, 1574 insertions(+), 32 deletions(-) create mode 100644 apps/grpo/__init__.py create mode 100644 apps/mast/__init__.py create mode 100755 apps/mast/env_setup.sh create mode 100644 apps/mast/main.py create mode 100644 apps/mast/qwen3_14b_mast.yaml create mode 100644 apps/mast/qwen3_1_7b_mast.yaml create mode 100644 apps/mast/qwen3_32b_mast.yaml create mode 100644 apps/mast/qwen3_4b_mast.yaml create mode 100644 apps/mast/qwen3_8b_mast.yaml create mode 100644 src/forge/controller/launcher/__init__.py create mode 100644 src/forge/controller/launcher/mast.py diff --git a/apps/grpo/__init__.py b/apps/grpo/__init__.py new file mode 100644 index 000000000..2e41cd717 --- /dev/null +++ b/apps/grpo/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/apps/mast/__init__.py b/apps/mast/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh new file mode 100755 index 000000000..6728c87db --- /dev/null +++ b/apps/mast/env_setup.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# setup_forge_env.sh - Setup conda environment and install forge with mounting +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to mount a single workspace to /mnt/wsfuse +mount_workspace() { + local workspace_url="$1" + local mount_dir="/mnt/wsfuse" + + if [ -z "$workspace_url" ]; then + log_error "No workspace URL provided for mounting" + return 1 + fi + + log_info "Setting up mount directory: $mount_dir" + + # Create the directory if it doesn't exist + if [ ! -d "$mount_dir" ]; then + log_info "Creating mount directory: $mount_dir" + sudo mkdir -p "$mount_dir" || { + log_error "Failed to create mount directory (may need sudo privileges)" + return 1 + } + fi + + # Check if the directory is already mounted + if mountpoint -q "$mount_dir" 2>/dev/null; then + log_warn "Directory $mount_dir is already mounted, skipping mount" + return 0 + fi + + # Check if oilfs command exists + if ! 
command -v oilfs >/dev/null 2>&1; then + log_error "oilfs command not found. Please ensure it's installed and in PATH" + return 1 + fi + + log_info "Mounting workspace $workspace_url to $mount_dir" + + # Store original LD_LIBRARY_PATH to restore after mounting (similar to Python code) + original_ld_library_path="${LD_LIBRARY_PATH:-}" + + # Temporarily unset LD_LIBRARY_PATH for mounting + unset LD_LIBRARY_PATH + + # Mount the workspace + if oilfs "$workspace_url" "$mount_dir"; then + log_info "Successfully mounted $workspace_url to $mount_dir" + else + log_error "Failed to mount $workspace_url to $mount_dir" + # Restore original LD_LIBRARY_PATH + if [ -n "$original_ld_library_path" ]; then + export LD_LIBRARY_PATH="$original_ld_library_path" + fi + return 1 + fi + + # Restore original LD_LIBRARY_PATH + if [ -n "$original_ld_library_path" ]; then + export LD_LIBRARY_PATH="$original_ld_library_path" + fi + + # Verify mount was successful + if [ -d "$mount_dir/huggingface_models" ]; then + log_info "Mount verification successful - found expected directory structure" + else + log_warn "Mount verification: Expected directory structure not found, but mount appears successful" + fi + + return 0 +} + +# Function to safely deactivate conda +safe_conda_deactivate() { + if command -v conda >/dev/null 2>&1; then + if conda info --envs >/dev/null 2>&1; then + conda deactivate 2>/dev/null || log_warn "Could not deactivate conda (might not be in an environment)" + else + log_warn "Conda not properly initialized, skipping deactivate" + fi + else + log_warn "Conda command not found, skipping deactivate" + fi +} + +# Function to safely activate conda environment +safe_conda_activate() { + local env_name="$1" + + if command -v conda >/dev/null 2>&1; then + if conda info --envs >/dev/null 2>&1; then + conda activate "$env_name" + else + log_warn "Conda not properly initialized" + log_info "Attempting to use xl_conda.sh activation instead..." + source "$CONDA_SCRIPT_PATH" activate "$env_name" + fi + else + log_warn "Conda command not found" + log_info "Attempting to use xl_conda.sh activation instead..." + source "$CONDA_SCRIPT_PATH" activate "$env_name" + fi +} + +# Check if required environment variables are set +if [ -z "$USER" ]; then + log_error "USER environment variable is not set" + exit 1 +fi + +# Define paths +FBSOURCE_PATH="/data/users/$USER/fbsource" +CONDA_SCRIPT_PATH="$FBSOURCE_PATH/genai/xlformers/dev/xl_conda.sh" +FORGE_BASE_DIR="/data/users/$USER" +FORGE_REPO_DIR="$FORGE_BASE_DIR/forge" +MONARCH_DIR="$HOME/monarch_no_torch_latest" + +# Workspace URL for mounting +WORKSPACE_URL="ws://ws.ai.pci0ai/genai_fair_llm" + +log_info "Starting forge environment setup for user: $USER" + +# Step 1: Mount workspace (do this early in case other steps need the mounted files) +log_info "Step 1: Mounting workspace..." +mount_workspace "$WORKSPACE_URL" +if [ $? -ne 0 ]; then + log_warn "Failed to mount workspace, continuing with setup..." + log_warn "Some functionality may not be available without the mounted workspace" +fi + +# Step 2: Check if conda script exists and source it +log_info "Step 2: Activating conda environment..." +if [ ! -f "$CONDA_SCRIPT_PATH" ]; then + log_error "Conda script not found at: $CONDA_SCRIPT_PATH" + log_error "Please ensure fbsource is properly set up" + exit 1 +fi + +log_info "Sourcing conda script: $CONDA_SCRIPT_PATH" +source "$CONDA_SCRIPT_PATH" activate forge:8448524 + +if [ $? 
-ne 0 ]; then + log_error "Failed to activate conda environment forge-8448524" + exit 1 +fi + +log_info "Conda environment activated successfully" + +# Step 3: Create and navigate to forge base directory +log_info "Step 3: Setting up forge directory..." +if [ ! -d "$FORGE_BASE_DIR" ]; then + log_info "Creating forge base directory: $FORGE_BASE_DIR" + mkdir -p "$FORGE_BASE_DIR" +fi + +cd "$FORGE_BASE_DIR" +log_info "Changed to directory: $(pwd)" + +# Step 4: Clone or update forge repository +log_info "Step 4: Setting up forge git repository..." +if [ -d "$FORGE_REPO_DIR" ]; then + log_warn "Forge repository already exists at: $FORGE_REPO_DIR" + cd "$FORGE_REPO_DIR" + + if [ -d ".git" ]; then + log_info "Updating existing repository..." + git fetch origin + if [ $? -eq 0 ]; then + log_info "Repository updated successfully" + else + log_warn "Failed to fetch updates, continuing with existing code" + fi + else + log_error "Directory exists but is not a git repository" + log_info "Removing directory and cloning fresh..." + cd "$FORGE_BASE_DIR" + rm -rf "$FORGE_REPO_DIR" + git clone git@github.com:meta-pytorch/forge.git + if [ $? -ne 0 ]; then + log_error "Failed to clone forge repository" + exit 1 + fi + cd "$FORGE_REPO_DIR" + fi +else + log_info "Cloning forge repository..." + git clone git@github.com:meta-pytorch/forge.git + if [ $? -ne 0 ]; then + log_error "Failed to clone forge repository" + log_error "Please ensure:" + log_error "1. You have SSH access to github.com" + log_error "2. Your SSH key is added to GitHub" + log_error "3. You have access to meta-pytorch/forge repository" + exit 1 + fi + cd "$FORGE_REPO_DIR" +fi + +log_info "Current directory: $(pwd)" + +# Step 5: Install forge package +log_info "Step 5: Installing forge package..." +pip install --no-deps --force-reinstall . +if [ $? -ne 0 ]; then + log_error "Failed to install forge package" + exit 1 +fi +log_info "Forge package installed successfully" + +# Step 6: Navigate to monarch directory +log_info "Step 6: Setting up monarch directory..." +if [ ! -d "$MONARCH_DIR" ]; then + log_info "Creating monarch directory: $MONARCH_DIR" + mkdir -p "$MONARCH_DIR" +fi + +cd "$MONARCH_DIR" +log_info "Changed to directory: $(pwd)" + +# Step 7: Fetch monarch package +log_info "Step 7: Fetching monarch package..." +# TODO: Remove hardcodedm version +fbpkg fetch monarch_no_torch:23 +if [ $? -ne 0 ]; then + log_error "Failed to fetch monarch_no_torch:23" + log_error "Please ensure fbpkg is properly configured" + exit 1 +fi +log_info "Monarch package fetched successfully" + +# Step 8: Install monarch wheel +log_info "Step 8: Installing monarch wheel..." +WHEEL_FILE="monarch-0.0.0-py3.10-none-any.whl" +if [ ! -f "$WHEEL_FILE" ]; then + log_error "Wheel file not found: $WHEEL_FILE" + log_error "Available files in directory:" + ls -la *.whl 2>/dev/null || log_error "No wheel files found" + exit 1 +fi + +pip install --force-reinstall "$WHEEL_FILE" +if [ $? -ne 0 ]; then + log_error "Failed to install monarch wheel" + exit 1 +fi +log_info "Monarch wheel installed successfully" + +log_info "Environment activation completed" + +# Final verification +log_info "Setup completed successfully!" 
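+
+# NOTE: the checks below are informational only; failures are logged as
+# warnings and do not abort the setup.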
+ +# Check mount status +if mountpoint -q "/mnt/wsfuse" 2>/dev/null; then + log_info "Workspace mount: ✓ Active at /mnt/wsfuse" +else + log_warn "Workspace mount: ✗ Not mounted" +fi + +# Check current environment +if command -v conda >/dev/null 2>&1 && conda info --envs >/dev/null 2>&1; then + CURRENT_ENV=$(conda info --show-active-prefix 2>/dev/null | sed 's/.*\///' || echo "unknown") + log_info "Current conda environment: $CURRENT_ENV" +else + log_info "Current environment: Using xl_conda.sh managed environment" +fi + +log_info "Current directory: $(pwd)" +log_info "Python location: $(which python)" + +# Show installed packages +log_info "Key installed packages:" +pip list | grep -E "(forge|monarch)" || log_warn "No forge/monarch packages found in pip list" + +log_info "Environment setup complete! You can now run your scripts." +log_info "Mounted workspace available at: /mnt/wsfuse" + +# Step 9: Ask user to deactivate and activate conda env conda environment +echo "" +log_info "Installation completed successfully!" +echo "" +log_info "Re-activate the conda environment to make the changes take effect:" +log_info "conda deactivate && conda activate forge-8448524" diff --git a/apps/mast/main.py b/apps/mast/main.py new file mode 100644 index 000000000..fd1819ed2 --- /dev/null +++ b/apps/mast/main.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import asyncio +import getpass + +from apps.grpo.main import main as grpo_main +from forge.cli.config import parse +from forge.controller.provisioner import init_provisioner, JOB_NAME_KEY, SCHEDULER_KEY + +from forge.types import Scheduler +from omegaconf import DictConfig + + +async def main(cfg: DictConfig): + """Main module for launching mast jobs for GRPO training.""" + if cfg.get(SCHEDULER_KEY, Scheduler.MAST.value) != Scheduler.MAST.value: + raise ValueError("Schuduler must be MAST.") + + if cfg.get(JOB_NAME_KEY, None) is not None: + # prepend user name to the job to avoid name collision + cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" + + # init mast provisioner + await init_provisioner(cfg) + await grpo_main(cfg) + + +if __name__ == "__main__": + + @parse + def _main(cfg): + asyncio.run(main(cfg)) + + _main() # @parse grabs the cfg from CLI diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml new file mode 100644 index 000000000..2429077fc --- /dev/null +++ b/apps/mast/qwen3_14b_mast.yaml @@ -0,0 +1,153 @@ + +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-14B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-14B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. 
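+    # (Disabling the custom all-reduce makes vLLM fall back to NCCL for
+    # tensor-parallel all-reduce.)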
+ disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 14B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 14B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 14 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 14 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml new file mode 100644 index 000000000..21a58df0b --- /dev/null +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -0,0 +1,129 @@ +# Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-1.7B" +off_by_n: 1 # Off by one by default + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + 
streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: ${model} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + enforce_eager: false + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 1.7B + hf_assets_path: hf://${model} + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: false + initial_load_path: hf://${model} + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + activation_checkpoint: + mode: selective + selective_ac_option: op + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 1.7B + hf_assets_path: hf://${model} + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + enable: true + initial_load_path: hf://${model} + initial_load_in_hf: true + +# All resource allocations +services: + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + ref_model: + procs: 2 + num_replicas: 1 + with_gpus: true + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + +actors: + dataset: + procs: 1 + with_gpus: false + trainer: + procs: 1 + with_gpus: true + replay_buffer: + procs: 1 + with_gpus: false + compute_advantages: + procs: 1 + with_gpus: false diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml new file mode 100644 index 000000000..3c77adfa3 --- /dev/null +++ b/apps/mast/qwen3_32b_mast.yaml @@ -0,0 +1,153 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-32B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-32B +checkpoint_folder: /mnt/wsfuse/$user$/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. 
+ disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 32B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 32B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + # procs: ${trainer.parallelism.data_parallel_shard_degree} + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml new file mode 100644 index 000000000..1690494e8 --- /dev/null +++ b/apps/mast/qwen3_4b_mast.yaml @@ -0,0 +1,152 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-4B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-4B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset 
configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. + disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0 + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 4B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 4B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml new file mode 100644 index 000000000..d9ed947ff 
--- /dev/null +++ b/apps/mast/qwen3_8b_mast.yaml @@ -0,0 +1,152 @@ +# Grouped Relative Policy Optimization (GRPO) + +# Global configuration +group_size: 8 +batch_size: 16 +max_req_tokens: 512 +max_res_tokens: 512 +model: "Qwen/Qwen3-8B" +off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen-8B +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 + + +# Dataset configuration +dataset: + path: "openai/gsm8k" + revision: "main" + data_split: "train" + streaming: true + model: ${model} + +# Policy configuration +policy: + engine_config: + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # need to revisit. + disable_custom_all_reduce: true + sampling_config: + n: ${group_size} + max_tokens: ${max_res_tokens} + temperature: 1.0 + top_p: 1.0= + checkpoint_path: ${checkpoint_folder} + +# Trainer configuration +trainer: + model: + name: qwen3 + flavor: 8B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + lr_scheduler: + warmup_steps: 1 + training: + local_batch_size: ${batch_size} + seq_len: 2048 + max_norm: 1.0 + steps: 1000000 + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 4 + tensor_parallel_degree: 2 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: true + checkpoint: + enable: true + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + folder: ${checkpoint_folder} + activation_checkpoint: + mode: selective + selective_ac_option: op + comm: + # TODO: revisit this. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in PCI + init_timeout_seconds: 3600 + +# Replay buffer configuration +replay_buffer: + batch_size: ${batch_size} + max_policy_age: ${off_by_n} + dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree + +# Reference model configuration +ref_model: + model: + name: qwen3 + flavor: 8B + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + training: + dtype: bfloat16 + gc_freq: 1 + compile: + enable: false + parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: 1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + checkpoint: + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model + initial_load_in_hf: true + +# All resource allocations +services: + dataset: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: dataset + policy: + procs: ${policy.engine_config.tensor_parallel_size} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: policy + trainer: + procs: 8 + num_replicas: 1 + with_gpus: true + hosts: 1 + mesh_name: trainer + replay_buffer: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: replay_buffer + ref_model: + procs: ${ref_model.parallelism.tensor_parallel_degree} + num_replicas: 2 + with_gpus: true + hosts: 1 + mesh_name: ref_model + compute_advantages: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: compute_advantages + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor diff --git a/src/forge/actors/policy.py b/src/forge/actors/policy.py index 464674f2c..00d788bf4 100644 --- a/src/forge/actors/policy.py +++ b/src/forge/actors/policy.py @@ -19,6 +19,23 @@ import torch.distributed.checkpoint as dcp import torchstore as ts +from forge.actors._torchstore_utils import ( + DcpHandle, + extract_param_name, + get_dcp_whole_state_dict_key, + get_param_key, + get_param_prefix, + load_tensor_from_dcp, +) + +from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh +from forge.data.sharding import VLLMSharding +from forge.data_models.completion import Completion +from forge.data_models.prompt import to_prompt +from forge.interfaces import Policy as PolicyInterface +from forge.observability.metrics import record_metric, Reduce +from forge.observability.perf_tracker import Tracer +from forge.types import ProcessConfig from monarch.actor import current_rank, endpoint, ProcMesh from torchstore.state_dict_utils import DELIM from vllm.config import VllmConfig @@ -43,23 +60,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm.worker.worker_base import WorkerWrapperBase -from forge.actors._torchstore_utils import ( - extract_param_name, - get_dcp_whole_state_dict_key, - get_param_key, - get_param_prefix, - load_tensor_from_dcp, -) - -from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh -from forge.data.sharding import VLLMSharding -from forge.data_models.completion import Completion -from forge.data_models.prompt import to_prompt -from forge.interfaces import Policy as PolicyInterface -from forge.observability.metrics import record_metric, Reduce -from forge.observability.perf_tracker import Tracer -from forge.types import ProcessConfig - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -173,6 +173,7 @@ async def launch( # pyright: 
ignore[reportIncompatibleMethodOverride] procs=cls.procs, hosts=cls.hosts, with_gpus=cls.with_gpus, + mesh_name=cls.mesh_name, ) worker_procs = await get_proc_mesh(process_config=process_config) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index f9790ffd3..2c1cf3655 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -10,12 +10,12 @@ import sys from typing import Any, Type, TypeVar -from monarch.actor import Actor, current_rank, current_size, endpoint - from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh from forge.types import ProcessConfig, ServiceConfig +from monarch.actor import Actor, current_rank, current_size, endpoint + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) T = TypeVar("T", bound="ForgeActor") @@ -58,6 +58,7 @@ def options( hosts: int | None = None, with_gpus: bool = False, num_replicas: int = 1, + mesh_name: str | None = None, **kwargs, ) -> Type[T]: """ @@ -91,6 +92,7 @@ def options( "hosts": hosts, "with_gpus": with_gpus, "num_replicas": num_replicas, + "mesh_name": mesh_name, "_extra_config": kwargs, } @@ -116,6 +118,7 @@ async def as_service( "hosts": cls.hosts, "with_gpus": cls.with_gpus, "num_replicas": cls.num_replicas, + "mesh_name": cls.mesh_name, **cls._extra_config, # all extra fields } cfg = ServiceConfig(**cfg_kwargs) @@ -181,6 +184,7 @@ async def launch(cls, *args, **kwargs) -> "ForgeActor": procs=cls.procs, hosts=cls.hosts, with_gpus=cls.with_gpus, + mesh_name=cls.mesh_name, ) proc_mesh = await get_proc_mesh(process_config=cfg) diff --git a/src/forge/controller/launcher/__init__.py b/src/forge/controller/launcher/__init__.py new file mode 100644 index 000000000..2e41cd717 --- /dev/null +++ b/src/forge/controller/launcher/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py new file mode 100644 index 000000000..8cc3ac323 --- /dev/null +++ b/src/forge/controller/launcher/mast.py @@ -0,0 +1,350 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
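+#
+# MAST launcher for Forge. MastProvisioner launches (or reuses) a MAST job with
+# one task group per GPU mesh, allocates HostMeshes on those task groups via
+# MastAllocator, and mounts the shared oilfs workspace on each remote host
+# (via MastSetupActor) before spawning worker processes.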
+ +import asyncio +import functools +import getpass +import logging +import os +import socket +import subprocess +import uuid +from typing import Optional + +import torchx.specs as specs +from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints + +try: + from monarch._src.actor.actor_mesh import current_rank + from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig + from monarch._src.actor.shape import NDSlice, Shape + from monarch.tools.components.meta import hyperactor + from torchx.specs import AppState + from torchx.specs.fb.component_helpers import Packages +except ImportError as e: + print(f"Warning: Monarch imports failed: {e}") + print("Monarch functionality will be limited") +from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY +from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host +from monarch.tools import commands +from monarch.tools.commands import info +from monarch.tools.config import Config, Workspace +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +SCHEDULER_NAME = "mast_conda" +SKU = "gtt_any" +TIMEOUT_SEC = 1 * 60 * 60 # Kill the job if idle for 1 hour + +USER = getpass.getuser() +WORK_DIR = f"/data/users/{USER}" # on DEVGPU +EDITABLE_WORKSPACES = ["forge"] +REMOTE_WORK_DIR = "/packages/monarch_default_workspace/workspace/" + +EDITABLE_WORKSPACE_PATHS = [ + f"{WORK_DIR}/{workspace}" for workspace in EDITABLE_WORKSPACES +] + + +def _get_port() -> str: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + addr = s.getsockname() + port = addr[1] + return str(port) + + +class MastSetupActor(Actor): + @endpoint + def get_info(self) -> [str, str]: + return socket.gethostname(), _get_port() + + @endpoint + def mount(self, mount_dst: str): + point = current_rank() + # The last dimension is the local proc count. + last_label = point.extent.labels[-1] + proc_count = point.size(last_label) + if current_rank().rank % proc_count != 0: + # Only use one rank per host to mount the directory + return + self.mount_mnt_directory(mount_dst) + + def mount_mnt_directory(self, mount_dst: str) -> None: + # Sanity check of the mounted directory + sanity_path = os.path.join(mount_dst, "huggingface_models/") + if os.path.exists(sanity_path): + print(f"Found directory {sanity_path}; skip mounting.") + return + + # Otherwise, mount the directory + if not os.path.exists(mount_dst): + os.makedirs(mount_dst, exist_ok=True) + + # Store original LD_LIBRARY_PATH to restore after mounting + original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + + try: + clean_env = os.environ.copy() + if "LD_LIBRARY_PATH" in clean_env: + del clean_env["LD_LIBRARY_PATH"] + + subprocess.run( + [ + "/packages/oil.oilfs/oilfs-wrapper", + "ws://ws.ai.pci0ai/genai_fair_llm", + mount_dst, + ], + capture_output=True, + text=True, + check=True, + env=clean_env, + ) + print("Done mounting") + except subprocess.CalledProcessError as e: + print( + f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" + ) + finally: + # Restore original LD_LIBRARY_PATH + if original_ld_library_path: + os.environ["LD_LIBRARY_PATH"] = original_ld_library_path + elif "LD_LIBRARY_PATH" in os.environ: + del os.environ["LD_LIBRARY_PATH"] + + assert os.path.exists( + sanity_path + ), f"Did not find directory {sanity_path}; something wrong with mounting." 
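+
+    # NOTE: MastSetupActor is spawned once per proc mesh in
+    # MastProvisioner.get_proc_mesh below; mount() runs on a single rank per
+    # host, so the oilfs workspace is mounted exactly once per machine.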
+ + +class MastProvisioner(BaseProvisioner): + def __init__(self, cfg: DictConfig | None = None): + self._server_names = [] + self._proc_server_map = {} + self._lock = asyncio.Lock() + self._this_host_id = uuid.uuid1() + available_local_devices = None + cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if cuda_visible_devices is not None and cuda_visible_devices.strip(): + try: + available_local_devices = set( + int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip() + ) + except ValueError as e: + raise ValueError( + f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. " + f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}" + ) from e + self._host_gpu_map = { + self._this_host_id: GpuManager(available_local_devices), + } + assert cfg is not None + self.cfg = cfg + job_name = cfg.get(JOB_NAME_KEY, None) + self.job_name = job_name or self.create_job_name() + + async def initialize(self): + """Call this after creating the instance""" + await self.launch_mast_job() + + async def get_mast_allocator( + self, + job_name: str, + task_group: str, + ): + allocator = MastAllocator( + MastAllocatorConfig( + job_name=job_name, + remote_allocator_port=26600, # This is the default monarch port + ), + ) + alloc_constraints = AllocConstraints( + {MastAllocator.ALLOC_LABEL_TASK_GROUP: task_group} + ) + + return allocator, alloc_constraints + + async def create_host_mesh(self, name: str, num_hosts: int): + """Creates a remote server and a HostMesh on it.""" + logger.debug(f"Creating remote server for mesh: {name}") + server_name = f"{SCHEDULER_NAME}:///{self.job_name}" + alloc, alloc_constraints = await self.get_mast_allocator( + task_group=name, job_name=self.job_name + ) + return ( + HostMesh( + shape=Shape(["hosts"], NDSlice.new_row_major([num_hosts])), + allocator=alloc, + alloc_constraints=alloc_constraints, + ), + server_name, + ) + + async def get_proc_mesh( + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: int | None = None, + mesh_name: Optional[str] = None, + ): + """Gets a proc mesh. + + num_hosts = None implies that you want a local allocation, this may change. + + """ + async with self._lock: + server_name = None + if num_hosts is not None and num_hosts > 0: + assert mesh_name is not None + host_mesh, server_name = await self.create_host_mesh( + name=mesh_name, + num_hosts=num_hosts, + ) + host_id = uuid.uuid1() + gpu_manager = GpuManager() + self._host_gpu_map[host_id] = gpu_manager + else: + host_mesh = this_host() + gpu_manager = self._host_gpu_map[self._this_host_id] + host_mesh._host_id = self._this_host_id + + if with_gpus: + + def bootstrap(gpu_ids: list[str]): + # This works for single host, needed for vLLM currently. 
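+                # Pin the spawned procs to their allocated GPUs and set the
+                # torch.distributed rendezvous address/port (derived from the
+                # first GPU id) plus hyperactor timeout/frame-size limits.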
+ import os + + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids) + os.environ["MASTER_ADDR"] = socket.gethostname() + os.environ["MASTER_PORT"] = f"1234{gpu_ids[0]}" + os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" + os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" + + gpu_ids = gpu_manager.get_gpus(num_procs) + procs = host_mesh.spawn_procs( + per_host={"gpus": num_procs}, + bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), + ) + await procs.initialized + setup = await procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + hostname, port = await setup.get_info.choose() + await setup.mount.call(mount_dst="/mnt/wsfuse") + procs._hostname = hostname + procs._port = port + procs._gpu_ids = gpu_ids + else: + procs = host_mesh.spawn_procs(per_host={"gpus": num_procs}) + + procs._host = host_mesh + + # If we created a server, track so we can tear it down later. + if server_name: + self._server_names.append(server_name) + self._proc_server_map[procs] = server_name + + return procs + + async def stop_proc_mesh(self, proc_mesh: ProcMesh): + """Stops a proc mesh.""" + async with self._lock: + if hasattr(proc_mesh, "_gpu_ids"): + gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] + gpu_manager.release_gpus(proc_mesh._gpu_ids) + await proc_mesh.stop() + if proc_mesh in self._proc_server_map: + server_name = self._proc_server_map[proc_mesh] + commands.kill(server_name) + + async def shutdown(self): + """Tears down all remaining remote allocations.""" + async with self._lock: + for server_name in self._server_names: + commands.kill(server_name) + + async def launch_mast_job(self): + handle = self.create_server_handle() + server_spec = info(handle) + if server_spec and server_spec.state == AppState.RUNNING: + print(f"Job {self.job_name} is already running. 
Skipping launch.") + return server_spec + + config = Config( + scheduler="mast_conda", + scheduler_args={ + # NOTE: TODO: support passing these args from CLI + "hpcIdentity": "genai_llm_pretraining_data", + "hpcJobOncall": "monarch", + "hpcClusterUuid": "MastProdCluster", + "rmAttribution": "pytorch4all_clients_approved", + }, + appdef=self.build_appdef(), + workspace=Workspace( + dirs=[workspace_dir for workspace_dir in EDITABLE_WORKSPACE_PATHS], + ), + ) + + await commands.get_or_create(self.job_name, config) + return server_spec + + def add_additional_packages(self, packages: Packages) -> Packages: + packages.add_package("oil.oilfs:stable") + packages.add_package("manifold.manifoldfs") + return packages + + def build_appdef(self) -> specs.AppDef: + + # create the app definition for the worker + REMOTE_END_PYTHONPATH = ":".join( + [f"{REMOTE_WORK_DIR}{workspace}" for workspace in EDITABLE_WORKSPACE_PATHS] + ) + + default_envs = { + **hyperactor.DEFAULT_NVRT_ENVS, + **hyperactor.DEFAULT_NCCL_ENVS, + **hyperactor.DEFAULT_TORCH_ENVS, + **{"TORCHX_RUN_PYTHONPATH": f"{REMOTE_END_PYTHONPATH}:{REMOTE_WORK_DIR}"}, + **{ + "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", + "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + }, + } + + packages = Packages() + meshes = [] + for mesh_name, config in self.cfg["services"].items(): + num_replicas = config["num_replicas"] + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # Create list of mesh names with indices and num_hosts + if with_gpus and num_hosts > 0: + mesh_list = [ + f"{mesh_name}_{i}:{num_hosts}:{SKU}" for i in range(num_replicas) + ] + meshes.extend(mesh_list) + + appdef = hyperactor.host_mesh_conda( + meshes=meshes, + additional_packages=self.add_additional_packages(packages), + timeout_sec=TIMEOUT_SEC, + env=default_envs, + ) + + for role in appdef.roles: + role.resource.capabilities["server_sub_types"] = [ + # role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE + role.resource.capabilities["server_sub_types"][1] # GTT + ] + + return appdef + + def create_job_name(self): + return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + + def create_server_handle(self) -> str: + return f"{SCHEDULER_NAME}:///{self.job_name}" diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 1951eab76..d8b3b5300 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -12,8 +12,17 @@ import os import socket import uuid +from abc import ABC, abstractmethod +from typing import Optional import monarch + +from forge.observability.metric_actors import ( + get_or_create_metric_logger, + setup_metric_logger, +) + +from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -21,13 +30,14 @@ from monarch.tools.components import hyperactor from monarch.tools.config import Config -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig +from omegaconf import DictConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) +JOB_NAME_KEY = "job_name" +SCHEDULER_KEY = "scheduler" + def _get_port() -> str: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -76,7 +86,54 @@ def release_gpus(self, gpu_ids: list[str]) -> None: self.available_gpus.add(int(gpu_id)) -class 
Provisioner: +class BaseProvisioner(ABC): + """Abstract base class for resource provisioners.""" + + @abstractmethod + async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: + """Creates a remote server and a HostMesh on it. + Args: + name: Name identifier for the host mesh + num_hosts: Number of hosts to create + Returns: + HostMesh: The created host mesh + """ + pass + + @abstractmethod + async def get_proc_mesh( + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: Optional[int] = None, + mesh_name: Optional[str] = None, + ) -> ProcMesh: + """Gets a proc mesh. + Args: + num_procs: Number of processes needed + with_gpus: Whether GPU support is required + num_hosts: Number of hosts (None implies local allocation) + mesh_name: Name identifier for the proc mesh + Returns: + ProcMesh: The allocated process mesh + """ + pass + + @abstractmethod + async def stop_proc_mesh(self, proc_mesh: ProcMesh) -> None: + """Stops a proc mesh. + Args: + proc_mesh: The process mesh to stop + """ + pass + + @abstractmethod + async def shutdown(self) -> None: + """Tears down all remaining remote allocations.""" + pass + + +class Provisioner(BaseProvisioner): """A global resource provisioner.""" def __init__(self): @@ -145,7 +202,11 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: ) async def get_proc_mesh( - self, num_procs: int, with_gpus: bool = False, num_hosts: int | None = None + self, + num_procs: int, + with_gpus: bool = False, + num_hosts: int | None = None, + mesh_name: Optional[str] = None, ): """Gets a proc mesh. @@ -245,28 +306,47 @@ async def shutdown(self): commands.kill(server_name) -_provisioner: Provisioner | None = None +_provisioner: BaseProvisioner | None = None -def _get_provisioner(): +async def init_provisioner(cfg: DictConfig | None = None): global _provisioner if not _provisioner: - _provisioner = Provisioner() + scheduler = Scheduler.LOCAL + if cfg is not None: + scheduler = cfg.get(SCHEDULER_KEY, Scheduler.LOCAL.value) + if scheduler == Scheduler.MAST.value: + from forge.controller.launcher.mast import MastProvisioner + + _provisioner = MastProvisioner(cfg=cfg) + await _provisioner.initialize() + else: + _provisioner = Provisioner() + return _provisioner + + +async def _get_provisioner(): + if not _provisioner: + await init_provisioner() return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - return await _get_provisioner().get_proc_mesh( + provisioner = await _get_provisioner() + return await provisioner.get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, + mesh_name=config.mesh_name, ) async def stop_proc_mesh(proc_mesh: ProcMesh): - return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) + provisioner = await _get_provisioner() + return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - await _get_provisioner().shutdown() + provisioner = await _get_provisioner() + return await provisioner.shutdown() diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index 09b0a2ce6..7331fa401 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -13,11 +13,11 @@ from enum import Enum from typing import Optional -from monarch.actor import ActorError - from forge.controller import ForgeActor from forge.types import ProcessConfig +from monarch.actor import ActorError + logger = logging.getLogger(__name__) 
logger.setLevel(logging.DEBUG) @@ -159,6 +159,7 @@ async def initialize(self): # Deploy the actor and its underlying resources logger.debug(f"Launching actor for replica {self.idx}") + self.proc_config.mesh_name = f"{self.proc_config.mesh_name}_{self.idx}" self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, diff --git a/src/forge/types.py b/src/forge/types.py index cc41d2185..271797d95 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from dataclasses import dataclass, field +from enum import Enum from typing import Any, TypedDict, Union @@ -87,6 +88,12 @@ class State: metadata: dict[str, Any] = field(default_factory=dict) +class Scheduler(Enum): + MAST = "mast" + SLURM = "slurm" + LOCAL = "local" + + @dataclass class ProcessConfig: """A proc_mesh config for the torchx scheduler.""" @@ -94,6 +101,7 @@ class ProcessConfig: procs: int = 1 with_gpus: bool = False hosts: int | None = None + mesh_name: str | None = None @dataclass @@ -118,6 +126,7 @@ class ServiceConfig: health_poll_rate: float = 0.2 replica_max_concurrent_requests: int = 10 return_first_rank_result: bool = True + mesh_name: str | None = None def to_process_config(self) -> ProcessConfig: """Extract ProcessConfig from this ServiceConfig. @@ -127,6 +136,7 @@ def to_process_config(self) -> ProcessConfig: procs=self.procs, with_gpus=self.with_gpus, hosts=self.hosts, + mesh_name=self.mesh_name, ) From 721c7055da75e9b60a9c9d0a5dba296a50710e54 Mon Sep 17 00:00:00 2001 From: rithesh Date: Mon, 29 Sep 2025 15:52:28 -0700 Subject: [PATCH 02/17] minor change --- apps/mast/qwen3_1_7b_mast.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 21a58df0b..597be0ff2 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -106,8 +106,8 @@ services: num_replicas: 2 with_gpus: true ref_model: - procs: 2 - num_replicas: 1 + procs: 1 + num_replicas: 2 with_gpus: true reward_actor: procs: 1 From db5db98ac958b90a1ec23ed021fb76eac961c4a9 Mon Sep 17 00:00:00 2001 From: rithesh Date: Tue, 30 Sep 2025 13:42:12 -0700 Subject: [PATCH 03/17] interim changes --- apps/mast/qwen3_1_7b_mast.yaml | 34 +++++++++++++++++++++++---- src/forge/controller/launcher/mast.py | 9 +++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 597be0ff2..2c266c2bc 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -8,6 +8,20 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default +scheduler: mast +job_name: forge-qwen3-1_7b-2a48e + +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -20,7 +34,7 @@ dataset: # Policy configuration policy: engine_config: - model: ${model} + model: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 tensor_parallel_size: 1 pipeline_parallel_size: 1 enforce_eager: false @@ -35,7 +49,7 @@ trainer: model: name: qwen3 flavor: 1.7B - hf_assets_path: hf://${model} + hf_assets_path: 
/mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 optimizer: name: AdamW lr: 1e-5 @@ -61,7 +75,7 @@ trainer: disable_loss_parallel: true checkpoint: enable: false - initial_load_path: hf://${model} + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true last_save_in_hf: true interval: 500 @@ -81,7 +95,7 @@ ref_model: model: name: qwen3 flavor: 1.7B - hf_assets_path: hf://${model} + hf_assets_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 training: dtype: bfloat16 gc_freq: 1 @@ -96,7 +110,7 @@ ref_model: expert_parallel_degree: 1 checkpoint: enable: true - initial_load_path: hf://${model} + initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true # All resource allocations @@ -105,25 +119,35 @@ services: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true + mesh_name: policy + hosts: 1 ref_model: procs: 1 num_replicas: 2 with_gpus: true + mesh_name: ref_model + hosts: 1 reward_actor: procs: 1 num_replicas: 1 with_gpus: false + mesh_name: reward_actor actors: dataset: procs: 1 with_gpus: false + mesh_name: dataset trainer: procs: 1 with_gpus: true + mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 with_gpus: false + mesh_name: replay_buffer compute_advantages: procs: 1 with_gpus: false + mesh_name: compute_advantages diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 8cc3ac323..7bda56c6b 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -317,6 +317,7 @@ def build_appdef(self) -> specs.AppDef: packages = Packages() meshes = [] + # Process both services and actors configurations for mesh_name, config in self.cfg["services"].items(): num_replicas = config["num_replicas"] with_gpus = bool(config["with_gpus"]) @@ -328,6 +329,14 @@ def build_appdef(self) -> specs.AppDef: ] meshes.extend(mesh_list) + for mesh_name, config in self.cfg["actors"].items(): + num_replicas = 1 + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # single actors with GPUs + if with_gpus: + meshes.append(f"{mesh_name}:{num_replicas}:{SKU}") + appdef = hyperactor.host_mesh_conda( meshes=meshes, additional_packages=self.add_additional_packages(packages), From 75aa422a727deff4487bbc24bc254ed6ff677bed Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 08:24:21 -0700 Subject: [PATCH 04/17] fix the bug --- src/forge/controller/provisioner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index d8b3b5300..05b52e8af 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -17,10 +17,7 @@ import monarch -from forge.observability.metric_actors import ( - get_or_create_metric_logger, - setup_metric_logger, -) +from forge.observability.metric_actors import get_or_create_metric_logger from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer From 5be96991996c22f38417c21d64a8dd8449237c35 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 08:44:15 -0700 Subject: [PATCH 05/17] some more fixes --- apps/mast/README.md | 31 ++++++++++++++++ 
apps/mast/env_setup.sh | 47 +++---------------------- apps/mast/qwen3_1_7b_mast.yaml | 2 +- src/forge/controller/launcher/mast.py | 2 +- src/forge/controller/service/replica.py | 5 ++- 5 files changed, 41 insertions(+), 46 deletions(-) create mode 100644 apps/mast/README.md diff --git a/apps/mast/README.md b/apps/mast/README.md new file mode 100644 index 000000000..6cd48d32d --- /dev/null +++ b/apps/mast/README.md @@ -0,0 +1,31 @@ +# Forge MAST Environment Setup + +A simple setup script to automatically configure your environment for running Forge with MAST jobs. + +## Quick Start + +### 1. Run the Setup Script + +The `env_setup.sh` script will automatically: +- ✅ Activate the required conda environment (`forge-8448524`) +- ✅ Clone/update the Forge repository +- ✅ Install Forge package dependencies +- ✅ Mount the required oilfs workspace to `/mnt/wsfuse` +- ✅ Configure your environment for MAST job submission + +```bash +# Make the script executable +chmod +x env_setup.sh + +# Run the setup +./apps/mast/env_setup.sh + +``` + +### 2. Submit MAST job + +``` +pip install --force-reinstall --no-deps . && python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml +``` + +⚠️ Important Note: `pip install --force-reinstall --no-deps .` is required every time you make a change to the local codebase. This ensures your latest changes are installed before job submission. diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh index 6728c87db..88d2edf6f 100755 --- a/apps/mast/env_setup.sh +++ b/apps/mast/env_setup.sh @@ -139,7 +139,6 @@ FBSOURCE_PATH="/data/users/$USER/fbsource" CONDA_SCRIPT_PATH="$FBSOURCE_PATH/genai/xlformers/dev/xl_conda.sh" FORGE_BASE_DIR="/data/users/$USER" FORGE_REPO_DIR="$FORGE_BASE_DIR/forge" -MONARCH_DIR="$HOME/monarch_no_torch_latest" # Workspace URL for mounting WORKSPACE_URL="ws://ws.ai.pci0ai/genai_fair_llm" @@ -163,10 +162,10 @@ if [ ! -f "$CONDA_SCRIPT_PATH" ]; then fi log_info "Sourcing conda script: $CONDA_SCRIPT_PATH" -source "$CONDA_SCRIPT_PATH" activate forge:8448524 +source "$CONDA_SCRIPT_PATH" activate forge:e146614 if [ $? -ne 0 ]; then - log_error "Failed to activate conda environment forge-8448524" + log_error "Failed to activate conda environment forge-e146614" exit 1 fi @@ -233,44 +232,6 @@ if [ $? -ne 0 ]; then fi log_info "Forge package installed successfully" -# Step 6: Navigate to monarch directory -log_info "Step 6: Setting up monarch directory..." -if [ ! -d "$MONARCH_DIR" ]; then - log_info "Creating monarch directory: $MONARCH_DIR" - mkdir -p "$MONARCH_DIR" -fi - -cd "$MONARCH_DIR" -log_info "Changed to directory: $(pwd)" - -# Step 7: Fetch monarch package -log_info "Step 7: Fetching monarch package..." -# TODO: Remove hardcodedm version -fbpkg fetch monarch_no_torch:23 -if [ $? -ne 0 ]; then - log_error "Failed to fetch monarch_no_torch:23" - log_error "Please ensure fbpkg is properly configured" - exit 1 -fi -log_info "Monarch package fetched successfully" - -# Step 8: Install monarch wheel -log_info "Step 8: Installing monarch wheel..." -WHEEL_FILE="monarch-0.0.0-py3.10-none-any.whl" -if [ ! -f "$WHEEL_FILE" ]; then - log_error "Wheel file not found: $WHEEL_FILE" - log_error "Available files in directory:" - ls -la *.whl 2>/dev/null || log_error "No wheel files found" - exit 1 -fi - -pip install --force-reinstall "$WHEEL_FILE" -if [ $? 
-ne 0 ]; then - log_error "Failed to install monarch wheel" - exit 1 -fi -log_info "Monarch wheel installed successfully" - log_info "Environment activation completed" # Final verification @@ -301,9 +262,9 @@ pip list | grep -E "(forge|monarch)" || log_warn "No forge/monarch packages foun log_info "Environment setup complete! You can now run your scripts." log_info "Mounted workspace available at: /mnt/wsfuse" -# Step 9: Ask user to deactivate and activate conda env conda environment +# Step 6: Ask user to deactivate and activate conda env conda environment echo "" log_info "Installation completed successfully!" echo "" log_info "Re-activate the conda environment to make the changes take effect:" -log_info "conda deactivate && conda activate forge-8448524" +log_info "conda deactivate && conda activate forge-e146614" diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 2c266c2bc..a1804e68f 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,7 +9,7 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b-2a48e +job_name: forge-qwen3-1_7b # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 7bda56c6b..856ca622e 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -232,7 +232,7 @@ def bootstrap(gpu_ids: list[str]): bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), ) await procs.initialized - setup = await procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) hostname, port = await setup.get_info.choose() await setup.mount.call(mount_dst="/mnt/wsfuse") procs._hostname = hostname diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index 7331fa401..b0804cba6 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -159,7 +159,10 @@ async def initialize(self): # Deploy the actor and its underlying resources logger.debug(f"Launching actor for replica {self.idx}") - self.proc_config.mesh_name = f"{self.proc_config.mesh_name}_{self.idx}" + mesh_name_with_replica = f"{self.proc_config.mesh_name}_{self.idx}" + self.proc_config.mesh_name = mesh_name_with_replica + if hasattr(self.actor_def, "mesh_name"): + setattr(self.actor_def, "mesh_name", mesh_name_with_replica) self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, From 7e69a6b6d85ee2b558e8c7efb6202651ab3f91d6 Mon Sep 17 00:00:00 2001 From: Allen Wang <9057208+allenwang28@users.noreply.github.com> Date: Wed, 1 Oct 2025 09:57:19 -0700 Subject: [PATCH 06/17] park --- src/forge/controller/launcher/mast.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index 856ca622e..cec5e87a6 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -27,13 +27,14 @@ except ImportError as e: print(f"Warning: Monarch imports failed: {e}") print("Monarch functionality will be limited") -from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host from monarch.tools import commands from monarch.tools.commands 
import info from monarch.tools.config import Config, Workspace from omegaconf import DictConfig +from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -278,7 +279,9 @@ async def launch_mast_job(self): scheduler="mast_conda", scheduler_args={ # NOTE: TODO: support passing these args from CLI - "hpcIdentity": "genai_llm_pretraining_data", + "hpcIdentity": "hyper_monarch", + # "hpcIdentity": "genai_llm_pretraining_data", + # "hpcIdentity": "pytorch_distributed", "hpcJobOncall": "monarch", "hpcClusterUuid": "MastProdCluster", "rmAttribution": "pytorch4all_clients_approved", From 649fe83d71fae85a9937496fa46dbb07b086bd36 Mon Sep 17 00:00:00 2001 From: Allen Wang <9057208+allenwang28@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:07:36 -0700 Subject: [PATCH 07/17] parking changes --- apps/grpo/main.py | 1 + src/forge/controller/launcher/mast.py | 16 ++++++++++++++ src/forge/controller/provisioner.py | 31 +++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 7545aa561..42865334e 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -316,6 +316,7 @@ async def main(cfg: DictConfig): metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() await mlogger.init_backends.call_one(metric_logging_cfg) + print("SUCCESSFULLY CREATED AND INITIALIZED MLOGGER") # ---- Setup services ---- # await ts.initialize(strategy=ts.ControllerStorageVolumes()) diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py index cec5e87a6..1aacd87dd 100644 --- a/src/forge/controller/launcher/mast.py +++ b/src/forge/controller/launcher/mast.py @@ -17,6 +17,8 @@ import torchx.specs as specs from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints +from forge.observability.metric_actors import get_or_create_metric_logger + try: from monarch._src.actor.actor_mesh import current_rank from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig @@ -249,11 +251,18 @@ def bootstrap(gpu_ids: list[str]): self._server_names.append(server_name) self._proc_server_map[procs] = server_name + _ = await get_or_create_metric_logger(procs) + return procs async def stop_proc_mesh(self, proc_mesh: ProcMesh): """Stops a proc mesh.""" async with self._lock: + # Deregister local logger from global logger + if hasattr(proc_mesh, "_local_fetcher"): + global_logger = await get_or_create_metric_logger(proc_mesh) + await global_logger.deregister_fetcher.call_one(proc_mesh) + if hasattr(proc_mesh, "_gpu_ids"): gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] gpu_manager.release_gpus(proc_mesh._gpu_ids) @@ -315,9 +324,16 @@ def build_appdef(self) -> specs.AppDef: **{ "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + "TORCHINDUCTOR_COMPILE_THREADS": "1", + "TORCH_COMPILE_DISABLE": "1", + "TORCHDYNAMO_VERBOSE": "1", + "VLLM_TORCH_COMPILE_LEVEL": "0", + "VLLM_USE_TRITON_FLASH_ATTN": "0", }, } + print("DEFAULT ENVS: ", default_envs) + packages = Packages() meshes = [] # Process both services and actors configurations diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 05b52e8af..5b0d23783 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,10 +16,6 @@ from typing import Optional 
import monarch - -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -29,6 +25,10 @@ from omegaconf import DictConfig +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -252,6 +252,29 @@ def bootstrap(gpu_ids: list[str]): os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "0" + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + os.environ["NVTE_TORCH_COMPILE"] = "0" + os.environ["NVTE_BIAS_GELU_NVFUSION"] = "0" + os.environ["NVTE_CUDA_INCLUDE_DIR"] = "/usr/local/cuda/include" + os.environ["NVTE_DISABLE_NVRTC"] = "1" + os.environ["NVTE_FUSED_ATTN"] = "1" + os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" + os.environ["NCCL_SET_THREAD_NAME"] = "1'" + os.environ[ + "NCCL_DEBUG_SUBSYS" + ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" + os.environ["NCCL_NET_OVERHEAD"] = "2750" + os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" + os.environ["NCCL_IB_QPS_PER_CONNECTION"] = "16" + os.environ["NCCL_CTRAN_ENABLE"] = "0" + os.environ["TORCH_SHOW_CPP_STACKTRACES"] = "1" + os.environ["PYTORCH_JIT"] = "0" + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + os.environ["GLOG_minloglevel"] = "1" + gpu_ids = gpu_manager.get_gpus(num_procs) procs = host_mesh.spawn_procs( per_host={"gpus": num_procs}, From 0d683a454f09259382f51d916dd05dc390abe010 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 14:21:45 -0700 Subject: [PATCH 08/17] config changes --- apps/mast/qwen3_1_7b_mast.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index a1804e68f..993bf0570 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,7 +9,8 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b +job_name: forge-qwen3-1_7b-1190 +checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas @@ -74,7 +75,7 @@ trainer: expert_parallel_degree: 1 disable_loss_parallel: true checkpoint: - enable: false + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-1.7B/snapshots/0060bc56d46589041c1048efd1a397421b1142b5 initial_load_in_hf: true last_save_in_hf: true @@ -83,6 +84,7 @@ trainer: activation_checkpoint: mode: selective selective_ac_option: op + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: From 097642d951155850821ca61a1e5a89d5054ad1a6 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:30:40 -0700 Subject: [PATCH 09/17] working changes --- apps/mast/env_setup.sh | 2 +- apps/mast/main.py | 19 +++++++- apps/mast/qwen3_14b_mast.yaml | 71 ++++++++++++++++------------- apps/mast/qwen3_1_7b_mast.yaml | 11 ++++- apps/mast/qwen3_32b_mast.yaml | 69 ++++++++++++++++------------ 
apps/mast/qwen3_4b_mast.yaml | 70 ++++++++++++++++------------ apps/mast/qwen3_8b_mast.yaml | 70 ++++++++++++++++------------ src/forge/controller/provisioner.py | 14 +++--- 8 files changed, 193 insertions(+), 133 deletions(-) diff --git a/apps/mast/env_setup.sh b/apps/mast/env_setup.sh index 88d2edf6f..4318e05f0 100755 --- a/apps/mast/env_setup.sh +++ b/apps/mast/env_setup.sh @@ -70,7 +70,7 @@ mount_workspace() { unset LD_LIBRARY_PATH # Mount the workspace - if oilfs "$workspace_url" "$mount_dir"; then + if sudo oilfs "$workspace_url" "$mount_dir"; then log_info "Successfully mounted $workspace_url to $mount_dir" else log_error "Failed to mount $workspace_url to $mount_dir" diff --git a/apps/mast/main.py b/apps/mast/main.py index fd1819ed2..8029f35e1 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -6,6 +6,7 @@ import asyncio import getpass +import uuid from apps.grpo.main import main as grpo_main from forge.cli.config import parse @@ -14,6 +15,9 @@ from forge.types import Scheduler from omegaconf import DictConfig +DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" +DEFAULT_CHECKPOINT_FOLDER = "/mnt/wsfuse/teamforge/forge_runs/" + async def main(cfg: DictConfig): """Main module for launching mast jobs for GRPO training.""" @@ -21,8 +25,19 @@ async def main(cfg: DictConfig): raise ValueError("Schuduler must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: - # prepend user name to the job to avoid name collision - cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" + # prepend user name and append guid to the job to avoid name collision + cfg[JOB_NAME_KEY] = ( + f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" + ) + print(f"Overriding mast job name to {cfg[JOB_NAME_KEY]}") + + if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: + # append job_name to CP folder path to avoid path collision + if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: + cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( + f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}" + ) + print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner await init_provisioner(cfg) diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml index 2429077fc..198ecf8f2 100644 --- a/apps/mast/qwen3_14b_mast.yaml +++ b/apps/mast/qwen3_14b_mast.yaml @@ -1,5 +1,5 @@ - # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -9,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-14B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-14B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-14b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ + +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -29,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. 
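The apps/mast/main.py hunk above derives the MAST job name and the default checkpoint folder before submission. Below is a minimal sketch of that naming scheme, assuming a plain dict in place of the OmegaConf DictConfig and a hypothetical `apply_mast_defaults` helper; a later patch in this series moves the guid from the job name onto the checkpoint path.

```python
# Sketch only, not part of the patch: mirrors the naming applied in apps/mast/main.py.
import getpass
import uuid

JOB_NAME_KEY = "job_name"
CHECKPOINT_FOLDER_KEY = "checkpoint_folder"
DEFAULT_CHECKPOINT_FOLDER = "/mnt/wsfuse/teamforge/forge_runs/"


def apply_mast_defaults(cfg: dict) -> dict:
    # Prepend the user and append a short guid to avoid job-name collisions,
    # then append the job name to the default checkpoint folder.
    if cfg.get(JOB_NAME_KEY) is not None:
        cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}"
    if cfg.get(CHECKPOINT_FOLDER_KEY) == DEFAULT_CHECKPOINT_FOLDER:
        cfg[CHECKPOINT_FOLDER_KEY] = f"{cfg[CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}"
    return cfg


print(apply_mast_defaults({JOB_NAME_KEY: "forge-qwen3-14b",
                           CHECKPOINT_FOLDER_KEY: DEFAULT_CHECKPOINT_FOLDER}))
# e.g. {'job_name': 'alice-forge-qwen3-14b-1a2b3c',
#       'checkpoint_folder': '/mnt/wsfuse/teamforge/forge_runs/alice-forge-qwen3-14b-1a2b3c'}
```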
disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -74,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -108,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-14B/snapshots/8268fe3026cb304910457689366670e803a6fd56 initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} - num_replicas: 14 + num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 14 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 993bf0570..44a3fa906 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -9,8 +9,8 @@ max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen3-1_7b-1190 -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-1_7b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ # Main loop configuration rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas @@ -39,6 +39,9 @@ policy: tensor_parallel_size: 1 pipeline_parallel_size: 1 enforce_eager: false + # TODO: Had to disable this becasue vLLm wouldn't like + # needs to revisited. + disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} @@ -84,6 +87,10 @@ trainer: activation_checkpoint: mode: selective selective_ac_option: op + comm: + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 dcp_path: ${checkpoint_folder} # Replay buffer configuration diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml index 3c77adfa3..a6818b41c 100644 --- a/apps/mast/qwen3_32b_mast.yaml +++ b/apps/mast/qwen3_32b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-32B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-32B -checkpoint_folder: /mnt/wsfuse/$user$/forge_runs/${job_name}/20 +job_name: forge-qwen3-32b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,47 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-32B/snapshots/d47b0d4ae4b48fde975756bf360a63a9cca8d470 initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: - # procs: ${trainer.parallelism.data_parallel_shard_degree} procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml index 1690494e8..a2962122b 100644 --- a/apps/mast/qwen3_4b_mast.yaml +++ b/apps/mast/qwen3_4b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-4B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-4B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-4b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 top_p: 1.0 - checkpoint_path: ${checkpoint_folder} # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-4B-Base/snapshots/a81b894c2624d21c88a3ad737ce4f837424b7eed initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy - trainer: - procs: 8 - num_replicas: 1 + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 with_gpus: true + mesh_name: ref_model hosts: 1 - mesh_name: trainer - replay_buffer: + reward_actor: procs: 1 num_replicas: 1 with_gpus: false - mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 + mesh_name: reward_actor + +actors: + dataset: + procs: 8 + with_gpus: false + mesh_name: dataset + trainer: + procs: 1 with_gpus: true + mesh_name: trainer hosts: 1 - mesh_name: ref_model - compute_advantages: + replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false - mesh_name: compute_advantages - reward_actor: + mesh_name: replay_buffer + compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false - mesh_name: reward_actor + mesh_name: compute_advantages diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml index d9ed947ff..e711adbdb 100644 --- a/apps/mast/qwen3_8b_mast.yaml +++ b/apps/mast/qwen3_8b_mast.yaml @@ -1,4 +1,5 @@ # Grouped Relative Policy Optimization (GRPO) +# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml # Global configuration group_size: 8 @@ -8,9 +9,20 @@ max_res_tokens: 512 model: "Qwen/Qwen3-8B" off_by_n: 1 # Off by one by default scheduler: mast -job_name: forge-qwen-8B -checkpoint_folder: /mnt/wsfuse/rithesh/forge_runs/${job_name}/20 +job_name: forge-qwen3-8b +checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ +# Main loop configuration +rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas + +# Observability configuration +metric_logging: + wandb: + project: "grpo-training" + group: "grpo_exp_${oc.env:USER}" + reduce_across_ranks: True + console: + reduce_across_ranks: True # Dataset configuration dataset: @@ -28,14 +40,13 @@ policy: pipeline_parallel_size: 1 enforce_eager: false # TODO: Had to disable this becasue vLLm wouldn't like - # need to revisit. + # needs to revisited. disable_custom_all_reduce: true sampling_config: n: ${group_size} max_tokens: ${max_res_tokens} temperature: 1.0 - top_p: 1.0= - checkpoint_path: ${checkpoint_folder} + top_p: 1.0 # Trainer configuration trainer: @@ -73,14 +84,14 @@ trainer: last_save_in_hf: true interval: 500 async_mode: "disabled" - folder: ${checkpoint_folder} activation_checkpoint: mode: selective selective_ac_option: op comm: - # TODO: revisit this. causing NCCL timeouts on inits when loading CP - # from oilfs if the traienr is not in the same region as in PCI - init_timeout_seconds: 3600 + # TODO: needs to be revisited. 
causing NCCL timeouts on inits when loading CP + # from oilfs if the traienr is not in the same region as in oilfs + init_timeout_seconds: 1200 + dcp_path: ${checkpoint_folder} # Replay buffer configuration replay_buffer: @@ -107,46 +118,45 @@ ref_model: context_parallel_degree: 1 expert_parallel_degree: 1 checkpoint: + enable: true initial_load_path: /mnt/wsfuse/huggingface_models/models--Qwen--Qwen3-8B/snapshots/model initial_load_in_hf: true # All resource allocations services: - dataset: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: dataset policy: procs: ${policy.engine_config.tensor_parallel_size} num_replicas: 2 with_gpus: true - hosts: 1 mesh_name: policy + hosts: 1 + ref_model: + procs: 1 + num_replicas: 2 + with_gpus: true + mesh_name: ref_model + hosts: 1 + reward_actor: + procs: 1 + num_replicas: 1 + with_gpus: false + mesh_name: reward_actor + +actors: + dataset: + procs: 1 + with_gpus: false + mesh_name: dataset trainer: procs: 8 - num_replicas: 1 with_gpus: true - hosts: 1 mesh_name: trainer + hosts: 1 replay_buffer: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: replay_buffer - ref_model: - procs: ${ref_model.parallelism.tensor_parallel_degree} - num_replicas: 2 - with_gpus: true - hosts: 1 - mesh_name: ref_model compute_advantages: procs: 1 - num_replicas: 1 with_gpus: false mesh_name: compute_advantages - reward_actor: - procs: 1 - num_replicas: 1 - with_gpus: false - mesh_name: reward_actor diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 5b0d23783..6a6f7508b 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,6 +16,10 @@ from typing import Optional import monarch + +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -25,10 +29,6 @@ from omegaconf import DictConfig -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler - logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -261,9 +261,9 @@ def bootstrap(gpu_ids: list[str]): os.environ["NVTE_FUSED_ATTN"] = "1" os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ[ - "NCCL_DEBUG_SUBSYS" - ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + os.environ["NCCL_DEBUG_SUBSYS"] = ( + "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" + ) os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" os.environ["NCCL_NET_OVERHEAD"] = "2750" os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" From f2416346dee00428622d7c20899e851a9d8ebce6 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:37:21 -0700 Subject: [PATCH 10/17] minor changes --- apps/grpo/main.py | 1 - apps/mast/__init__.py | 5 +++++ apps/mast/main.py | 10 ++++------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 42865334e..7545aa561 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -316,7 +316,6 @@ async def main(cfg: DictConfig): metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() await mlogger.init_backends.call_one(metric_logging_cfg) - print("SUCCESSFULLY CREATED AND INITIALIZED MLOGGER") # ---- Setup 
services ---- # await ts.initialize(strategy=ts.ControllerStorageVolumes()) diff --git a/apps/mast/__init__.py b/apps/mast/__init__.py index e69de29bb..2e41cd717 100644 --- a/apps/mast/__init__.py +++ b/apps/mast/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/apps/mast/main.py b/apps/mast/main.py index 8029f35e1..382b793b2 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -25,17 +25,15 @@ async def main(cfg: DictConfig): raise ValueError("Schuduler must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: - # prepend user name and append guid to the job to avoid name collision - cfg[JOB_NAME_KEY] = ( - f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" - ) + # prepend user name to the job to avoid name collision + cfg[JOB_NAME_KEY] = f"{getpass.getuser()}-{cfg[JOB_NAME_KEY]}" print(f"Overriding mast job name to {cfg[JOB_NAME_KEY]}") if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: - # append job_name to CP folder path to avoid path collision + # append job_name and guid to CP folder path to avoid path collision if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( - f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}" + f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" ) print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") From 1c794232b98cb6c32ce82b83660fb14d59105c19 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:46:25 -0700 Subject: [PATCH 11/17] lints --- apps/mast/main.py | 6 ++--- src/forge/actors/policy.py | 35 ++++++++++++------------- src/forge/controller/actor.py | 4 +-- src/forge/controller/provisioner.py | 14 +++++----- src/forge/controller/service/replica.py | 6 ++--- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/apps/mast/main.py b/apps/mast/main.py index 382b793b2..92d81082c 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -32,9 +32,9 @@ async def main(cfg: DictConfig): if cfg.get(DEFAULT_CHECKPOINT_FOLDER_KEY, DEFAULT_CHECKPOINT_FOLDER) is not None: # append job_name and guid to CP folder path to avoid path collision if cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] == DEFAULT_CHECKPOINT_FOLDER: - cfg[DEFAULT_CHECKPOINT_FOLDER_KEY] = ( - f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" - ) + cfg[ + DEFAULT_CHECKPOINT_FOLDER_KEY + ] = f"{cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}{cfg[JOB_NAME_KEY]}-{uuid.uuid4().hex[:6]}" print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner diff --git a/src/forge/actors/policy.py b/src/forge/actors/policy.py index 00d788bf4..4b61f096c 100644 --- a/src/forge/actors/policy.py +++ b/src/forge/actors/policy.py @@ -18,24 +18,6 @@ import torch import torch.distributed.checkpoint as dcp import torchstore as ts - -from forge.actors._torchstore_utils import ( - DcpHandle, - extract_param_name, - get_dcp_whole_state_dict_key, - get_param_key, - get_param_prefix, - load_tensor_from_dcp, -) - -from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh -from forge.data.sharding import VLLMSharding -from forge.data_models.completion import Completion -from forge.data_models.prompt import to_prompt -from forge.interfaces import Policy as PolicyInterface -from 
forge.observability.metrics import record_metric, Reduce -from forge.observability.perf_tracker import Tracer -from forge.types import ProcessConfig from monarch.actor import current_rank, endpoint, ProcMesh from torchstore.state_dict_utils import DELIM from vllm.config import VllmConfig @@ -60,6 +42,23 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm.worker.worker_base import WorkerWrapperBase +from forge.actors._torchstore_utils import ( + extract_param_name, + get_dcp_whole_state_dict_key, + get_param_key, + get_param_prefix, + load_tensor_from_dcp, +) + +from forge.controller import ForgeActor, get_proc_mesh, stop_proc_mesh +from forge.data.sharding import VLLMSharding +from forge.data_models.completion import Completion +from forge.data_models.prompt import to_prompt +from forge.interfaces import Policy as PolicyInterface +from forge.observability.metrics import record_metric, Reduce +from forge.observability.perf_tracker import Tracer +from forge.types import ProcessConfig + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index 2c1cf3655..8bf1d4765 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -10,12 +10,12 @@ import sys from typing import Any, Type, TypeVar +from monarch.actor import Actor, current_rank, current_size, endpoint + from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh from forge.types import ProcessConfig, ServiceConfig -from monarch.actor import Actor, current_rank, current_size, endpoint - logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) T = TypeVar("T", bound="ForgeActor") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 6a6f7508b..5b0d23783 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -16,10 +16,6 @@ from typing import Optional import monarch - -from forge.observability.metric_actors import get_or_create_metric_logger - -from forge.types import ProcessConfig, Scheduler from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host @@ -29,6 +25,10 @@ from omegaconf import DictConfig +from forge.observability.metric_actors import get_or_create_metric_logger + +from forge.types import ProcessConfig, Scheduler + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -261,9 +261,9 @@ def bootstrap(gpu_ids: list[str]): os.environ["NVTE_FUSED_ATTN"] = "1" os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ["NCCL_DEBUG_SUBSYS"] = ( - "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" - ) + os.environ[ + "NCCL_DEBUG_SUBSYS" + ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" os.environ["NCCL_NET_OVERHEAD"] = "2750" os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py index b0804cba6..dfdb10169 100644 --- a/src/forge/controller/service/replica.py +++ b/src/forge/controller/service/replica.py @@ -13,11 +13,11 @@ from enum import Enum from typing import Optional +from monarch.actor import ActorError + from forge.controller import ForgeActor from forge.types import ProcessConfig -from monarch.actor import ActorError - logger = logging.getLogger(__name__) 
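The replica.py hunk that follows finalizes the per-replica mesh naming introduced earlier in the series: each replica suffixes its configured mesh_name with its index, which lines up with the per-replica task groups emitted by build_appdef. A self-contained sketch of that convention, with a stand-in dataclass for forge.types.ProcessConfig:

```python
# Sketch only, not part of the patch: per-replica mesh names vs. MAST task groups.
from dataclasses import dataclass


@dataclass
class ProcConfigStub:  # stand-in for forge.types.ProcessConfig
    procs: int = 1
    with_gpus: bool = False
    hosts: int | None = None
    mesh_name: str | None = None


def replica_mesh_name(cfg: ProcConfigStub, replica_idx: int) -> str:
    # Mirrors Replica.initialize(): "<mesh_name>_<idx>"
    return f"{cfg.mesh_name}_{replica_idx}"


def service_task_groups(mesh_name: str, num_replicas: int, num_hosts: int, sku: str) -> list[str]:
    # Mirrors the loop over cfg["services"] in build_appdef()
    return [f"{mesh_name}_{i}:{num_hosts}:{sku}" for i in range(num_replicas)]


cfg = ProcConfigStub(with_gpus=True, hosts=1, mesh_name="policy")
assert replica_mesh_name(cfg, 0) == "policy_0"
assert service_task_groups("policy", 2, 1, "gtt_any") == ["policy_0:1:gtt_any", "policy_1:1:gtt_any"]
```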
logger.setLevel(logging.DEBUG) @@ -162,7 +162,7 @@ async def initialize(self): mesh_name_with_replica = f"{self.proc_config.mesh_name}_{self.idx}" self.proc_config.mesh_name = mesh_name_with_replica if hasattr(self.actor_def, "mesh_name"): - setattr(self.actor_def, "mesh_name", mesh_name_with_replica) + self.actor_def.mesh_name = mesh_name_with_replica self.actor = await self.actor_def.launch( *self.actor_args, **self.actor_kwargs, From cccaf5094625dbec0b8bd465b511b03f3b4370fd Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 15:55:38 -0700 Subject: [PATCH 12/17] clean up some changes --- src/forge/controller/provisioner.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 5b0d23783..f4344909d 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -252,29 +252,6 @@ def bootstrap(gpu_ids: list[str]): os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "0" - os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - os.environ["NVTE_TORCH_COMPILE"] = "0" - os.environ["NVTE_BIAS_GELU_NVFUSION"] = "0" - os.environ["NVTE_CUDA_INCLUDE_DIR"] = "/usr/local/cuda/include" - os.environ["NVTE_DISABLE_NVRTC"] = "1" - os.environ["NVTE_FUSED_ATTN"] = "1" - os.environ["NVTE_FUSED_ATTN_USE_FAv2_BWD"] = "1" - os.environ["NCCL_SET_THREAD_NAME"] = "1'" - os.environ[ - "NCCL_DEBUG_SUBSYS" - ] = "INIT,COLL,P2P,SHM,NET,GRAPH,TUNING,ENV,ALLOC" - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "3" - os.environ["NCCL_NET_OVERHEAD"] = "2750" - os.environ["NCCL_IB_SPLIT_DATA_ON_QPS"] = "0" - os.environ["NCCL_IB_QPS_PER_CONNECTION"] = "16" - os.environ["NCCL_CTRAN_ENABLE"] = "0" - os.environ["TORCH_SHOW_CPP_STACKTRACES"] = "1" - os.environ["PYTORCH_JIT"] = "0" - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - os.environ["GLOG_minloglevel"] = "1" - gpu_ids = gpu_manager.get_gpus(num_procs) procs = host_mesh.spawn_procs( per_host={"gpus": num_procs}, From 91216bafb4907388c5c227b8cf328d8bae85a748 Mon Sep 17 00:00:00 2001 From: rithesh Date: Wed, 1 Oct 2025 16:08:40 -0700 Subject: [PATCH 13/17] failing tests --- src/forge/controller/actor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py index 8bf1d4765..bb495b641 100644 --- a/src/forge/controller/actor.py +++ b/src/forge/controller/actor.py @@ -26,6 +26,7 @@ class ForgeActor(Actor): hosts: int | None = None with_gpus: bool = False num_replicas: int = 1 + mesh_name: str | None = None _extra_config: dict[str, Any] = {} def __init__(self, *args, **kwargs): From ab5a197c050911016795ebeca041f62f77ca9d58 Mon Sep 17 00:00:00 2001 From: rithesh Date: Thu, 2 Oct 2025 14:16:10 -0700 Subject: [PATCH 14/17] some design changes --- apps/grpo/main.py | 7 +- apps/mast/main.py | 9 +- apps/mast/qwen3_14b_mast.yaml | 2 +- apps/mast/qwen3_1_7b_mast.yaml | 2 +- apps/mast/qwen3_32b_mast.yaml | 2 +- apps/mast/qwen3_4b_mast.yaml | 2 +- apps/mast/qwen3_8b_mast.yaml | 2 +- src/forge/controller/launcher.py | 318 ++++++++++++++++++ src/forge/controller/launcher/__init__.py | 5 - src/forge/controller/launcher/mast.py | 378 ---------------------- src/forge/controller/provisioner.py | 150 ++------- src/forge/types.py | 2 +- 12 files changed, 362 insertions(+), 517 deletions(-) create mode 100644 
src/forge/controller/launcher.py delete mode 100644 src/forge/controller/launcher/__init__.py delete mode 100644 src/forge/controller/launcher/mast.py diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 7545aa561..138e406b0 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -7,7 +7,6 @@ # Usage: python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml import asyncio - import time import uuid from dataclasses import dataclass @@ -27,7 +26,8 @@ from forge.actors.trainer import RLTrainer from forge.cli.config import parse from forge.controller.actor import ForgeActor -from forge.controller.provisioner import shutdown + +from forge.controller.provisioner import init_provisioner, shutdown from forge.data.rewards import MathReward, ThinkingReward from forge.observability.metric_actors import get_or_create_metric_logger from forge.observability.metrics import record_metric, Reduce @@ -312,6 +312,9 @@ async def main(cfg: DictConfig): max_req_tokens = cfg.max_req_tokens max_res_tokens = cfg.max_res_tokens + # init provisioner + await init_provisioner(cfg) + # initialize before spawning services metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) mlogger = await get_or_create_metric_logger() diff --git a/apps/mast/main.py b/apps/mast/main.py index 92d81082c..9627bcc24 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -10,9 +10,10 @@ from apps.grpo.main import main as grpo_main from forge.cli.config import parse -from forge.controller.provisioner import init_provisioner, JOB_NAME_KEY, SCHEDULER_KEY +from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY +from forge.controller.provisioner import init_provisioner -from forge.types import Scheduler +from forge.types import Launcher from omegaconf import DictConfig DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" @@ -21,8 +22,8 @@ async def main(cfg: DictConfig): """Main module for launching mast jobs for GRPO training.""" - if cfg.get(SCHEDULER_KEY, Scheduler.MAST.value) != Scheduler.MAST.value: - raise ValueError("Schuduler must be MAST.") + if cfg.get(LAUNCHER_KEY, Launcher.MAST.value) != Launcher.MAST.value: + raise ValueError("Launcher must be MAST.") if cfg.get(JOB_NAME_KEY, None) is not None: # prepend user name to the job to avoid name collision diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml index 198ecf8f2..83d5b8103 100644 --- a/apps/mast/qwen3_14b_mast.yaml +++ b/apps/mast/qwen3_14b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-14B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-14b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml index 44a3fa906..58d879579 100644 --- a/apps/mast/qwen3_1_7b_mast.yaml +++ b/apps/mast/qwen3_1_7b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-1.7B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-1_7b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml index a6818b41c..0db8f4af3 100644 --- a/apps/mast/qwen3_32b_mast.yaml +++ b/apps/mast/qwen3_32b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-32B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-32b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git 
a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml index a2962122b..92119055a 100644 --- a/apps/mast/qwen3_4b_mast.yaml +++ b/apps/mast/qwen3_4b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-4B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-4b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml index e711adbdb..7f2f99694 100644 --- a/apps/mast/qwen3_8b_mast.yaml +++ b/apps/mast/qwen3_8b_mast.yaml @@ -8,7 +8,7 @@ max_req_tokens: 512 max_res_tokens: 512 model: "Qwen/Qwen3-8B" off_by_n: 1 # Off by one by default -scheduler: mast +launcher: mast job_name: forge-qwen3-8b checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/ diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py new file mode 100644 index 000000000..2db56b2ee --- /dev/null +++ b/src/forge/controller/launcher.py @@ -0,0 +1,318 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import os +import socket +import subprocess +import uuid +from typing import Any + +import monarch + +import torchx.specs as specs + +from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints +from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer +from monarch.actor import Actor, endpoint, ProcMesh +from monarch.tools import commands +from monarch.tools.commands import info +from monarch.tools.components import hyperactor +from monarch.tools.config import Config, Workspace +from omegaconf import DictConfig + +from forge.types import Launcher + +try: + from monarch._src.actor.actor_mesh import current_rank + from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig + from monarch.tools.components.meta import hyperactor as meta_hyperactor + from torchx.specs import AppState + from torchx.specs.fb.component_helpers import Packages +except ImportError as e: + print(f"Warning: Monarch meta/fb inetrnal imports failed: {e}") + print("Monarch functionality will be limited") + +JOB_NAME_KEY = "job_name" +LAUNCHER_KEY = "launcher" + + +def _get_port() -> str: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + addr = s.getsockname() + port = addr[1] + return str(port) + + +class SetupActor(Actor): + @endpoint + def get_info(self) -> [str, str]: + return socket.gethostname(), _get_port() + + +class MastSetupActor(SetupActor): + @endpoint + def mount(self, mount_dst: str): + point = current_rank() + # The last dimension is the local proc count. 
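The mount gating in MastSetupActor.mount below reduces to "run only on the first local rank of each host", since the last extent dimension is the local proc count. A minimal sketch of the same arithmetic, with hypothetical `global_rank` and `procs_per_host` standing in for `current_rank().rank` and the size of that last dimension:

```python
# Sketch only, not part of the patch: one mount per host.
def should_mount(global_rank: int, procs_per_host: int) -> bool:
    return global_rank % procs_per_host == 0


# Two hosts with 8 procs each: only global ranks 0 and 8 perform the mount.
assert [r for r in range(16) if should_mount(r, 8)] == [0, 8]
```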
+ last_label = point.extent.labels[-1] + proc_count = point.size(last_label) + if current_rank().rank % proc_count != 0: + # Only use one rank per host to mount the directory + return + self.mount_mnt_directory(mount_dst) + + def mount_mnt_directory(self, mount_dst: str) -> None: + # Sanity check of the mounted directory + sanity_path = os.path.join(mount_dst, "huggingface_models/") + if os.path.exists(sanity_path): + print(f"Found directory {sanity_path}; skip mounting.") + return + + # Otherwise, mount the directory + if not os.path.exists(mount_dst): + os.makedirs(mount_dst, exist_ok=True) + + # Store original LD_LIBRARY_PATH to restore after mounting + original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + + try: + clean_env = os.environ.copy() + if "LD_LIBRARY_PATH" in clean_env: + del clean_env["LD_LIBRARY_PATH"] + + subprocess.run( + [ + "/packages/oil.oilfs/oilfs-wrapper", + "ws://ws.ai.pci0ai/genai_fair_llm", + mount_dst, + ], + capture_output=True, + text=True, + check=True, + env=clean_env, + ) + print("Done mounting") + except subprocess.CalledProcessError as e: + print( + f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" + ) + finally: + # Restore original LD_LIBRARY_PATH + if original_ld_library_path: + os.environ["LD_LIBRARY_PATH"] = original_ld_library_path + elif "LD_LIBRARY_PATH" in os.environ: + del os.environ["LD_LIBRARY_PATH"] + + assert os.path.exists( + sanity_path + ), f"Did not find directory {sanity_path}; something wrong with mounting." + + +class BaseLauncher: + async def initialize(self) -> None: + pass + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + pass + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + pass + + +class Slurmlauncher(BaseLauncher): + def __init__(self, cfg: DictConfig | None = None): + self.cfg = cfg + + async def initialize(self) -> None: + pass + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + appdef = hyperactor.host_mesh( + image="test", meshes=[f"{name}:{num_hosts}:gpu.small"] + ) + for role in appdef.roles: + # Note - this is hardcoded to SLURM + # We got this with sinfo + role.resource.memMB = 2062607 + role.resource.cpu = 128 + role.resource.gpu = 8 + + # TODO - multi scheduler support + server_config = Config( + scheduler="slurm", + appdef=appdef, + workspace=monarch.tools.config.workspace.Workspace(dirs=[""]), + ) + server_info = await commands.get_or_create( + "forge_job", + server_config, + force_restart=False, + ) + alloc = RemoteAllocator( + world_id=name, + initializer=TorchXRemoteAllocInitializer(server_info.server_handle), + ) + server_name = f"slurm:///{server_info.name}" + return alloc, None, server_name # (Allocator, AllocConstraints, SeverName) + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + setup = procs.spawn(f"setup-{uuid.uuid1()}", SetupActor) + return await setup.get_info.choose() + + +class Mastlauncher(BaseLauncher): + def __init__(self, cfg: DictConfig | None = None): + assert cfg is not None + self.cfg = cfg + job_name = cfg.get(JOB_NAME_KEY, None) + self.job_name = job_name or self.create_job_name() + self.default_monarch_port = 26600 + self.scheduler_name = "mast_conda" + + # TODO: enabe taking this from config + self.sku = "gtt_any" + self.timeout_sec = 1 * 60 * 60 # Kill the job if idle for 1 hour + self.user = getpass.getuser() + self.work_dir = f"/data/users/{self.user}" + self.edittable_workspaces = ["forge"] + self.remote_work_dir = 
"/packages/monarch_default_workspace/workspace/" + self.editable_workspace_paths = [ + f"{self.work_dir}/{workspace}" for workspace in self.edittable_workspaces + ] + + async def initialize(self) -> None: + await self.launch_mast_job() + + async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]: + allocator = MastAllocator( + MastAllocatorConfig( + job_name=self.job_name, + remote_allocator_port=self.default_monarch_port, + ), + ) + alloc_constraints = AllocConstraints( + {MastAllocator.ALLOC_LABEL_TASK_GROUP: name} + ) + + return allocator, alloc_constraints, self.create_server_handle() + + async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: + setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) + await setup.mount.call(mount_dst="/mnt/wsfuse") + return await setup.get_info.choose() + + async def launch_mast_job(self): + handle = self.create_server_handle() + server_spec = info(handle) + if server_spec and server_spec.state == AppState.RUNNING: + print(f"Job {self.job_name} is already running. Skipping launch.") + return server_spec + + config = Config( + scheduler="mast_conda", + scheduler_args={ + "hpcIdentity": "hyper_monarch", + "hpcJobOncall": "monarch", + "hpcClusterUuid": "MastProdCluster", + "rmAttribution": "pytorch4all_clients_approved", + }, + appdef=self.build_appdef(), + workspace=Workspace( + dirs=[workspace_dir for workspace_dir in self.editable_workspace_paths], + ), + ) + + await commands.get_or_create(self.job_name, config) + return server_spec + + def add_additional_packages(self, packages: "Packages") -> "Packages": + packages.add_package("oil.oilfs:stable") + packages.add_package("manifold.manifoldfs") + return packages + + def build_appdef(self) -> specs.AppDef: + + # create the app definition for the worker + remote_end_python_path = ":".join( + [ + f"{self.remote_work_dir}{workspace}" + for workspace in self.editable_workspace_paths + ] + ) + + default_envs = { + **meta_hyperactor.DEFAULT_NVRT_ENVS, + **meta_hyperactor.DEFAULT_NCCL_ENVS, + **meta_hyperactor.DEFAULT_TORCH_ENVS, + **{ + "TORCHX_RUN_PYTHONPATH": f"{remote_end_python_path}:{self.remote_work_dir}" + }, + **{ + "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", + "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", + "TORCHINDUCTOR_COMPILE_THREADS": "1", + "TORCH_COMPILE_DISABLE": "1", + "TORCHDYNAMO_VERBOSE": "1", + "VLLM_TORCH_COMPILE_LEVEL": "0", + "VLLM_USE_TRITON_FLASH_ATTN": "0", + }, + } + + print("DEFAULT ENVS: ", default_envs) + + packages = Packages() + meshes = [] + # Process both services and actors configurations + for mesh_name, config in self.cfg["services"].items(): + num_replicas = config["num_replicas"] + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # Create list of mesh names with indices and num_hosts + if with_gpus and num_hosts > 0: + mesh_list = [ + f"{mesh_name}_{i}:{num_hosts}:{self.sku}" + for i in range(num_replicas) + ] + meshes.extend(mesh_list) + + for mesh_name, config in self.cfg["actors"].items(): + num_replicas = 1 + with_gpus = bool(config["with_gpus"]) + num_hosts = int(config.get("hosts", 0)) + # single actors with GPUs + if with_gpus: + meshes.append(f"{mesh_name}:{num_replicas}:{self.sku}") + + appdef = meta_hyperactor.host_mesh_conda( + meshes=meshes, + additional_packages=self.add_additional_packages(packages), + timeout_sec=self.timeout_sec, + env=default_envs, + ) + + for role in appdef.roles: + role.resource.capabilities["server_sub_types"] = [ + # 
role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE + role.resource.capabilities["server_sub_types"][1] # GTT + ] + + return appdef + + def create_job_name(self): + return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + + def create_server_handle(self) -> str: + return f"{self.scheduler_name}:///{self.job_name}" + + +def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: + launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + if launcher == Launcher.MAST.value: + return Mastlauncher(cfg) + else: + return Slurmlauncher() diff --git a/src/forge/controller/launcher/__init__.py b/src/forge/controller/launcher/__init__.py deleted file mode 100644 index 2e41cd717..000000000 --- a/src/forge/controller/launcher/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/src/forge/controller/launcher/mast.py b/src/forge/controller/launcher/mast.py deleted file mode 100644 index 1aacd87dd..000000000 --- a/src/forge/controller/launcher/mast.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import asyncio -import functools -import getpass -import logging -import os -import socket -import subprocess -import uuid -from typing import Optional - -import torchx.specs as specs -from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints - -from forge.observability.metric_actors import get_or_create_metric_logger - -try: - from monarch._src.actor.actor_mesh import current_rank - from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig - from monarch._src.actor.shape import NDSlice, Shape - from monarch.tools.components.meta import hyperactor - from torchx.specs import AppState - from torchx.specs.fb.component_helpers import Packages -except ImportError as e: - print(f"Warning: Monarch imports failed: {e}") - print("Monarch functionality will be limited") -from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host -from monarch.tools import commands -from monarch.tools.commands import info -from monarch.tools.config import Config, Workspace -from omegaconf import DictConfig - -from forge.controller.provisioner import BaseProvisioner, GpuManager, JOB_NAME_KEY - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -SCHEDULER_NAME = "mast_conda" -SKU = "gtt_any" -TIMEOUT_SEC = 1 * 60 * 60 # Kill the job if idle for 1 hour - -USER = getpass.getuser() -WORK_DIR = f"/data/users/{USER}" # on DEVGPU -EDITABLE_WORKSPACES = ["forge"] -REMOTE_WORK_DIR = "/packages/monarch_default_workspace/workspace/" - -EDITABLE_WORKSPACE_PATHS = [ - f"{WORK_DIR}/{workspace}" for workspace in EDITABLE_WORKSPACES -] - - -def _get_port() -> str: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) - addr = s.getsockname() - port = addr[1] - return str(port) - - -class MastSetupActor(Actor): - @endpoint - def get_info(self) -> [str, str]: - return socket.gethostname(), _get_port() - - @endpoint - def mount(self, mount_dst: str): - point = current_rank() - # The last dimension is the local proc count. 
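Stepping back from the deleted module: the new src/forge/controller/launcher.py above ends with a get_launcher() factory keyed on the `launcher:` entry of the YAML config. A minimal sketch of that selection, assuming the Launcher enum keeps the mast/slurm/local values introduced for Scheduler earlier in the series and using a plain dict for the config:

```python
# Sketch only, not part of the patch: how the launcher kind is chosen from the config.
from enum import Enum


class Launcher(Enum):  # mirrors forge.types.Launcher (formerly Scheduler)
    MAST = "mast"
    SLURM = "slurm"
    LOCAL = "local"


LAUNCHER_KEY = "launcher"


def pick_launcher_kind(cfg: dict) -> Launcher:
    # get_launcher() falls back to LOCAL when the YAML has no `launcher:` entry
    return Launcher(cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value))


assert pick_launcher_kind({"launcher": "mast"}) is Launcher.MAST
assert pick_launcher_kind({}) is Launcher.LOCAL
```

Note that in the patch itself every non-MAST value currently falls through to the Slurm launcher.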
- last_label = point.extent.labels[-1] - proc_count = point.size(last_label) - if current_rank().rank % proc_count != 0: - # Only use one rank per host to mount the directory - return - self.mount_mnt_directory(mount_dst) - - def mount_mnt_directory(self, mount_dst: str) -> None: - # Sanity check of the mounted directory - sanity_path = os.path.join(mount_dst, "huggingface_models/") - if os.path.exists(sanity_path): - print(f"Found directory {sanity_path}; skip mounting.") - return - - # Otherwise, mount the directory - if not os.path.exists(mount_dst): - os.makedirs(mount_dst, exist_ok=True) - - # Store original LD_LIBRARY_PATH to restore after mounting - original_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") - - try: - clean_env = os.environ.copy() - if "LD_LIBRARY_PATH" in clean_env: - del clean_env["LD_LIBRARY_PATH"] - - subprocess.run( - [ - "/packages/oil.oilfs/oilfs-wrapper", - "ws://ws.ai.pci0ai/genai_fair_llm", - mount_dst, - ], - capture_output=True, - text=True, - check=True, - env=clean_env, - ) - print("Done mounting") - except subprocess.CalledProcessError as e: - print( - f"Get error during mounting {e}, Stderr: {e.stderr}, Stdout: {e.stdout}" - ) - finally: - # Restore original LD_LIBRARY_PATH - if original_ld_library_path: - os.environ["LD_LIBRARY_PATH"] = original_ld_library_path - elif "LD_LIBRARY_PATH" in os.environ: - del os.environ["LD_LIBRARY_PATH"] - - assert os.path.exists( - sanity_path - ), f"Did not find directory {sanity_path}; something wrong with mounting." - - -class MastProvisioner(BaseProvisioner): - def __init__(self, cfg: DictConfig | None = None): - self._server_names = [] - self._proc_server_map = {} - self._lock = asyncio.Lock() - self._this_host_id = uuid.uuid1() - available_local_devices = None - cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) - if cuda_visible_devices is not None and cuda_visible_devices.strip(): - try: - available_local_devices = set( - int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip() - ) - except ValueError as e: - raise ValueError( - f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. " - f"Expected comma-separated integers (e.g., '0,1,2'). 
Error: {e}" - ) from e - self._host_gpu_map = { - self._this_host_id: GpuManager(available_local_devices), - } - assert cfg is not None - self.cfg = cfg - job_name = cfg.get(JOB_NAME_KEY, None) - self.job_name = job_name or self.create_job_name() - - async def initialize(self): - """Call this after creating the instance""" - await self.launch_mast_job() - - async def get_mast_allocator( - self, - job_name: str, - task_group: str, - ): - allocator = MastAllocator( - MastAllocatorConfig( - job_name=job_name, - remote_allocator_port=26600, # This is the default monarch port - ), - ) - alloc_constraints = AllocConstraints( - {MastAllocator.ALLOC_LABEL_TASK_GROUP: task_group} - ) - - return allocator, alloc_constraints - - async def create_host_mesh(self, name: str, num_hosts: int): - """Creates a remote server and a HostMesh on it.""" - logger.debug(f"Creating remote server for mesh: {name}") - server_name = f"{SCHEDULER_NAME}:///{self.job_name}" - alloc, alloc_constraints = await self.get_mast_allocator( - task_group=name, job_name=self.job_name - ) - return ( - HostMesh( - shape=Shape(["hosts"], NDSlice.new_row_major([num_hosts])), - allocator=alloc, - alloc_constraints=alloc_constraints, - ), - server_name, - ) - - async def get_proc_mesh( - self, - num_procs: int, - with_gpus: bool = False, - num_hosts: int | None = None, - mesh_name: Optional[str] = None, - ): - """Gets a proc mesh. - - num_hosts = None implies that you want a local allocation, this may change. - - """ - async with self._lock: - server_name = None - if num_hosts is not None and num_hosts > 0: - assert mesh_name is not None - host_mesh, server_name = await self.create_host_mesh( - name=mesh_name, - num_hosts=num_hosts, - ) - host_id = uuid.uuid1() - gpu_manager = GpuManager() - self._host_gpu_map[host_id] = gpu_manager - else: - host_mesh = this_host() - gpu_manager = self._host_gpu_map[self._this_host_id] - host_mesh._host_id = self._this_host_id - - if with_gpus: - - def bootstrap(gpu_ids: list[str]): - # This works for single host, needed for vLLM currently. - import os - - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids) - os.environ["MASTER_ADDR"] = socket.gethostname() - os.environ["MASTER_PORT"] = f"1234{gpu_ids[0]}" - os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600" - os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824" - - gpu_ids = gpu_manager.get_gpus(num_procs) - procs = host_mesh.spawn_procs( - per_host={"gpus": num_procs}, - bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), - ) - await procs.initialized - setup = procs.spawn(f"setup-{uuid.uuid1()}", MastSetupActor) - hostname, port = await setup.get_info.choose() - await setup.mount.call(mount_dst="/mnt/wsfuse") - procs._hostname = hostname - procs._port = port - procs._gpu_ids = gpu_ids - else: - procs = host_mesh.spawn_procs(per_host={"gpus": num_procs}) - - procs._host = host_mesh - - # If we created a server, track so we can tear it down later. 
- if server_name: - self._server_names.append(server_name) - self._proc_server_map[procs] = server_name - - _ = await get_or_create_metric_logger(procs) - - return procs - - async def stop_proc_mesh(self, proc_mesh: ProcMesh): - """Stops a proc mesh.""" - async with self._lock: - # Deregister local logger from global logger - if hasattr(proc_mesh, "_local_fetcher"): - global_logger = await get_or_create_metric_logger(proc_mesh) - await global_logger.deregister_fetcher.call_one(proc_mesh) - - if hasattr(proc_mesh, "_gpu_ids"): - gpu_manager = self._host_gpu_map[proc_mesh._host._host_id] - gpu_manager.release_gpus(proc_mesh._gpu_ids) - await proc_mesh.stop() - if proc_mesh in self._proc_server_map: - server_name = self._proc_server_map[proc_mesh] - commands.kill(server_name) - - async def shutdown(self): - """Tears down all remaining remote allocations.""" - async with self._lock: - for server_name in self._server_names: - commands.kill(server_name) - - async def launch_mast_job(self): - handle = self.create_server_handle() - server_spec = info(handle) - if server_spec and server_spec.state == AppState.RUNNING: - print(f"Job {self.job_name} is already running. Skipping launch.") - return server_spec - - config = Config( - scheduler="mast_conda", - scheduler_args={ - # NOTE: TODO: support passing these args from CLI - "hpcIdentity": "hyper_monarch", - # "hpcIdentity": "genai_llm_pretraining_data", - # "hpcIdentity": "pytorch_distributed", - "hpcJobOncall": "monarch", - "hpcClusterUuid": "MastProdCluster", - "rmAttribution": "pytorch4all_clients_approved", - }, - appdef=self.build_appdef(), - workspace=Workspace( - dirs=[workspace_dir for workspace_dir in EDITABLE_WORKSPACE_PATHS], - ), - ) - - await commands.get_or_create(self.job_name, config) - return server_spec - - def add_additional_packages(self, packages: Packages) -> Packages: - packages.add_package("oil.oilfs:stable") - packages.add_package("manifold.manifoldfs") - return packages - - def build_appdef(self) -> specs.AppDef: - - # create the app definition for the worker - REMOTE_END_PYTHONPATH = ":".join( - [f"{REMOTE_WORK_DIR}{workspace}" for workspace in EDITABLE_WORKSPACE_PATHS] - ) - - default_envs = { - **hyperactor.DEFAULT_NVRT_ENVS, - **hyperactor.DEFAULT_NCCL_ENVS, - **hyperactor.DEFAULT_TORCH_ENVS, - **{"TORCHX_RUN_PYTHONPATH": f"{REMOTE_END_PYTHONPATH}:{REMOTE_WORK_DIR}"}, - **{ - "HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS": "600", - "HYPERACTOR_CODE_MAX_FRAME_LENGTH": "1073741824", - "TORCHINDUCTOR_COMPILE_THREADS": "1", - "TORCH_COMPILE_DISABLE": "1", - "TORCHDYNAMO_VERBOSE": "1", - "VLLM_TORCH_COMPILE_LEVEL": "0", - "VLLM_USE_TRITON_FLASH_ATTN": "0", - }, - } - - print("DEFAULT ENVS: ", default_envs) - - packages = Packages() - meshes = [] - # Process both services and actors configurations - for mesh_name, config in self.cfg["services"].items(): - num_replicas = config["num_replicas"] - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) - # Create list of mesh names with indices and num_hosts - if with_gpus and num_hosts > 0: - mesh_list = [ - f"{mesh_name}_{i}:{num_hosts}:{SKU}" for i in range(num_replicas) - ] - meshes.extend(mesh_list) - - for mesh_name, config in self.cfg["actors"].items(): - num_replicas = 1 - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) - # single actors with GPUs - if with_gpus: - meshes.append(f"{mesh_name}:{num_replicas}:{SKU}") - - appdef = hyperactor.host_mesh_conda( - meshes=meshes, - 
additional_packages=self.add_additional_packages(packages), - timeout_sec=TIMEOUT_SEC, - env=default_envs, - ) - - for role in appdef.roles: - role.resource.capabilities["server_sub_types"] = [ - # role.resource.capabilities["server_sub_types"][2] # hardcoded to ROCE - role.resource.capabilities["server_sub_types"][1] # GTT - ] - - return appdef - - def create_job_name(self): - return f"{USER}-forge-{uuid.uuid4().hex[:6]}" - - def create_server_handle(self) -> str: - return f"{SCHEDULER_NAME}:///{self.job_name}" diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index f4344909d..dd3d05efd 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -12,43 +12,22 @@ import os import socket import uuid -from abc import ABC, abstractmethod from typing import Optional -import monarch -from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host +from monarch.actor import HostMesh, ProcMesh, this_host from monarch.tools import commands -from monarch.tools.components import hyperactor -from monarch.tools.config import Config - from omegaconf import DictConfig +from forge.controller.launcher import BaseLauncher, get_launcher + from forge.observability.metric_actors import get_or_create_metric_logger -from forge.types import ProcessConfig, Scheduler +from forge.types import ProcessConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -JOB_NAME_KEY = "job_name" -SCHEDULER_KEY = "scheduler" - - -def _get_port() -> str: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) - addr = s.getsockname() - port = addr[1] - return str(port) - - -class _SetupActor(Actor): - @endpoint - def get_info(self) -> [str, str]: - return socket.gethostname(), _get_port() - class GpuManager: """Tracks and assigns GPU devices on a host. @@ -83,57 +62,10 @@ def release_gpus(self, gpu_ids: list[str]) -> None: self.available_gpus.add(int(gpu_id)) -class BaseProvisioner(ABC): - """Abstract base class for resource provisioners.""" - - @abstractmethod - async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: - """Creates a remote server and a HostMesh on it. - Args: - name: Name identifier for the host mesh - num_hosts: Number of hosts to create - Returns: - HostMesh: The created host mesh - """ - pass - - @abstractmethod - async def get_proc_mesh( - self, - num_procs: int, - with_gpus: bool = False, - num_hosts: Optional[int] = None, - mesh_name: Optional[str] = None, - ) -> ProcMesh: - """Gets a proc mesh. - Args: - num_procs: Number of processes needed - with_gpus: Whether GPU support is required - num_hosts: Number of hosts (None implies local allocation) - mesh_name: Name identifier for the proc mesh - Returns: - ProcMesh: The allocated process mesh - """ - pass - - @abstractmethod - async def stop_proc_mesh(self, proc_mesh: ProcMesh) -> None: - """Stops a proc mesh. 
- Args: - proc_mesh: The process mesh to stop - """ - pass - - @abstractmethod - async def shutdown(self) -> None: - """Tears down all remaining remote allocations.""" - pass - - -class Provisioner(BaseProvisioner): +class Provisioner: """A global resource provisioner.""" - def __init__(self): + def __init__(self, cfg: DictConfig | None = None): self._server_names = [] self._proc_server_map = {} self._lock = asyncio.Lock() @@ -162,39 +94,25 @@ def __init__(self): self._host_gpu_map = { self._this_host_id: GpuManager(available_local_devices), } + self.launcher: BaseLauncher = get_launcher(cfg) + + async def initialize(self): + """Call this after creating the instance""" + await self.launcher.initialize() async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" # no need to lock here because this is already locked behind `get_proc_mesh` logger.debug(f"Creating remote server for alloc {name}") - appdef = hyperactor.host_mesh( - image="test", meshes=[f"{name}:{num_hosts}:gpu.small"] - ) - for role in appdef.roles: - # Note - this is hardcoded to SLURM - # We got this with sinfo - role.resource.memMB = 2062607 - role.resource.cpu = 128 - role.resource.gpu = 8 - - # TODO - multi scheduler support - server_config = Config( - scheduler="slurm", - appdef=appdef, - workspace=monarch.tools.config.workspace.Workspace(dirs=[""]), - ) - server_info = await commands.get_or_create( - "forge_job", - server_config, - force_restart=False, - ) - alloc = RemoteAllocator( - world_id=name, - initializer=TorchXRemoteAllocInitializer(server_info.server_handle), + alloc, alloc_constraints, server_name = await self.launcher.get_allocator( + name, num_hosts ) - server_name = f"slurm:///{server_info.name}" return ( - HostMesh(Shape(["hosts"], NDSlice.new_row_major([num_hosts])), alloc), + HostMesh( + Shape(["hosts"], NDSlice.new_row_major([num_hosts])), + allocator=alloc, + alloc_constraints=alloc_constraints, + ), server_name, ) @@ -215,7 +133,7 @@ async def get_proc_mesh( if num_hosts is not None and num_hosts > 0: created_hosts = len(self._server_names) host_mesh, server_name = await self.create_host_mesh( - name=f"alloc-{created_hosts}", + name=mesh_name, num_hosts=num_hosts, ) host_id = uuid.uuid1() @@ -257,11 +175,10 @@ def bootstrap(gpu_ids: list[str]): per_host={"gpus": num_procs}, bootstrap=functools.partial(bootstrap, gpu_ids=gpu_ids), ) - setup = procs.spawn(f"setup-{uuid.uuid1()}", _SetupActor) # Pick a random host/port, we'll feed this in afterwards # Once we have true HostMesh support, we can do this on proc 0 of each host # then spin up the proc meshes with the environment afterwards. 
- hostname, port = await setup.get_info.choose() + hostname, port = await self.launcher.remote_setup(procs) procs._hostname = hostname procs._port = port procs._gpu_ids = gpu_ids @@ -303,34 +220,25 @@ async def shutdown(self): commands.kill(server_name) -_provisioner: BaseProvisioner | None = None +_provisioner: Provisioner | None = None async def init_provisioner(cfg: DictConfig | None = None): global _provisioner if not _provisioner: - scheduler = Scheduler.LOCAL - if cfg is not None: - scheduler = cfg.get(SCHEDULER_KEY, Scheduler.LOCAL.value) - if scheduler == Scheduler.MAST.value: - from forge.controller.launcher.mast import MastProvisioner - - _provisioner = MastProvisioner(cfg=cfg) - await _provisioner.initialize() - else: - _provisioner = Provisioner() + _provisioner = Provisioner(cfg) + await _provisioner.initialize() return _provisioner -async def _get_provisioner(): +def _get_provisioner(): if not _provisioner: - await init_provisioner() + raise RuntimeError("Provisioner not initialized") return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - provisioner = await _get_provisioner() - return await provisioner.get_proc_mesh( + return await _get_provisioner().get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, @@ -339,11 +247,9 @@ async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: async def stop_proc_mesh(proc_mesh: ProcMesh): - provisioner = await _get_provisioner() - return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) + return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - provisioner = await _get_provisioner() - return await provisioner.shutdown() + await _get_provisioner().shutdown() diff --git a/src/forge/types.py b/src/forge/types.py index 271797d95..16585922d 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -88,7 +88,7 @@ class State: metadata: dict[str, Any] = field(default_factory=dict) -class Scheduler(Enum): +class Launcher(Enum): MAST = "mast" SLURM = "slurm" LOCAL = "local" From 9d41973497496f70a47cf3ddfa8dfca8799653ef Mon Sep 17 00:00:00 2001 From: rithesh Date: Thu, 2 Oct 2025 17:25:03 -0700 Subject: [PATCH 15/17] unit test issues --- src/forge/controller/launcher.py | 9 ++++++--- src/forge/controller/provisioner.py | 23 +++++++++++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 2db56b2ee..5370b495a 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -15,6 +15,8 @@ import torchx.specs as specs +from forge.types import Launcher + from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch.actor import Actor, endpoint, ProcMesh @@ -24,8 +26,6 @@ from monarch.tools.config import Config, Workspace from omegaconf import DictConfig -from forge.types import Launcher - try: from monarch._src.actor.actor_mesh import current_rank from monarch._src.actor.meta.allocator import MastAllocator, MastAllocatorConfig @@ -311,7 +311,10 @@ def create_server_handle(self) -> str: def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: - launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + if cfg is not None: + launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) + else: + launcher = Launcher.LOCAL.value if launcher == Launcher.MAST.value: return 
Mastlauncher(cfg) else: diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index dd3d05efd..8dea22d28 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -14,17 +14,17 @@ import uuid from typing import Optional -from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import HostMesh, ProcMesh, this_host -from monarch.tools import commands -from omegaconf import DictConfig - from forge.controller.launcher import BaseLauncher, get_launcher from forge.observability.metric_actors import get_or_create_metric_logger from forge.types import ProcessConfig +from monarch._src.actor.shape import NDSlice, Shape +from monarch.actor import HostMesh, ProcMesh, this_host +from monarch.tools import commands +from omegaconf import DictConfig + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -231,14 +231,15 @@ async def init_provisioner(cfg: DictConfig | None = None): return _provisioner -def _get_provisioner(): +async def _get_provisioner(): if not _provisioner: - raise RuntimeError("Provisioner not initialized") + await init_provisioner() return _provisioner async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: - return await _get_provisioner().get_proc_mesh( + provisioner = await _get_provisioner() + return await provisioner.get_proc_mesh( num_procs=config.procs, with_gpus=config.with_gpus, num_hosts=config.hosts, @@ -247,9 +248,11 @@ async def get_proc_mesh(config: ProcessConfig) -> ProcMesh: async def stop_proc_mesh(proc_mesh: ProcMesh): - return await _get_provisioner().stop_proc_mesh(proc_mesh=proc_mesh) + provisioner = await _get_provisioner() + return await provisioner.stop_proc_mesh(proc_mesh=proc_mesh) async def shutdown(): logger.info("Shutting down provisioner..") - await _get_provisioner().shutdown() + provisioner = await _get_provisioner() + return await provisioner.shutdown() From 6dbc5eedb3ab4d9625f4a5a3c7e9acdd837b4230 Mon Sep 17 00:00:00 2001 From: rithesh Date: Fri, 3 Oct 2025 11:23:57 -0700 Subject: [PATCH 16/17] suggested changes --- apps/grpo/main.py | 21 +++++++++++++-- apps/mast/main.py | 19 +++++++++++-- src/forge/controller/launcher.py | 42 ++++++++++++++--------------- src/forge/controller/provisioner.py | 24 ++++++++++------- src/forge/types.py | 18 ++++++++++++- 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/apps/grpo/main.py b/apps/grpo/main.py index 138e406b0..2439100d9 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -26,12 +26,20 @@ from forge.actors.trainer import RLTrainer from forge.cli.config import parse from forge.controller.actor import ForgeActor - +from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY from forge.controller.provisioner import init_provisioner, shutdown from forge.data.rewards import MathReward, ThinkingReward from forge.observability.metric_actors import get_or_create_metric_logger from forge.observability.metrics import record_metric, Reduce from forge.observability.perf_tracker import Tracer + +from forge.types import ( + Launcher, + LauncherConfig, + ProcessConfig, + ProvisionerConfig, + ServiceConfig, +) from forge.util.ops import compute_logprobs from monarch.actor import endpoint from omegaconf import DictConfig @@ -313,7 +321,16 @@ async def main(cfg: DictConfig): max_res_tokens = cfg.max_res_tokens # init provisioner - await init_provisioner(cfg) + await init_provisioner( + ProvisionerConfig( + launcher_config=LauncherConfig( + launcher=Launcher(cfg.get(LAUNCHER_KEY, 
Launcher.SLURM.value)), + job_name=cfg.get(JOB_NAME_KEY, None), + services={k: ServiceConfig(**v) for k, v in cfg.services.items()}, + actors={k: ProcessConfig(**v) for k, v in cfg.actors.items()}, + ) + ) + ) # initialize before spawning services metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}}) diff --git a/apps/mast/main.py b/apps/mast/main.py index 9627bcc24..cd5de0be9 100644 --- a/apps/mast/main.py +++ b/apps/mast/main.py @@ -13,7 +13,13 @@ from forge.controller.launcher import JOB_NAME_KEY, LAUNCHER_KEY from forge.controller.provisioner import init_provisioner -from forge.types import Launcher +from forge.types import ( + Launcher, + LauncherConfig, + ProcessConfig, + ProvisionerConfig, + ServiceConfig, +) from omegaconf import DictConfig DEFAULT_CHECKPOINT_FOLDER_KEY = "checkpoint_folder" @@ -39,7 +45,16 @@ async def main(cfg: DictConfig): print(f"Overriding checkpoint folder to {cfg[DEFAULT_CHECKPOINT_FOLDER_KEY]}") # init mast provisioner - await init_provisioner(cfg) + await init_provisioner( + ProvisionerConfig( + launcher_config=LauncherConfig( + launcher=Launcher(cfg.get(LAUNCHER_KEY, Launcher.MAST.value)), + job_name=cfg.get(JOB_NAME_KEY, None), + services={k: ServiceConfig(**v) for k, v in cfg.services.items()}, + actors={k: ProcessConfig(**v) for k, v in cfg.actors.items()}, + ) + ) + ) await grpo_main(cfg) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 5370b495a..00493e889 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -15,8 +15,6 @@ import torchx.specs as specs -from forge.types import Launcher - from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer from monarch.actor import Actor, endpoint, ProcMesh @@ -24,7 +22,8 @@ from monarch.tools.commands import info from monarch.tools.components import hyperactor from monarch.tools.config import Config, Workspace -from omegaconf import DictConfig + +from forge.types import Launcher, LauncherConfig try: from monarch._src.actor.actor_mesh import current_rank @@ -125,7 +124,7 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Slurmlauncher(BaseLauncher): - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: LauncherConfig | None = None): self.cfg = cfg async def initialize(self) -> None: @@ -166,11 +165,9 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Mastlauncher(BaseLauncher): - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: LauncherConfig | None = None): assert cfg is not None self.cfg = cfg - job_name = cfg.get(JOB_NAME_KEY, None) - self.job_name = job_name or self.create_job_name() self.default_monarch_port = 26600 self.scheduler_name = "mast_conda" @@ -184,6 +181,7 @@ def __init__(self, cfg: DictConfig | None = None): self.editable_workspace_paths = [ f"{self.work_dir}/{workspace}" for workspace in self.edittable_workspaces ] + self.job_name = self.cfg.job_name or self.create_job_name() async def initialize(self) -> None: await self.launch_mast_job() @@ -268,10 +266,10 @@ def build_appdef(self) -> specs.AppDef: packages = Packages() meshes = [] # Process both services and actors configurations - for mesh_name, config in self.cfg["services"].items(): - num_replicas = config["num_replicas"] - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) + for mesh_name, service in 
self.cfg.services.items(): + num_replicas = service.num_replicas + with_gpus = bool(service.with_gpus) + num_hosts = int(service.hosts or 0) # Create list of mesh names with indices and num_hosts if with_gpus and num_hosts > 0: mesh_list = [ @@ -280,10 +278,10 @@ def build_appdef(self) -> specs.AppDef: ] meshes.extend(mesh_list) - for mesh_name, config in self.cfg["actors"].items(): + for mesh_name, actor in self.cfg.actors.items(): num_replicas = 1 - with_gpus = bool(config["with_gpus"]) - num_hosts = int(config.get("hosts", 0)) + with_gpus = bool(actor.with_gpus) + num_hosts = int(actor.hosts or 0) # single actors with GPUs if with_gpus: meshes.append(f"{mesh_name}:{num_replicas}:{self.sku}") @@ -304,18 +302,18 @@ def build_appdef(self) -> specs.AppDef: return appdef def create_job_name(self): - return f"{USER}-forge-{uuid.uuid4().hex[:6]}" + return f"{self.user}-forge-{uuid.uuid4().hex[:6]}" def create_server_handle(self) -> str: return f"{self.scheduler_name}:///{self.job_name}" -def get_launcher(cfg: DictConfig | None = None) -> BaseLauncher: - if cfg is not None: - launcher = cfg.get(LAUNCHER_KEY, Launcher.LOCAL.value) - else: - launcher = Launcher.LOCAL.value - if launcher == Launcher.MAST.value: +def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None: + if not cfg: + return None + if cfg.launcher == Launcher.MAST: return Mastlauncher(cfg) + elif cfg.launcher == Launcher.SLURM: + return Slurmlauncher(cfg) else: - return Slurmlauncher() + raise ValueError(f"Unsupported config provided, got {cfg}") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 8dea22d28..7d55b1c44 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -14,16 +14,15 @@ import uuid from typing import Optional +from monarch._src.actor.shape import NDSlice, Shape +from monarch.actor import HostMesh, ProcMesh, this_host +from monarch.tools import commands + from forge.controller.launcher import BaseLauncher, get_launcher from forge.observability.metric_actors import get_or_create_metric_logger -from forge.types import ProcessConfig - -from monarch._src.actor.shape import NDSlice, Shape -from monarch.actor import HostMesh, ProcMesh, this_host -from monarch.tools import commands -from omegaconf import DictConfig +from forge.types import ProcessConfig, ProvisionerConfig logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -65,7 +64,7 @@ def release_gpus(self, gpu_ids: list[str]) -> None: class Provisioner: """A global resource provisioner.""" - def __init__(self, cfg: DictConfig | None = None): + def __init__(self, cfg: ProvisionerConfig | None = None): self._server_names = [] self._proc_server_map = {} self._lock = asyncio.Lock() @@ -94,11 +93,16 @@ def __init__(self, cfg: DictConfig | None = None): self._host_gpu_map = { self._this_host_id: GpuManager(available_local_devices), } - self.launcher: BaseLauncher = get_launcher(cfg) + self.launcher: BaseLauncher | None = get_launcher( + cfg.launcher_config if cfg is not None else None + ) + if not self.launcher: + logger.warning("Launcher not provided, remote allocations will not work.") async def initialize(self): """Call this after creating the instance""" - await self.launcher.initialize() + if self.launcher is not None: + await self.launcher.initialize() async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" @@ -223,7 +227,7 @@ async def shutdown(self): _provisioner: Provisioner | 
None = None -async def init_provisioner(cfg: DictConfig | None = None): +async def init_provisioner(cfg: ProvisionerConfig | None = None): global _provisioner if not _provisioner: _provisioner = Provisioner(cfg) diff --git a/src/forge/types.py b/src/forge/types.py index 16585922d..f79e3ef2c 100644 --- a/src/forge/types.py +++ b/src/forge/types.py @@ -91,7 +91,6 @@ class State: class Launcher(Enum): MAST = "mast" SLURM = "slurm" - LOCAL = "local" @dataclass @@ -141,3 +140,20 @@ def to_process_config(self) -> ProcessConfig: Scalar = Union[int, float] + + +@dataclass +class LauncherConfig: + """A launcher config for the scheduler.""" + + launcher: Launcher + job_name: str + services: dict[str, ServiceConfig] + actors: dict[str, ProcessConfig] + + +@dataclass +class ProvisionerConfig: + """A config for the forge provisioner.""" + + launcher_config: LauncherConfig From d313c59269d2c2b79014f92136257e82b82193d0 Mon Sep 17 00:00:00 2001 From: rithesh Date: Fri, 3 Oct 2025 11:51:12 -0700 Subject: [PATCH 17/17] failing tests --- src/forge/controller/launcher.py | 11 +++-------- src/forge/controller/provisioner.py | 5 +++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py index 00493e889..cd54c00b0 100644 --- a/src/forge/controller/launcher.py +++ b/src/forge/controller/launcher.py @@ -124,9 +124,6 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]: class Slurmlauncher(BaseLauncher): - def __init__(self, cfg: LauncherConfig | None = None): - self.cfg = cfg - async def initialize(self) -> None: pass @@ -309,11 +306,9 @@ def create_server_handle(self) -> str: def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None: - if not cfg: - return None - if cfg.launcher == Launcher.MAST: + if not cfg or cfg.launcher == Launcher.SLURM: + return Slurmlauncher() + elif cfg.launcher == Launcher.MAST: return Mastlauncher(cfg) - elif cfg.launcher == Launcher.SLURM: - return Slurmlauncher(cfg) else: raise ValueError(f"Unsupported config provided, got {cfg}") diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py index 7d55b1c44..d66504707 100644 --- a/src/forge/controller/provisioner.py +++ b/src/forge/controller/provisioner.py @@ -107,6 +107,11 @@ async def initialize(self): async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: """Creates a remote server and a HostMesh on it.""" # no need to lock here because this is already locked behind `get_proc_mesh` + if not self.launcher: + raise RuntimeError( + "You tried to create a remote allocation by specifying the number of hosts on an actor or service, " + "but no launcher was specified." + ) logger.debug(f"Creating remote server for alloc {name}") alloc, alloc_constraints, server_name = await self.launcher.get_allocator( name, num_hosts