Skip to content

Commit 0db9ed6

Browse files
authored
Merge branch 'main' into provisioner_shutdown
2 parents d4f3d57 + b7d2526 commit 0db9ed6

File tree

19 files changed

+503
-587
lines changed

19 files changed

+503
-587
lines changed

.meta/mast/README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Forge MAST Environment Setup
2+
3+
A simple setup script to automatically configure your environment for running Forge with MAST jobs.
4+
This only applies to Meta internal users.
5+
6+
## Quick Start
7+
8+
⚠️ Important Note: the setup script will clone the forge repository under "/data/users/$USER".
9+
10+
### 1. Run the Setup Script
11+
12+
The `env_setup.sh` script will automatically:
13+
- ✅ Activate and configure the required conda environment
14+
- ✅ Clone/update the Forge repository
15+
- ✅ Install Forge package dependencies
16+
- ✅ Mount the required oilfs workspace to `/mnt/wsfuse`
17+
- ✅ Configure your environment for MAST job submission
18+
19+
```bash
20+
# Make the script executable
21+
chmod +x .meta/mast/env_setup.sh
22+
23+
# Run the setup
24+
./.meta/mast/env_setup.sh
25+
26+
```
27+
28+
### 2. Submit MAST job
29+
30+
Use the launch script to submit a MAST job:
31+
32+
```bash
33+
# Make the launch script executable (first time only)
34+
chmod +x .meta/mast/launch.sh
35+
36+
# Launch a job with your desired config
37+
./.meta/mast/launch.sh .meta/mast/qwen3_1_7b_mast.yaml
38+
```
39+
40+
The launch script will automatically:
41+
- Navigate to the forge root directory
42+
- Reinstall the forge package with your latest changes
43+
- Set the correct PYTHONPATH
44+
- Launch the MAST job with the specified config
45+
46+
You can run it from any directory: the script locates the Forge root from its own location, and the config path you pass is resolved relative to that root.
File renamed without changes.

apps/mast/env_setup.sh renamed to .meta/mast/env_setup.sh

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# env_setup.sh - Set up the conda environment and install forge with mounting
1010
set -e # Exit on any error
1111

12+
# Configuration
13+
CONDA_ENV_NAME="forge:stable"
14+
1215
# Colors for output
1316
RED='\033[0;31m'
1417
GREEN='\033[0;32m'
@@ -45,6 +48,7 @@ mount_workspace() {
4548
log_info "Creating mount directory: $mount_dir"
4649
sudo mkdir -p "$mount_dir" || {
4750
log_error "Failed to create mount directory (may need sudo privileges)"
51+
# The backticks are escaped so they are printed literally. Unescaped backticks
# inside double quotes are command substitution and would actually run
# "sudo umount /mnt/wsfuse" while building this message.
log_error "You could alternatively try to unmount with \`sudo umount /mnt/wsfuse\`"
4852
return 1
4953
}
5054
fi
@@ -130,10 +134,10 @@ if [ ! -f "$CONDA_SCRIPT_PATH" ]; then
130134
fi
131135

132136
log_info "Sourcing conda script: $CONDA_SCRIPT_PATH"
133-
source "$CONDA_SCRIPT_PATH" activate forge:e146614
137+
source "$CONDA_SCRIPT_PATH" activate "$CONDA_ENV_NAME"
134138

135139
if [ $? -ne 0 ]; then
136-
log_error "Failed to activate conda environment forge-e146614"
140+
log_error "Failed to activate conda environment $CONDA_ENV_NAME"
137141
exit 1
138142
fi
139143

@@ -191,8 +195,72 @@ fi
191195

192196
log_info "Current directory: $(pwd)"
193197

194-
# Step 5: Install forge package
195-
log_info "Step 5: Installing forge package..."
198+
# Step 5: Install torchtitan
# Installs torchtitan from the commit that assets/versions.sh exports as
# TORCHTITAN_COMMIT. Each failure path logs an error and exits with status 1.
log_info "Step 5: Installing torchtitan..."

# Source versions.sh to get the pinned commit
VERSIONS_FILE="$FORGE_REPO_DIR/assets/versions.sh"
if [ ! -f "$VERSIONS_FILE" ]; then
    log_error "versions.sh not found at: $VERSIONS_FILE"
    log_error "Cannot proceed without version information"
    exit 1
fi

log_info "Sourcing version information from: $VERSIONS_FILE"
source "$VERSIONS_FILE"

# ${VAR:-} keeps the check well-defined even if versions.sh never set the variable.
if [ -z "${TORCHTITAN_COMMIT:-}" ]; then
    log_error "TORCHTITAN_COMMIT not found in versions.sh"
    exit 1
fi

log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT"
pip uninstall -y torchtitan

# Test pip directly in the `if`: this script runs under `set -e`, so a failing
# pip followed by a separate `$?` check would abort before the error is logged.
if pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT"; then
    log_info "Torchtitan installed successfully"
else
    log_error "Failed to install torchtitan"
    exit 1
fi
227+
228+
# Step 5.5: Apply monarch torch import hack
# Inserts an `import torch` line into the installed monarch/__init__.py, right
# after the line "from typing import TYPE_CHECKING". The step is skipped when
# the file is missing or the marker line is already present, and a .bak copy
# is made before editing.
log_info "Step 5.5: Applying monarch torch import hack..."

MONARCH_INIT="$CONDA_PREFIX/lib/python3.10/site-packages/monarch/__init__.py"
if [ ! -f "$MONARCH_INIT" ]; then
    log_warn "monarch/__init__.py not found at: $MONARCH_INIT"
    log_warn "Skipping monarch torch import hack (monarch may not be installed yet)"
elif grep -q "^import torch # Injected by forge setup" "$MONARCH_INIT"; then
    # Check if we already applied the hack
    log_info "Monarch torch import hack already applied, skipping"
else
    log_info "Injecting 'import torch' into monarch/__init__.py"

    # Create a backup
    cp "$MONARCH_INIT" "$MONARCH_INIT.bak"

    # Use sed to append the import after "from typing import TYPE_CHECKING".
    # sed is tested inside the `if` so that, under `set -e`, a failure reaches
    # the restore branch instead of aborting the script first.
    # The appended text lines must stay at column 0: they become Python source.
    if ! sed -i '/^from typing import TYPE_CHECKING$/a\
\
# Torch must be imported before monarch (injected by forge setup)\
import torch # Injected by forge setup' "$MONARCH_INIT"; then
        log_error "Failed to inject torch import, restoring backup"
        mv "$MONARCH_INIT.bak" "$MONARCH_INIT"
        exit 1
    fi

    # sed exits 0 even when the anchor line never matched, so confirm that the
    # marker line is really present before reporting success.
    if grep -q "^import torch # Injected by forge setup" "$MONARCH_INIT"; then
        log_info "Successfully injected torch import into monarch/__init__.py"
    else
        log_warn "Anchor line 'from typing import TYPE_CHECKING' not found in: $MONARCH_INIT"
        log_warn "monarch/__init__.py was left unchanged; torch import hack was NOT applied"
    fi
fi
261+
262+
# Step 6: Install forge package
263+
log_info "Step 6: Installing forge package..."
196264
pip install --no-deps --force-reinstall .
197265
if [ $? -ne 0 ]; then
198266
log_error "Failed to install forge package"
@@ -234,5 +302,5 @@ log_info "Mounted workspace available at: /mnt/wsfuse"
234302
echo ""
235303
log_info "Installation completed successfully!"
236304
echo ""
237-
log_info "Re-activate the conda environment to make the changes take effect:"
238-
log_info "conda deactivate && conda activate forge-e146614"
305+
log_info "Test that this is working locally with:"
306+
log_info "python -m apps.grpo.main --config=apps/grpo/qwen3_1_7b.yaml"

.meta/mast/launch.sh

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
2+
3+
# Copyright (c) Meta Platforms, Inc. and affiliates.
4+
# All rights reserved.
5+
#
6+
# This source code is licensed under the BSD-style license found in the
7+
# LICENSE file in the root directory of this source tree.
8+
9+
# launch.sh - Launch MAST jobs with Forge
10+
set -e # Exit on any error
11+
12+
# Colors for output
13+
RED='\033[0;31m'
14+
GREEN='\033[0;32m'
15+
YELLOW='\033[1;33m'
16+
NC='\033[0m' # No Color
17+
18+
# Logging functions
19+
#######################################
# Print an informational message to stdout with a colored "[INFO]" tag.
# Globals:   GREEN, NC (read) - color escape strings
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
log_info() {
    # %b expands the backslash escapes stored in the color variables; %s prints
    # the message verbatim, so backslashes inside it (for example in a path)
    # are not interpreted the way `echo -e` would interpret them.
    printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}
22+
23+
#######################################
# Print an error message to stderr with a colored "[ERROR]" tag.
# Globals:   RED, NC (read) - color escape strings
# Arguments: $1 - message text
# Outputs:   one line on stderr
#######################################
log_error() {
    # %b expands the backslash escapes stored in the color variables; %s prints
    # the message verbatim. Diagnostics go to stderr so they stay visible when
    # stdout is redirected or captured.
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
26+
27+
# The config file path is a required first argument; without it, print usage
# information and exit with status 1.
if (( $# < 1 )); then
    log_error "No config file provided"
    printf '%s\n' \
        "Usage: $0 <config_file>" \
        "Example: $0 .meta/mast/qwen3_1_7b_mast.yaml"
    exit 1
fi

CONFIG_FILE="$1"
36+
37+
# Absolute path of the directory that holds this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The forge root sits two levels above the script (.meta/mast/ -> repo root)
FORGE_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

log_info "Forge root directory: $FORGE_ROOT"
log_info "Config file: $CONFIG_FILE"

# The config path is checked relative to the forge root; abort if it is missing
if [[ ! -f "${FORGE_ROOT}/${CONFIG_FILE}" ]]; then
    log_error "Config file not found: $FORGE_ROOT/$CONFIG_FILE"
    exit 1
fi
51+
52+
# Navigate to forge root
cd "$FORGE_ROOT"
log_info "Changed to directory: $(pwd)"

# Reinstall forge package
log_info "Reinstalling forge package..."
# Test pip directly in the `if`: this script runs under `set -e`, so a failing
# pip followed by a separate `$?` check would abort before the error is logged.
if ! pip install --force-reinstall --no-deps .; then
    log_error "Failed to reinstall forge package"
    exit 1
fi

log_info "Successfully reinstalled forge package"

# Launch the job; PYTHONPATH=. puts the forge root (the current directory) on
# the Python import path for this one command.
log_info "Launching MAST job..."
PYTHONPATH=. python .meta/mast/main.py --config "$CONFIG_FILE"
File renamed without changes.

apps/mast/qwen3_14b_mast.yaml renamed to .meta/mast/qwen3_14b_mast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Grouped Relative Policy Optimization (GRPO)
2-
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
2+
# >>> ./.meta/mast/launch.sh .meta/mast/qwen3_14b_mast.yaml
33

44
# Global configuration
55
group_size: 8

apps/mast/qwen3_1_7b_mast.yaml renamed to .meta/mast/qwen3_1_7b_mast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Grouped Relative Policy Optimization (GRPO)
2-
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
2+
# >>> ./.meta/mast/launch.sh .meta/mast/qwen3_1_7b_mast.yaml
33

44
# Global configuration
55
group_size: 8

apps/mast/qwen3_32b_mast.yaml renamed to .meta/mast/qwen3_32b_mast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Grouped Relative Policy Optimization (GRPO)
2-
# >>> python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml
2+
# >>> ./.meta/mast/launch.sh .meta/mast/qwen3_32b_mast.yaml
33

44
# Global configuration
55
group_size: 8

apps/mast/qwen3_4b_mast.yaml renamed to .meta/mast/qwen3_4b_mast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Grouped Relative Policy Optimization (GRPO)
2-
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
2+
# >>> ./.meta/mast/launch.sh .meta/mast/qwen3_4b_mast.yaml
33

44
# Global configuration
55
group_size: 8

apps/mast/qwen3_8b_mast.yaml renamed to .meta/mast/qwen3_8b_mast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Grouped Relative Policy Optimization (GRPO)
2-
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
2+
# >>> ./.meta/mast/launch.sh .meta/mast/qwen3_8b_mast.yaml
33

44
# Global configuration
55
group_size: 8

0 commit comments

Comments
 (0)