|
9 | 9 | # setup_forge_env.sh - Setup conda environment and install forge with mounting |
10 | 10 | set -e # Exit on any error |
11 | 11 |
|
# Configuration
# Name of the conda environment to activate. A value already present in the
# caller's environment takes precedence; otherwise the pinned default is used.
CONDA_ENV_NAME="${CONDA_ENV_NAME:-forge:stable}"
| 14 | + |
12 | 15 | # Colors for output |
13 | 16 | RED='\033[0;31m' |
14 | 17 | GREEN='\033[0;32m' |
@@ -45,6 +48,7 @@ mount_workspace() { |
45 | 48 | log_info "Creating mount directory: $mount_dir" |
# Create the mount point via sudo; on failure, explain and suggest a manual step.
sudo mkdir -p "$mount_dir" || {
    log_error "Failed to create mount directory (may need sudo privileges)"
    # The backticks are escaped so the hint is printed literally. Unescaped,
    # they are a command substitution and would actually run `sudo umount`
    # while the message string is being built.
    log_error "You could alternatively try to unmount with \`sudo umount /mnt/wsfuse\`"
    return 1
}
50 | 54 | fi |
@@ -130,10 +134,10 @@ if [ ! -f "$CONDA_SCRIPT_PATH" ]; then |
130 | 134 | fi |
131 | 135 |
|
log_info "Sourcing conda script: $CONDA_SCRIPT_PATH"

# Activate the environment named by CONDA_ENV_NAME.
# The status is tested directly in the `if` because the script enables `set -e`
# at the top: a bare failing `source` would terminate the script before a
# separate `$?` check could log the error.
if ! source "$CONDA_SCRIPT_PATH" activate "$CONDA_ENV_NAME"; then
    log_error "Failed to activate conda environment $CONDA_ENV_NAME"
    exit 1
fi
139 | 143 |
|
|
191 | 195 |
|
192 | 196 | log_info "Current directory: $(pwd)" |
193 | 197 |
|
# Step 5: Install torchtitan
# Installs torchtitan from the commit pinned in assets/versions.sh.
# Exits with status 1 if versions.sh is missing, if it does not define
# TORCHTITAN_COMMIT, or if the pip install fails.
log_info "Step 5: Installing torchtitan..."

# Source versions.sh to get the pinned commit
VERSIONS_FILE="$FORGE_REPO_DIR/assets/versions.sh"
if [ ! -f "$VERSIONS_FILE" ]; then
    log_error "versions.sh not found at: $VERSIONS_FILE"
    log_error "Cannot proceed without version information"
    exit 1
fi

log_info "Sourcing version information from: $VERSIONS_FILE"
source "$VERSIONS_FILE"

if [ -z "${TORCHTITAN_COMMIT:-}" ]; then
    log_error "TORCHTITAN_COMMIT not found in versions.sh"
    exit 1
fi

log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT"
# Remove any previously installed copy before installing the pinned commit.
pip uninstall -y torchtitan

# Test pip's status in the condition itself: with `set -e` enabled, a failing
# bare command would end the script before a later `$?` check could report it.
if ! pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT"; then
    log_error "Failed to install torchtitan"
    exit 1
fi
log_info "Torchtitan installed successfully"
| 227 | + |
# Step 5.5: Apply monarch torch import hack
# Inserts `import torch` into the installed monarch/__init__.py, directly after
# the line `from typing import TYPE_CHECKING`. The step is skipped with a
# warning when the file is absent, and is a no-op when the marker line is
# already present. A backup is written to "$MONARCH_INIT.bak" before editing.
log_info "Step 5.5: Applying monarch torch import hack..."

# NOTE(review): the python3.10 path segment is hard-coded; an environment built
# on another Python version will take the "not found" branch below - confirm.
MONARCH_INIT="$CONDA_PREFIX/lib/python3.10/site-packages/monarch/__init__.py"
if [ ! -f "$MONARCH_INIT" ]; then
    log_warn "monarch/__init__.py not found at: $MONARCH_INIT"
    log_warn "Skipping monarch torch import hack (monarch may not be installed yet)"
elif grep -q "^import torch # Injected by forge setup" "$MONARCH_INIT"; then
    # Check if we already applied the hack
    log_info "Monarch torch import hack already applied, skipping"
else
    log_info "Injecting 'import torch' into monarch/__init__.py"

    # Create a backup
    cp "$MONARCH_INIT" "$MONARCH_INIT.bak"

    # Append a blank line, an explanatory comment and the import after the
    # anchor line. sed exits 0 even when the anchor matches nothing, so the
    # marker is grepped for afterwards to confirm the edit really happened.
    # Both commands sit in the `if` condition so that a failure reaches the
    # restore branch instead of aborting the script under `set -e`.
    if sed -i '/^from typing import TYPE_CHECKING$/a\
\
# Torch must be imported before monarch (injected by forge setup)\
import torch # Injected by forge setup' "$MONARCH_INIT" \
        && grep -q "^import torch # Injected by forge setup" "$MONARCH_INIT"; then
        log_info "Successfully injected torch import into monarch/__init__.py"
    else
        log_error "Failed to inject torch import, restoring backup"
        mv "$MONARCH_INIT.bak" "$MONARCH_INIT"
        exit 1
    fi
fi
| 261 | + |
| 262 | +# Step 6: Install forge package |
| 263 | +log_info "Step 6: Installing forge package..." |
196 | 264 | pip install --no-deps --force-reinstall . |
197 | 265 | if [ $? -ne 0 ]; then |
198 | 266 | log_error "Failed to install forge package" |
@@ -234,5 +302,5 @@ log_info "Mounted workspace available at: /mnt/wsfuse" |
# Closing summary: report success and show a command for a local smoke test.
printf '\n'
log_info "Installation completed successfully!"
printf '\n'
log_info "Test that this is working locally with:"
log_info "python -m apps.grpo.main --config=apps/grpo/qwen3_1_7b.yaml"
0 commit comments