
Commit a8c44f2

Merge pull request #8 from jeremymanning/main
Add remote training and model sync scripts for distributed GPU training
2 parents f6bfaee + 783a8a6 commit a8c44f2

4 files changed: +440, -3 lines changed

README.md

Lines changed: 58 additions & 0 deletions
@@ -42,6 +42,8 @@ llm-stylometry/
 │   ├── test_*.py              # Test modules
 │   └── check_outputs.py       # Output validation script
 ├── run_llm_stylometry.sh      # Shell wrapper for easy setup
+├── remote_train.sh            # Remote GPU server training script
+├── sync_models.sh             # Download models from remote server
 ├── LICENSE                    # MIT License
 ├── README.md                  # This file
 ├── requirements-dev.txt       # Development dependencies
@@ -167,9 +169,14 @@ fig = generate_all_losses_figure(
 
 **Note**: Training requires a CUDA-enabled GPU and takes significant time (~80 models total).
 
+### Local Training
+
 ```bash
 # Using the CLI (recommended - handles all steps automatically)
 ./run_llm_stylometry.sh --train
+
+# Limit GPU usage if needed
+./run_llm_stylometry.sh --train --max-gpus 4
 ```
 
 This command will:
@@ -179,6 +186,57 @@ This command will:
 
 The training pipeline automatically handles data preparation, model training across available GPUs, and result consolidation. Individual model checkpoints and loss logs are saved in the `models/` directory.
 
+### Remote Training on GPU Server
+
+For training on a remote GPU server, use the provided `remote_train.sh` script:
+
+```bash
+# Start remote training
+./remote_train.sh
+
+# You'll be prompted for:
+# - Server address (hostname or IP)
+# - Username
+# - Password (for SSH)
+```
+
+This script will:
+1. Connect to your GPU server via SSH
+2. Clone or update the repository in `~/llm-stylometry`
+3. Start training in a `screen` session that persists after disconnection
+4. Allow you to safely disconnect while training continues
+
+To monitor training progress:
+```bash
+ssh username@server
+screen -r llm_training  # Reattach to training session
+# Press Ctrl+A, then D to detach again
+```
+
+### Downloading Trained Models
+
+After training completes on a remote server, use `sync_models.sh` to download the models:
+
+```bash
+# Download trained models from server
+./sync_models.sh
+
+# You'll be prompted for:
+# - Server address
+# - Username
+# - Password
+```
+
+This script will:
+1. Verify all 80 models are complete with weights
+2. Create a compressed archive on the server
+3. Download via rsync with progress indication
+4. Extract to your local `~/llm-stylometry/models/` directory
+5. Back up any existing local models
+6. Also sync `model_results.pkl` if available
+
+**Note**: The script will only download if all 80 models are complete. If training is still in progress, it will show which models are missing.
+
 ### Model Configuration
 
 Each model uses:
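
The README changes above refer to `sync_models.sh`, which appears to be the fourth changed file in this commit but is not expanded in this view. As a rough illustration of the kind of transfer it automates (the server address, username, archive name, and weight-file name below are assumptions made for the sketch, not the script's actual implementation):

```bash
# Hypothetical sketch only -- not the contents of sync_models.sh.
SERVER=gpu.example.edu   # placeholder server address
USER=me                  # placeholder username

# Count models on the server that already have saved weights (assumed layout/filename).
ssh "$USER@$SERVER" 'ls ~/llm-stylometry/models/*/pytorch_model.bin 2>/dev/null | wc -l'

# Archive the models remotely, pull the archive down with rsync,
# back up any existing local models, and unpack the download.
ssh "$USER@$SERVER" 'tar -czf ~/models.tar.gz -C ~/llm-stylometry models'
rsync -avz --progress "$USER@$SERVER:models.tar.gz" /tmp/models.tar.gz
mv ~/llm-stylometry/models ~/llm-stylometry/models.bak 2>/dev/null || true
tar -xzf /tmp/models.tar.gz -C ~/llm-stylometry
```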

code/generate_figures.py

Lines changed: 15 additions & 3 deletions
@@ -72,10 +72,22 @@ def train_models(max_gpus=None):
     # Train models
     safe_print("\nTraining models...")
     try:
-        # Set environment to disable tqdm and multiprocessing (which can hang in subprocess)
+        # Set environment variables for training
         env = os.environ.copy()
-        env['DISABLE_TQDM'] = '1'
-        env['NO_MULTIPROCESSING'] = '1'
+        env['DISABLE_TQDM'] = '1'  # Disable progress bars in subprocess
+        # Only disable multiprocessing if we have a single GPU or non-GPU device
+        # With multiple GPUs, we want parallel training
+        if torch.cuda.is_available():
+            gpu_count = torch.cuda.device_count()
+            if gpu_count <= 1:
+                env['NO_MULTIPROCESSING'] = '1'
+                safe_print("Single GPU detected - using sequential mode")
+            else:
+                safe_print(f"Multiple GPUs detected ({gpu_count}) - using parallel training")
+        else:
+            # Non-CUDA device (CPU or MPS)
+            env['NO_MULTIPROCESSING'] = '1'
+            safe_print("Non-CUDA device - using sequential mode")
         # Set PyTorch memory management for better GPU memory usage
         env['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
         # Pass through max GPUs limit if specified
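
The new branch keys sequential vs. parallel mode off `torch.cuda.is_available()` and `torch.cuda.device_count()`. To see which path a given machine will take before starting a long run, a quick check from the shell (illustrative; assumes an NVIDIA driver and a Python environment with torch installed):

```bash
# How many CUDA GPUs will the training subprocess see?
nvidia-smi -L                                                # list visible NVIDIA GPUs
python -c "import torch; print(torch.cuda.device_count())"   # count as seen by PyTorch
```

A count of 0 or 1 corresponds to the sequential path above; 2 or more enables parallel training.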

remote_train.sh

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+# Remote Training Script for LLM Stylometry
+# This script connects to a GPU server, clones/updates the repository, and starts training
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
+print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
+print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
+print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+
+echo "=================================================="
+echo " LLM Stylometry Remote Training Setup"
+echo "=================================================="
+echo
+
+# Get server details
+read -p "Enter GPU server address (hostname or IP): " SERVER_ADDRESS
+if [ -z "$SERVER_ADDRESS" ]; then
+    print_error "Server address cannot be empty"
+    exit 1
+fi
+
+read -p "Enter username for $SERVER_ADDRESS: " USERNAME
+if [ -z "$USERNAME" ]; then
+    print_error "Username cannot be empty"
+    exit 1
+fi
+
+# Create the remote training script
+REMOTE_SCRIPT='
+#!/bin/bash
+set -e
+
+echo "=================================================="
+echo "Setting up LLM Stylometry on remote server"
+echo "=================================================="
+echo
+
+# Check if repo exists
+if [ -d "$HOME/llm-stylometry" ]; then
+    echo "Repository exists. Updating to latest version..."
+    cd "$HOME/llm-stylometry"
+
+    # Stash any local changes
+    if ! git diff --quiet || ! git diff --cached --quiet; then
+        echo "Stashing local changes..."
+        git stash
+    fi
+
+    # Update repository
+    git fetch origin
+    git checkout main
+    git pull origin main
+    echo "Repository updated successfully"
+else
+    echo "Cloning repository..."
+    cd "$HOME"
+    git clone https://github.com/ContextLab/llm-stylometry.git
+    cd "$HOME/llm-stylometry"
+    echo "Repository cloned successfully"
+fi
+
+# Check for screen
+if ! command -v screen &> /dev/null; then
+    echo "Installing screen..."
+    if command -v apt-get &> /dev/null; then
+        sudo apt-get update && sudo apt-get install -y screen
+    elif command -v yum &> /dev/null; then
+        sudo yum install -y screen
+    else
+        echo "Warning: Could not install screen. Please install it manually."
+    fi
+fi
+
+# Create log directory
+mkdir -p "$HOME/llm-stylometry/logs"
+LOG_FILE="$HOME/llm-stylometry/logs/training_$(date +%Y%m%d_%H%M%S).log"
+
+echo ""
+echo "=================================================="
+echo "Starting training in screen session"
+echo "=================================================="
+echo "Training will run in a screen session named: llm_training"
+echo "Log file: $LOG_FILE"
+echo ""
+echo "Useful commands:"
+echo " - Detach from screen: Ctrl+A, then D"
+echo " - Reattach later: screen -r llm_training"
+echo " - View log: tail -f $LOG_FILE"
+echo ""
+echo "Starting training in 5 seconds..."
+sleep 5
+
+# Kill any existing screen session with the same name
+screen -X -S llm_training quit 2>/dev/null || true
+
+# Start training in screen
+screen -dmS llm_training bash -c "
+    cd $HOME/llm-stylometry
+    echo 'Training started at $(date)' | tee -a $LOG_FILE
+    ./run_llm_stylometry.sh --train 2>&1 | tee -a $LOG_FILE
+    echo 'Training completed at $(date)' | tee -a $LOG_FILE
+"
+
+# Wait a moment for screen to start
+sleep 2
+
+# Check if screen session started
+if screen -list | grep -q "llm_training"; then
+    echo ""
+    echo "✓ Training started successfully in screen session!"
+    echo ""
+    echo "The training is now running in the background."
+    echo "You can safely disconnect from SSH."
+    echo ""
+    echo "To monitor progress, reconnect and run:"
+    echo " screen -r llm_training"
+    echo ""
+    echo "Or view the log file:"
+    echo " tail -f $LOG_FILE"
+
+    # Attach to screen session
+    echo ""
+    echo "Attaching to screen session in 3 seconds..."
+    echo "(Press Ctrl+A, then D to detach and leave training running)"
+    sleep 3
+    screen -r llm_training
+else
+    echo "Error: Failed to start screen session"
+    exit 1
+fi
+'
+
+# Execute the remote script via SSH
+print_info "Connecting to $USERNAME@$SERVER_ADDRESS..."
+print_info "You may be prompted for your password and/or GitHub credentials."
+echo
+
+ssh -t "$USERNAME@$SERVER_ADDRESS" "$REMOTE_SCRIPT"
+
+RESULT=$?
+if [ $RESULT -eq 0 ]; then
+    print_success "Remote training setup completed!"
+    echo
+    echo "Training is running on $SERVER_ADDRESS"
+    echo "To reconnect and check progress:"
+    echo " ssh $USERNAME@$SERVER_ADDRESS"
+    echo " screen -r llm_training"
+else
+    print_error "Remote training setup failed"
+    exit 1
+fi
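
Since training persists in the `llm_training` screen session, progress can also be checked from the local machine without reattaching. A couple of optional one-liners (USERNAME and SERVER are placeholders; the log path follows the `training_*.log` pattern the script creates under `~/llm-stylometry/logs/`):

```bash
# Is the llm_training screen session still running on the server?
ssh USERNAME@SERVER 'screen -ls'

# Tail the most recent training log without attaching to the session.
ssh USERNAME@SERVER 'tail -n 50 "$(ls -t ~/llm-stylometry/logs/training_*.log | head -1)"'
```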
