Merge pull request #9 from jeremymanning/main

jeremymanning · web-flow · commit f945450b7680 · 2025-09-17T22:31:28.000-04:00
Simplify remote training and improve documentation
diff --git a/.gitignore b/.gitignore
@@ -23,4 +23,7 @@ htmlcov/
 tests/output_*/
 tests/data/*.csv
 tests/data/*.pkl
-!tests/data/test_model_results.pkl
+!tests/data/test_model_results.pkl
+
+# Temporary test files
+.test_credentials
diff --git a/README.md b/README.md
@@ -188,34 +188,79 @@ The training pipeline automatically handles data preparation, model training acr
 
 ### Remote Training on GPU Server
 
-For training on a remote GPU server, use the provided `remote_train.sh` script:
+#### Prerequisites: Setting up Git credentials on the server
+
+Before using the remote training script, you need to set up Git credentials on your server once:
+
+1. SSH into your server:
+```bash
+ssh username@server
+```
+
+2. Configure Git with your credentials:
+```bash
+# Set your Git user information (use your GitHub username)
+git config --global user.name "your-github-username"
+git config --global user.email "your.email@example.com"
+
+# Enable credential storage
+git config --global credential.helper store
+```
+
+3. Clone the repository with your Personal Access Token:
+```bash
+# Replace <username> and <token> with your GitHub username and Personal Access Token
+# Get a token from: https://github.com/settings/tokens (grant 'repo' scope)
+git clone https://<username>:<token>@github.com/ContextLab/llm-stylometry.git
+
+# The credentials will be stored (unencrypted, in ~/.git-credentials) for future use
+cd llm-stylometry
+git pull  # This should work without prompting for credentials
+```
+
+#### Using the remote training script
+
+Once Git credentials are configured on your server, run `remote_train.sh` **from your local machine** (not on the GPU server):
 
 ```bash
-# Start remote training
+# From your local machine, start training on the remote GPU server
 ./remote_train.sh
 
+# Kill existing training sessions, then start a new one
+./remote_train.sh --kill  # or -k
+
 # You'll be prompted for:
 # - Server address (hostname or IP)
 # - Username
-# - Password (for SSH)
 ```
 
-This script will:
-1. Connect to your GPU server via SSH
-2. Clone or update the repository in `~/llm-stylometry`
-3. Start training in a `screen` session that persists after disconnection
-4. Allow you to safely disconnect while training continues
+**What this script does:** The `remote_train.sh` script connects to your GPU server via SSH and executes `run_llm_stylometry.sh --train -y` in a `screen` session. This allows you to disconnect your local machine while the GPU server continues training.
+
+The script will:
+1. SSH into your GPU server
+2. Update the repository in `~/llm-stylometry` (or clone if it doesn't exist)
+3. Start `run_llm_stylometry.sh --train -y` in a `screen` session
+4. Exit, allowing your local machine to disconnect while training continues on the server
+
+#### Monitoring training progress
+
+To check on the training status, SSH into the server and reattach to the screen session:
 
-To monitor training progress:
 ```bash
+# From your local machine
 ssh username@server
-screen -r llm_training  # Reattach to training session
-# Press Ctrl+A, then D to detach again
+
+# On the server, reattach to see live training output
+screen -r llm_training
+
+# To detach and leave training running, press Ctrl+A, then D
+# Then exit SSH; training keeps running inside the screen session
+exit
 ```
 
-### Downloading Trained Models
+#### Downloading results after training completes
 
-After training completes on a remote server, use `sync_models.sh` to download the models:
+Once training is complete, use `sync_models.sh` **from your local machine** to download the trained models and results:
 
 ```bash
 # Download trained models from server
diff --git a/code/generate_figures.py b/code/generate_figures.py
@@ -21,7 +21,7 @@
 from llm_stylometry.cli_utils import safe_print, format_header, is_windows
 
 
-def train_models(max_gpus=None):
+def train_models(max_gpus=None, no_confirm=False):
     """Train all models from scratch."""
     safe_print("\n" + "=" * 60)
     safe_print("Training Models from Scratch")
@@ -42,10 +42,14 @@ def train_models(max_gpus=None):
     safe_print(f"   Device: {device_info}")
     safe_print("   Training time depends on hardware (hours on GPU, days on CPU)")
 
-    response = input("\nProceed with training? [y/N]: ")
-    if response.lower() != 'y':
-        safe_print("Training cancelled.")
-        return False
+    if not no_confirm:
+        response = input("\nProceed with training? [y/N]: ")
+        if response.lower() != 'y':
+            safe_print("Training cancelled.")
+            return False
+    else:
+        safe_print("\nSkipping confirmation (--no-confirm flag set)")
+        safe_print("Starting training...")
 
     # Remove existing models directory to train from scratch
     import shutil
@@ -217,6 +221,12 @@ def main():
         default=None
     )
 
+    parser.add_argument(
+        '--no-confirm', '-y',
+        action='store_true',
+        help='Skip confirmation prompts (useful for non-interactive mode)'
+    )
+
     args = parser.parse_args()
 
     if args.list:
@@ -234,7 +244,7 @@ def main():
 
     # Train models if requested
     if args.train:
-        if not train_models(max_gpus=args.max_gpus):
+        if not train_models(max_gpus=args.max_gpus, no_confirm=args.no_confirm):
             return 1
         # Update data path to use newly generated results
         args.data = 'data/model_results.pkl'
diff --git a/remote_train.sh b/remote_train.sh
@@ -21,6 +21,18 @@ echo "=================================================="
 echo "       LLM Stylometry Remote Training Setup"
 echo "=================================================="
 echo
+echo "Usage: $0 [options]"
+echo "Options:"
+echo "  --kill, -k   Kill existing training sessions before starting new one"
+echo
+
+# Check for --kill flag
+if [ "$1" = "--kill" ] || [ "$1" = "-k" ]; then
+    echo "Kill mode: Will terminate existing training sessions"
+    KILL_MODE=true
+else
+    KILL_MODE=false
+fi
 
 # Get server details
 read -p "Enter GPU server address (hostname or IP): " SERVER_ADDRESS
@@ -35,8 +47,17 @@ if [ -z "$USERNAME" ]; then
     exit 1
 fi
 
-# Create the remote training script
-REMOTE_SCRIPT='
+print_info "Connecting to $USERNAME@$SERVER_ADDRESS..."
+
+# Test SSH connection first
+if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "$USERNAME@$SERVER_ADDRESS" "echo 'Connection test successful'" 2>/dev/null; then
+    print_warning "Initial connection test failed. Trying with interactive authentication..."
+fi
+
+echo
+
+# Execute the remote script via SSH
+ssh -t "$USERNAME@$SERVER_ADDRESS" "KILL_MODE='$KILL_MODE' bash -s" << 'ENDSSH'
 #!/bin/bash
 set -e
 
@@ -45,27 +66,40 @@ echo "Setting up LLM Stylometry on remote server"
 echo "=================================================="
 echo
 
-# Check if repo exists
-if [ -d "$HOME/llm-stylometry" ]; then
-    echo "Repository exists. Updating to latest version..."
-    cd "$HOME/llm-stylometry"
+# Check if we're in kill mode
+if [ "$KILL_MODE" = "true" ]; then
+    echo "Kill mode activated - terminating existing training sessions..."
 
-    # Stash any local changes
-    if ! git diff --quiet || ! git diff --cached --quiet; then
-        echo "Stashing local changes..."
-        git stash
-    fi
+    # Kill any existing screen sessions
+    screen -ls | grep -o '[0-9]*\.llm_training' | cut -d. -f1 | while read pid; do
+        if [ ! -z "$pid" ]; then
+            echo "Killing screen session with PID: $pid"
+            screen -X -S "$pid.llm_training" quit
+        fi
+    done
+
+    # Also kill any remaining python training processes
+    pkill -f "python.*generate_figures.py.*--train" 2>/dev/null || true
+
+    echo "All training sessions terminated."
+    echo ""
+
+    # In non-interactive mode, always start new training after killing
+    echo "Starting new training session..."
+    echo ""
+fi
 
-    # Update repository
-    git fetch origin
-    git checkout main
-    git pull origin main
+# Check if repository exists
+if [ -d ~/llm-stylometry ]; then
+    echo "Repository exists. Updating..."
+    cd ~/llm-stylometry
+    git pull
     echo "Repository updated successfully"
 else
-    echo "Cloning repository..."
-    cd "$HOME"
+    echo "Repository not found. Cloning..."
+    cd ~
     git clone https://github.com/ContextLab/llm-stylometry.git
-    cd "$HOME/llm-stylometry"
+    cd ~/llm-stylometry
     echo "Repository cloned successfully"
 fi
 
@@ -82,8 +116,8 @@ if ! command -v screen &> /dev/null; then
 fi
 
 # Create log directory
-mkdir -p "$HOME/llm-stylometry/logs"
-LOG_FILE="$HOME/llm-stylometry/logs/training_$(date +%Y%m%d_%H%M%S).log"
+mkdir -p ~/llm-stylometry/logs
+LOG_FILE=~/llm-stylometry/logs/training_$(date +%Y%m%d_%H%M%S).log
 
 echo ""
 echo "=================================================="
@@ -95,26 +129,57 @@ echo ""
 echo "Useful commands:"
 echo "  - Detach from screen: Ctrl+A, then D"
 echo "  - Reattach later: screen -r llm_training"
-echo "  - View log: tail -f $LOG_FILE"
+echo "  - View log: tail -f ~/llm-stylometry/logs/training_*.log"
 echo ""
 echo "Starting training in 5 seconds..."
 sleep 5
 
 # Kill any existing screen session with the same name
 screen -X -S llm_training quit 2>/dev/null || true
 
-# Start training in screen
-screen -dmS llm_training bash -c "
-    cd $HOME/llm-stylometry
-    echo 'Training started at $(date)' | tee -a $LOG_FILE
-    ./run_llm_stylometry.sh --train 2>&1 | tee -a $LOG_FILE
-    echo 'Training completed at $(date)' | tee -a $LOG_FILE
-"
+# Start training in screen (use --no-confirm flag for non-interactive mode)
+# Create a script file first
+cat > /tmp/llm_train.sh << 'TRAINSCRIPT'
+#!/bin/bash
+set -e  # Exit on error
+
+# Change to the repository directory
+cd ~/llm-stylometry
+
+# Create log directory and file
+mkdir -p logs
+LOG_FILE=~/llm-stylometry/logs/training_$(date +%Y%m%d_%H%M%S).log
+echo "Training started at $(date)" | tee $LOG_FILE
+
+# Check if the run script exists
+if [ ! -f ./run_llm_stylometry.sh ]; then
+    echo "ERROR: run_llm_stylometry.sh not found in $(pwd)!" | tee -a $LOG_FILE
+    ls -la | tee -a $LOG_FILE
+    exit 1
+fi
+
+# Make sure it's executable
+chmod +x ./run_llm_stylometry.sh
+
+# Run the training script with non-interactive flag
+echo "Starting training with run_llm_stylometry.sh..." | tee -a $LOG_FILE
+./run_llm_stylometry.sh --train -y 2>&1 | tee -a $LOG_FILE
+
+echo "Training completed at $(date)" | tee -a $LOG_FILE
+TRAINSCRIPT
+
+chmod +x /tmp/llm_train.sh
+
+# Start screen session
+screen -dmS llm_training /tmp/llm_train.sh
 
 # Wait a moment for screen to start
 sleep 2
 
 # Check if screen session started
+echo "Checking screen sessions:"
+screen -list
+
 if screen -list | grep -q "llm_training"; then
     echo ""
     echo "✓ Training started successfully in screen session!"
@@ -138,14 +203,7 @@ else
     echo "Error: Failed to start screen session"
     exit 1
 fi
-'
-
-# Execute the remote script via SSH
-print_info "Connecting to $USERNAME@$SERVER_ADDRESS..."
-print_info "You may be prompted for your password and/or GitHub credentials."
-echo
-
-ssh -t "$USERNAME@$SERVER_ADDRESS" "$REMOTE_SCRIPT"
+ENDSSH
 
 RESULT=$?
 if [ $RESULT -eq 0 ]; then
diff --git a/run_llm_stylometry.sh b/run_llm_stylometry.sh
@@ -35,6 +35,7 @@ OPTIONS:
     -h, --help              Show this help message
     -f, --figure FIGURE     Generate specific figure (1a, 1b, 2a, 2b, 3, 4, 5)
     -t, --train             Train models from scratch before generating figures
+    -y, --yes, --no-confirm Skip confirmation prompts (non-interactive mode)
     -g, --max-gpus NUM      Maximum number of GPUs to use for training (default: all)
     -d, --data PATH         Path to model_results.pkl (default: data/model_results.pkl)
     -o, --output DIR        Output directory for figures (default: paper/figs/source)
@@ -289,6 +290,7 @@ SKIP_SETUP=false
 FORCE_INSTALL=false
 CLEAN=false
 CLEAN_CACHE=false
+NO_CONFIRM=false
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -304,6 +306,10 @@ while [[ $# -gt 0 ]]; do
             TRAIN=true
             shift
             ;;
+        -y|--yes|--no-confirm)
+            NO_CONFIRM=true
+            shift
+            ;;
         -g|--max-gpus)
             MAX_GPUS="$2"
             shift 2
@@ -421,6 +427,10 @@ if [ -n "$MAX_GPUS" ]; then
     PYTHON_CMD="$PYTHON_CMD --max-gpus $MAX_GPUS"
 fi
 
+if [ "$NO_CONFIRM" = true ]; then
+    PYTHON_CMD="$PYTHON_CMD --no-confirm"
+fi
+
 if [ "$DATA_PATH" != "data/model_results.pkl" ]; then
     PYTHON_CMD="$PYTHON_CMD --data $DATA_PATH"
 fi
diff --git a/sync_models.sh b/sync_models.sh