|
#!/bin/bash
#
# Check HuggingFace model training status on remote GPU server
#
# Adapted from check_remote_status.sh pattern
#

# Strict mode: abort on command errors (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
| 9 | + |
| 10 | +# Color output helpers |
# Emit an informational message to stdout with a blue [INFO] tag.
# %b preserves echo -e's backslash-escape handling for the message text.
print_info() {
  printf '\033[0;34m[INFO]\033[0m %b\n' "$1"
}
| 14 | + |
# Emit an error message to stderr with a red [ERROR] tag.
# %b preserves echo -e's backslash-escape handling for the message text.
print_error() {
  printf '\033[0;31m[ERROR]\033[0m %b\n' "$1" >&2
}
| 18 | + |
# Emit a success message to stdout with a green [SUCCESS] tag.
# %b preserves echo -e's backslash-escape handling for the message text.
print_success() {
  printf '\033[0;32m[SUCCESS]\033[0m %b\n' "$1"
}
| 22 | + |
# Selected cluster; must be provided via the --cluster flag.
CLUSTER=""

# Parse command-line arguments.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --cluster)
      # Guard against a trailing --cluster with no value: without this,
      # "shift 2" would fail and set -e would abort with no message.
      if [[ -z "${2:-}" ]]; then
        print_error "--cluster requires a value"
        exit 1
      fi
      CLUSTER="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [OPTIONS]"
      echo ""
      echo "Check HuggingFace model training status on remote GPU server"
      echo ""
      echo "Options:"
      echo "  --cluster NAME    Select cluster (required)"
      echo "  -h, --help        Show this help message"
      echo ""
      echo "Examples:"
      echo "  $0 --cluster tensor02"
      exit 0
      ;;
    *)
      print_error "Unknown option: $1"
      echo "Use --help for usage information"
      exit 1
      ;;
  esac
done

# Validate cluster is specified
if [[ -z "$CLUSTER" ]]; then
  print_error "Cluster must be specified with --cluster flag"
  echo "Example: $0 --cluster mycluster"
  exit 1
fi
| 60 | + |
# Read credentials from the per-cluster JSON config file.
CRED_FILE=".ssh/credentials_${CLUSTER}.json"
if [ ! -f "$CRED_FILE" ]; then
  print_error "Credentials file not found: $CRED_FILE"
  exit 1
fi

# Extract one field from the credentials JSON. The file path and key are
# passed via argv rather than interpolated into the Python source, so quotes
# in either cannot break (or inject into) the snippet. "|| true" keeps a
# parse failure from silently killing the script under set -e; the emptiness
# check below reports it with a useful message instead.
read_cred() {
  python3 -c "import json,sys; print(json.load(open(sys.argv[1]))[sys.argv[2]])" "$CRED_FILE" "$1" 2>/dev/null || true
}

SERVER_ADDRESS=$(read_cred server)
USERNAME=$(read_cred username)
PASSWORD=$(read_cred password)

if [ -z "$SERVER_ADDRESS" ] || [ -z "$USERNAME" ] || [ -z "$PASSWORD" ]; then
  print_error "Failed to read credentials from $CRED_FILE"
  exit 1
fi
| 76 | + |
# Setup SSH command
# sshpass supplies the password non-interactively for the ssh session below.
if ! command -v sshpass &> /dev/null; then
    print_error "sshpass is required but not installed"
    exit 1
fi

# NOTE(review): embedding the password in a command string that is later
# eval'd exposes it in the process list (`ps`) and breaks if the password
# contains a single quote — consider `sshpass -f` or the SSHPASS env var.
SSH_CMD="sshpass -p '$PASSWORD' ssh -o StrictHostKeyChecking=no"
| 84 | + |
print_info "Connecting to $USERNAME@$SERVER_ADDRESS..."
print_info "Checking HF training status on $CLUSTER..."
echo ""

# Execute status check on remote server. Invoking sshpass/ssh directly
# (instead of eval on a command string) keeps the password intact under
# normal shell quoting even if it contains spaces or quote characters.
# The quoted heredoc delimiter ('ENDSSH') prevents any local expansion of
# the remote-side script that follows.
sshpass -p "$PASSWORD" ssh -o StrictHostKeyChecking=no "$USERNAME@$SERVER_ADDRESS" 'bash -s' << 'ENDSSH'
#!/bin/bash
# Remote-side script: executed on the GPU server via `bash -s`.

# Change to project directory
cd ~/llm-stylometry || { echo "ERROR: Project directory ~/llm-stylometry not found"; exit 1; }

# Activate conda environment (provides pandas/numpy for the status script)
if ! command -v conda &> /dev/null; then
    echo "ERROR: conda not found"
    exit 1
fi

# Initialize conda's shell integration, then switch to the project env.
eval "$(conda shell.bash hook)" 2>/dev/null || { echo "ERROR: Failed to initialize conda"; exit 1; }
conda activate llm-stylometry 2>/dev/null || { echo "ERROR: llm-stylometry environment not found"; exit 1; }

# Create temporary Python script
# NOTE(review): /tmp/check_hf_status.py is a fixed, predictable path —
# mktemp would be safer on a shared multi-user host; confirm before changing.
cat > /tmp/check_hf_status.py << 'ENDPYTHON'
#!/usr/bin/env python
"""Check HuggingFace training status."""

import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta

# Authors whose per-author GPT-2 models are tracked.
AUTHORS = ['austen', 'baum', 'dickens', 'fitzgerald', 'melville', 'thompson', 'twain', 'wells']
TARGET_LOSS = 0.1  # HF target loss: training counts as complete at/below this
PAPER_LOSS = 3.0   # Paper stopping point: train loss below this marks HF-phase start
|
def format_timedelta(td):
    """Render a timedelta as a compact human-readable string (e.g. '1d 4h 30m')."""
    total = int(td.total_seconds())
    days, rem = divmod(total, 86400)
    hours, rem = divmod(rem, 3600)
    minutes = rem // 60

    if days:
        return f"{days}d {hours}h {minutes}m"
    if hours:
        return f"{hours}h {minutes}m"
    return f"{minutes}m"
| 133 | +
|
def check_author_status(author):
    """Check HF training status for a single author.

    Reads models/{author}_tokenizer=gpt2_seed=0/loss_logs.csv and returns a
    dict describing HF-phase progress (current epoch/loss, HF start epoch,
    epochs since start, rough elapsed time, completion flag), or None when
    the author has not reached the HF phase: missing/empty log, no train row
    for the latest epoch, or no train loss below PAPER_LOSS yet.
    """
    # Check seed=0 model (HF training location)
    model_dir = Path(f'models/{author}_tokenizer=gpt2_seed=0')
    loss_log = model_dir / 'loss_logs.csv'

    if not loss_log.exists():
        return None

    try:
        df = pd.read_csv(loss_log)
        if len(df) == 0:
            return None

        # Latest recorded epoch and its training-loss row.
        max_epoch = df['epochs_completed'].max()
        train_rows = df[(df['epochs_completed'] == max_epoch) & (df['loss_dataset'] == 'train')]

        if len(train_rows) == 0:
            return None

        current_loss = train_rows.iloc[0]['loss_value']

        # HF phase begins once train loss first drops below PAPER_LOSS.
        hf_rows = df[(df['loss_dataset'] == 'train') & (df['loss_value'] < PAPER_LOSS)]

        if len(hf_rows) == 0:
            # Not yet started HF training
            return None

        # Epoch at which HF training started, and progress since then.
        hf_start_epoch = int(hf_rows.iloc[0]['epochs_completed'])
        epochs_since_start = int(max_epoch - hf_start_epoch)

        # Rough wall-clock estimate: ~10 sec/epoch with eval skipped.
        elapsed = timedelta(seconds=int(epochs_since_start * 10))

        # Training is complete once the target loss is reached.
        is_complete = current_loss <= TARGET_LOSS

        return {
            'current_epoch': max_epoch,
            'current_loss': current_loss,
            'target_loss': TARGET_LOSS,
            'is_complete': is_complete,
            'hf_start_epoch': hf_start_epoch,
            'epochs_since_start': epochs_since_start,
            'elapsed': elapsed
        }

    except Exception:
        # Best effort: a malformed or partially-written CSV is reported as
        # "not started" rather than crashing the whole status report.
        return None
| 186 | +
|
# Print report header.
print("=" * 80)
print("HUGGINGFACE MODEL TRAINING STATUS")
print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 80)

completed_count = 0
in_progress_count = 0
not_started_count = 0

# One section per author: not started / complete / in progress with ETA.
for author in AUTHORS:
    status = check_author_status(author)

    print(f"\n{author.upper()}")
    print("-" * 80)

    if status is None:
        print("  Status: Not started")
        not_started_count += 1
    elif status['is_complete']:
        print("  Status: Complete ✓")
        print(f"  Final loss: {status['current_loss']:.4f}")
        print(f"  Total epochs: {status['current_epoch']:,}")
        print(f"  HF epochs: {status['epochs_since_start']:,} (from epoch {status['hf_start_epoch']})")
        completed_count += 1
    else:
        print("  Status: Training...")
        print(f"  Current epoch: {status['current_epoch']:,}")
        print(f"  Current loss: {status['current_loss']:.4f}")
        print(f"  Target loss: {status['target_loss']:.4f}")
        print(f"  HF epochs completed: {status['epochs_since_start']:,}")
        print(f"  Elapsed: {format_timedelta(status['elapsed'])}")

        # Estimate remaining time from linear progress through the loss
        # interval [PAPER_LOSS, TARGET_LOSS]; skip the first few epochs
        # where the ratio is too noisy to extrapolate from.
        if status['epochs_since_start'] > 10:
            loss_ratio = (PAPER_LOSS - status['current_loss']) / (PAPER_LOSS - TARGET_LOSS)
            progress_pct = loss_ratio * 100

            # Extrapolate total HF epochs needed (very rough).
            if loss_ratio > 0:
                estimated_total_hf_epochs = int(status['epochs_since_start'] / loss_ratio)
                remaining_epochs = estimated_total_hf_epochs - status['epochs_since_start']
                eta = timedelta(seconds=remaining_epochs * 10)
                print(f"  Progress: {progress_pct:.1f}%")
                print(f"  Estimated remaining: {format_timedelta(eta)}")

        in_progress_count += 1

# Summary — denominators derived from AUTHORS (previously hard-coded "/8")
# so the counts stay correct if the author list ever changes.
total_authors = len(AUTHORS)
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Completed: {completed_count}/{total_authors}")
print(f"In progress: {in_progress_count}/{total_authors}")
print(f"Not started: {not_started_count}/{total_authors}")

if in_progress_count > 0 or completed_count < total_authors:
    print("\nTo download completed models:")
    print("  ./sync_hf_models.sh --cluster CLUSTER")
| 249 | +
|
| 250 | +ENDPYTHON |
| 251 | +
|
# Execute the Python script, remembering its exit status so the cleanup
# below cannot mask a failure. (Previously `rm -f` ran last, so the remote
# shell exited 0 even when the status check itself failed, and the local
# script reported success.)
python3 /tmp/check_hf_status.py
status=$?

# Clean up
rm -f /tmp/check_hf_status.py
exit "$status"
| 257 | +ENDSSH |
| 258 | + |
# NOTE(review): with `set -e` active (top of script), a failing ssh command
# above aborts the script immediately, so this else-branch is unreachable in
# practice — execution only reaches here after a successful remote check.
if [ $? -eq 0 ]; then
    echo ""
    print_success "Status check complete!"
else
    print_error "Failed to check training status"
    exit 1
fi
0 commit comments