Skip to content

Commit cbcb2ae

Browse files
Merge pull request #43 from jeremymanning/main
Fix CI test and add HuggingFace training infrastructure
2 parents 8a7a4d7 + d757365 commit cbcb2ae

38 files changed

+5514
-359
lines changed

.gitignore

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,16 @@ models/*/model.safetensors
3939
models/*/model.pth
4040
models/*/pytorch_model.bin
4141
models/*/training_state.pt
42+
43+
# Model weight archives (for Dropbox distribution)
44+
model_weights_*.tar.gz
45+
# Note: .sha256 checksum files ARE checked into git (they're tiny)
46+
47+
# HuggingFace credentials (security)
48+
.huggingface/
49+
50+
# HuggingFace model files (weight files gitignored, but README.md should be tracked)
51+
models_hf/*/model.safetensors
52+
models_hf/*/training_state.pt
53+
models_hf/*/pytorch_model.bin
54+
!models_hf/*/README.md

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,15 @@ See the [Package API](#package-api) section for all available functions.
103103

104104
**Note**: T-test calculations (Figure 2) take 2-3 minutes due to statistical computations across all epochs and authors.
105105

106+
**Downloading pre-trained weights (optional):** Model weight files are gitignored due to size. Download pre-trained weights to explore or use trained models:
107+
108+
```bash
109+
./download_model_weights.sh --all # Download all variants (~26.6GB)
110+
./download_model_weights.sh -b # Baseline only (~6.7GB)
111+
```
112+
113+
See `models/README.md` for details. Pre-trained weights are not required for generating figures.
114+
106115
## Analysis Variants
107116

108117
The paper analyzes three linguistic variants (Supplemental Figures S1-S8):

assets/CDL_Avatar.png

3.63 KB
Loading

check_hf_status.sh

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#!/bin/bash
#
# Check HuggingFace model training status on remote GPU server
#
# Adapted from check_remote_status.sh pattern
#

# Strict mode: abort on unhandled errors (-e), on use of unset
# variables (-u), and on failures in any pipeline stage (pipefail).
set -euo pipefail
9+
10+
# Color output helpers
11+
print_info() {
12+
echo -e "\033[0;34m[INFO]\033[0m $1"
13+
}
14+
15+
# Print an error message with a red [ERROR] prefix to stderr.
# Uses printf '%s' instead of `echo -e`: echo -e would also interpret
# backslash escape sequences contained in the message itself.
print_error() {
    printf '\033[0;31m[ERROR]\033[0m %s\n' "$1" >&2
}
18+
19+
# Print a success message with a green [SUCCESS] prefix to stdout.
# Uses printf '%s' instead of `echo -e`: echo -e would also interpret
# backslash escape sequences contained in the message itself.
print_success() {
    printf '\033[0;32m[SUCCESS]\033[0m %s\n' "$1"
}
22+
23+
# Cluster selection -- intentionally no default; the caller must pass
# --cluster explicitly (validated right after parsing).
CLUSTER=""

# Walk the argument list, consuming flags as we recognize them.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --cluster)
            CLUSTER="$2"
            shift 2
            ;;
        -h|--help)
            cat <<EOF
Usage: $0 [OPTIONS]

Check HuggingFace model training status on remote GPU server

Options:
  --cluster NAME    Select cluster (required)
  -h, --help        Show this help message

Examples:
  $0 --cluster tensor02
EOF
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done
53+
54+
# A cluster name is mandatory -- bail out early when none was supplied.
if [[ -z "$CLUSTER" ]]; then
    print_error "Cluster must be specified with --cluster flag"
    echo "Example: $0 --cluster mycluster"
    exit 1
fi
60+
61+
# Read credentials from the per-cluster config file.
CRED_FILE=".ssh/credentials_${CLUSTER}.json"
if [ ! -f "$CRED_FILE" ]; then
    print_error "Credentials file not found: $CRED_FILE"
    exit 1
fi

# Extract each field with python3. The file path is passed via sys.argv
# rather than interpolated into the python source (avoids breakage /
# injection if the path contains quotes). The trailing `|| true` keeps
# `set -e` from aborting on a failed command substitution before the
# friendly error message below has a chance to fire.
SERVER_ADDRESS=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['server'])" "$CRED_FILE" 2>/dev/null) || true
USERNAME=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['username'])" "$CRED_FILE" 2>/dev/null) || true
PASSWORD=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['password'])" "$CRED_FILE" 2>/dev/null) || true

if [ -z "$SERVER_ADDRESS" ] || [ -z "$USERNAME" ] || [ -z "$PASSWORD" ]; then
    print_error "Failed to read credentials from $CRED_FILE"
    exit 1
fi
76+
77+
# Password-based SSH requires sshpass.
if ! command -v sshpass &> /dev/null; then
    print_error "sshpass is required but not installed"
    exit 1
fi

# Hand the password to sshpass through the SSHPASS environment variable
# (`sshpass -e`) instead of embedding it in the command string: this
# avoids quoting bugs when the password itself contains quotes, and it
# keeps the secret out of `ps` output / argv.
export SSHPASS="$PASSWORD"
SSH_CMD="sshpass -e ssh -o StrictHostKeyChecking=no"

print_info "Connecting to $USERNAME@$SERVER_ADDRESS..."
print_info "Checking HF training status on $CLUSTER..."
echo ""
88+
89+
# Execute the status check on the remote server by streaming a script
# to `bash -s` over SSH. The quoted 'ENDSSH' delimiter suppresses local
# expansion, so everything below runs verbatim on the remote host.
# `eval` is required because $SSH_CMD is a command string with options.
eval "$SSH_CMD \"$USERNAME@$SERVER_ADDRESS\" 'bash -s'" << 'ENDSSH'
# Change to project directory
cd ~/llm-stylometry || { echo "ERROR: Project directory ~/llm-stylometry not found"; exit 1; }

# Activate conda environment
if ! command -v conda &> /dev/null; then
    echo "ERROR: conda not found"
    exit 1
fi

eval "$(conda shell.bash hook)" 2>/dev/null || { echo "ERROR: Failed to initialize conda"; exit 1; }
conda activate llm-stylometry 2>/dev/null || { echo "ERROR: llm-stylometry environment not found"; exit 1; }

# Write the status-report script to an unpredictable temp path (mktemp,
# not a fixed /tmp name) and guarantee cleanup on every exit path.
STATUS_SCRIPT=$(mktemp /tmp/check_hf_status.XXXXXX) || { echo "ERROR: mktemp failed"; exit 1; }
trap 'rm -f "$STATUS_SCRIPT"' EXIT
cat > "$STATUS_SCRIPT" << 'ENDPYTHON'
"""Check HuggingFace training status."""

import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta

AUTHORS = ['austen', 'baum', 'dickens', 'fitzgerald', 'melville', 'thompson', 'twain', 'wells']
TARGET_LOSS = 0.1  # HF target loss
PAPER_LOSS = 3.0  # Paper stopping point


def format_timedelta(td):
    """Format a timedelta as a compact human-readable string (e.g. '1d 2h 3m')."""
    total_seconds = int(td.total_seconds())
    days = total_seconds // 86400
    hours = (total_seconds % 86400) // 3600
    minutes = (total_seconds % 3600) // 60

    if days > 0:
        return f"{days}d {hours}h {minutes}m"
    elif hours > 0:
        return f"{hours}h {minutes}m"
    else:
        return f"{minutes}m"


def check_author_status(author):
    """Check HF training status for a single author.

    Returns a status dict, or None when HF training has not started
    (missing/empty/unreadable log, or loss never dropped below PAPER_LOSS).
    """
    # Check seed=0 model (HF training location)
    model_dir = Path(f'models/{author}_tokenizer=gpt2_seed=0')
    loss_log = model_dir / 'loss_logs.csv'

    if not loss_log.exists():
        return None

    try:
        df = pd.read_csv(loss_log)
        if len(df) == 0:
            return None

        # Get latest epoch
        max_epoch = df['epochs_completed'].max()
        train_rows = df[(df['epochs_completed'] == max_epoch) & (df['loss_dataset'] == 'train')]

        if len(train_rows) == 0:
            return None

        current_loss = train_rows.iloc[0]['loss_value']

        # Check if we're in HF training (loss < PAPER_LOSS)
        hf_rows = df[(df['loss_dataset'] == 'train') & (df['loss_value'] < PAPER_LOSS)]

        if len(hf_rows) == 0:
            # Not yet started HF training
            return None

        # Find when HF training started
        hf_start_epoch = int(hf_rows.iloc[0]['epochs_completed'])
        epochs_since_start = int(max_epoch - hf_start_epoch)

        # Estimate elapsed time (rough: 10 sec/epoch with eval skipped)
        elapsed = timedelta(seconds=int(epochs_since_start * 10))

        # Check if complete
        is_complete = current_loss <= TARGET_LOSS

        return {
            'current_epoch': max_epoch,
            'current_loss': current_loss,
            'target_loss': TARGET_LOSS,
            'is_complete': is_complete,
            'hf_start_epoch': hf_start_epoch,
            'epochs_since_start': epochs_since_start,
            'elapsed': elapsed
        }

    except Exception:
        # Treat unreadable/malformed logs the same as "not started".
        return None


# Print report
print("=" * 80)
print("HUGGINGFACE MODEL TRAINING STATUS")
print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 80)

completed_count = 0
in_progress_count = 0
not_started_count = 0

for author in AUTHORS:
    status = check_author_status(author)

    print(f"\n{author.upper()}")
    print("-" * 80)

    if status is None:
        print(" Status: Not started")
        not_started_count += 1
    elif status['is_complete']:
        print(" Status: Complete ✓")
        print(f" Final loss: {status['current_loss']:.4f}")
        print(f" Total epochs: {status['current_epoch']:,}")
        print(f" HF epochs: {status['epochs_since_start']:,} (from epoch {status['hf_start_epoch']})")
        completed_count += 1
    else:
        print(" Status: Training...")
        print(f" Current epoch: {status['current_epoch']:,}")
        print(f" Current loss: {status['current_loss']:.4f}")
        print(f" Target loss: {status['target_loss']:.4f}")
        print(f" HF epochs completed: {status['epochs_since_start']:,}")
        print(f" Elapsed: {format_timedelta(status['elapsed'])}")

        # Estimate remaining time once enough epochs have accrued.
        if status['epochs_since_start'] > 10:
            # Rough estimate: linear interpolation of loss between
            # PAPER_LOSS (~3.0) and TARGET_LOSS (~0.1). Training loss
            # actually decays closer to exponentially, so this tends to
            # be optimistic -- it is only a ballpark figure.
            loss_ratio = (PAPER_LOSS - status['current_loss']) / (PAPER_LOSS - TARGET_LOSS)
            progress_pct = loss_ratio * 100

            # Estimate total HF epochs needed (very rough)
            if loss_ratio > 0:
                estimated_total_hf_epochs = int(status['epochs_since_start'] / loss_ratio)
                remaining_epochs = estimated_total_hf_epochs - status['epochs_since_start']
                eta = timedelta(seconds=remaining_epochs * 10)
                print(f" Progress: {progress_pct:.1f}%")
                print(f" Estimated remaining: {format_timedelta(eta)}")

        in_progress_count += 1

# Summary (denominator derived from AUTHORS instead of a hard-coded 8)
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Completed: {completed_count}/{len(AUTHORS)}")
print(f"In progress: {in_progress_count}/{len(AUTHORS)}")
print(f"Not started: {not_started_count}/{len(AUTHORS)}")

if in_progress_count > 0 or completed_count < len(AUTHORS):
    print("\nTo download completed models:")
    print(" ./sync_hf_models.sh --cluster CLUSTER")
ENDPYTHON

# Execute the Python script (the EXIT trap removes it afterwards)
python3 "$STATUS_SCRIPT"
ENDSSH
258+
259+
# Report the outcome of the remote check.
# NOTE(review): with `set -e` active, a failing ssh command aborts the
# script before this point is reached, so the failure branch below is
# normally unreachable; it is kept as a safety net in case strict mode
# is ever relaxed.
rc=$?
if [ "$rc" -ne 0 ]; then
    print_error "Failed to check training status"
    exit 1
fi

echo ""
print_success "Status check complete!"

0 commit comments

Comments
 (0)