Commit 69ed635

Add consolidate_model_results.py and update documentation
- Created consolidate_model_results.py to consolidate model training results
- Removed redundant data/model_results.csv (pickle file is primary format)
- Updated README to remove direct references to code folder scripts
- Simplified training documentation to use CLI instead of manual steps
- Added consolidate_model_results.py to repository structure in README
1 parent cffe878 · commit 69ed635

File tree: 3 files changed, +184 −700959 lines (README.md, the new consolidate_model_results.py, and the deleted data/model_results.csv, which accounts for the bulk of the removed lines)


README.md

Lines changed: 10 additions & 13 deletions
````diff
@@ -21,15 +21,10 @@ llm-stylometry/
 │   ├── utils/               # Helper utilities
 │   ├── visualization/       # Plotting and visualization
 │   └── cli_utils.py         # CLI helper functions
-├── code/                    # Original analysis scripts
-│   ├── main.py              # Model training script
-│   ├── clean.py             # Data preprocessing
-│   └── ...                  # Various analysis scripts
 ├── data/                    # Datasets and results
 │   ├── raw/                 # Original texts from Project Gutenberg
 │   ├── cleaned/             # Preprocessed texts by author
-│   ├── model_results.pkl    # Consolidated model training results
-│   └── model_results.csv    # Model results in CSV format
+│   └── model_results.pkl    # Consolidated model training results
 ├── models/                  # Trained models (80 total)
 │   └── {author}_tokenizer=gpt2_seed={0-9}/
 ├── paper/                   # LaTeX paper and figures
@@ -40,6 +35,7 @@ llm-stylometry/
 │   ├── data/                # Test data and fixtures
 │   ├── test_*.py            # Test modules
 │   └── check_outputs.py     # Output validation script
+├── consolidate_model_results.py  # Consolidate training results
 ├── generate_figures.py      # Main CLI entry point
 ├── run_llm_stylometry.sh    # Shell wrapper for easy setup
 ├── LICENSE                  # MIT License
@@ -168,16 +164,17 @@ fig = generate_all_losses_figure(
 **Note**: Training requires a CUDA-enabled GPU and takes significant time (~80 models total).
 
 ```bash
-# Using the CLI (recommended)
+# Using the CLI (recommended - handles all steps automatically)
 ./run_llm_stylometry.sh --train
-
-# Or manually
-conda activate llm-stylometry
-python code/clean.py                 # Clean data
-python code/main.py                  # Train models
-python consolidate_model_results.py  # Consolidate results
 ```
 
+This command will:
+1. Clean and prepare the data if needed
+2. Train all 80 models (8 authors × 10 seeds)
+3. Consolidate results into `data/model_results.pkl`
+
+The training pipeline automatically handles data preparation, model training across available GPUs, and result consolidation. Individual model checkpoints and loss logs are saved in the `models/` directory.
+
 ### Model Configuration
 
 Each model uses:
````
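For downstream analysis, the consolidated results can be loaded back with pandas. A minimal sketch (the column names follow the expected schema in consolidate_model_results.py; exact contents depend on what each model's loss_logs.csv records):

```python
import pandas as pd

# Load the consolidated training results produced by the pipeline
results = pd.read_pickle("data/model_results.pkl")

# Each row is one logged loss value for one trained model; columns include
# seed, train_author, epochs_completed, loss_dataset, and loss_value
print(results[["train_author", "seed", "epochs_completed", "loss_value"]].head())
```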
consolidate_model_results.py

Lines changed: 174 additions & 0 deletions
```python
#!/usr/bin/env python
"""
Consolidate model training results from individual loss_logs.csv files into a single DataFrame.

This script reads all loss_logs.csv files from the models/ directory and combines them
into a single pandas DataFrame, adding model configuration information. The consolidated
results are saved as data/model_results.pkl for use by visualization scripts.
"""

import argparse
import json
import sys
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def consolidate_model_results(models_dir='models', output_path='data/model_results.pkl', save_csv=False):
    """
    Consolidate all model training results into a single DataFrame.

    Args:
        models_dir: Directory containing trained models
        output_path: Path to save consolidated pickle file
        save_csv: Also save as CSV for debugging (default: False)

    Returns:
        Consolidated DataFrame with all model results
    """
    models_path = Path(models_dir)

    if not models_path.exists():
        raise FileNotFoundError(f"Models directory not found: {models_dir}")

    all_results = []

    # Find all model directories
    model_dirs = sorted(d for d in models_path.iterdir() if d.is_dir())

    if not model_dirs:
        raise ValueError(f"No model directories found in {models_dir}")

    print(f"Found {len(model_dirs)} model directories to consolidate")

    for model_dir in tqdm(model_dirs, desc="Consolidating models"):
        # Parse model directory name
        # Format: {author}_tokenizer={tokenizer}_seed={seed}
        dir_name = model_dir.name
        parts = dir_name.split('_')
        author = parts[0]

        # Find tokenizer and seed
        tokenizer = None
        seed = None
        for part in parts[1:]:
            if part.startswith('tokenizer='):
                tokenizer = part.split('=')[1]
            elif part.startswith('seed='):
                seed = int(part.split('=')[1])

        if tokenizer is None or seed is None:
            print(f"Warning: Could not parse directory name: {dir_name}")
            continue

        # Read loss logs
        loss_logs_path = model_dir / 'loss_logs.csv'
        if not loss_logs_path.exists():
            print(f"Warning: No loss_logs.csv found in {model_dir}")
            continue

        df = pd.read_csv(loss_logs_path)

        # Add model metadata
        df['model_name'] = dir_name
        df['author'] = author
        df['tokenizer'] = tokenizer
        df['checkpoint_path'] = str(model_dir)

        # Attach config files if they exist
        config_path = model_dir / 'config.json'
        if config_path.exists():
            with open(config_path, 'r') as f:
                config = json.load(f)
            df['model_config'] = json.dumps(config)
        else:
            df['model_config'] = None

        gen_config_path = model_dir / 'generation_config.json'
        if gen_config_path.exists():
            with open(gen_config_path, 'r') as f:
                gen_config = json.load(f)
            df['generation_config'] = json.dumps(gen_config)
        else:
            df['generation_config'] = None

        all_results.append(df)

    # Guard against an empty concat if no directory yielded usable logs
    if not all_results:
        raise ValueError(f"No usable loss logs found in {models_dir}")

    # Combine all dataframes
    consolidated_df = pd.concat(all_results, ignore_index=True)

    # Reorder columns to the expected format, keeping only those present
    expected_columns = [
        'seed', 'train_author', 'epochs_completed', 'loss_dataset',
        'loss_value', 'model_name', 'author', 'tokenizer',
        'model_config', 'generation_config', 'checkpoint_path'
    ]
    available_columns = [col for col in expected_columns if col in consolidated_df.columns]
    consolidated_df = consolidated_df[available_columns]

    # Save as pickle
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    consolidated_df.to_pickle(output_path)

    print("\nConsolidation complete!")
    print(f"Total records: {len(consolidated_df)}")
    print(f"Unique models: {consolidated_df['model_name'].nunique()}")
    print(f"Saved to: {output_path}")

    # Optionally save as CSV for debugging/inspection
    if save_csv:
        csv_path = output_path.with_suffix('.csv')
        consolidated_df.to_csv(csv_path, index=False)
        print(f"Also saved CSV for inspection: {csv_path}")

    # Print summary statistics
    print("\nSummary by author:")
    summary = consolidated_df.groupby('train_author')['seed'].nunique()
    for author, num_seeds in summary.items():
        print(f"  {author}: {num_seeds} seeds")

    return consolidated_df


def main():
    """Main entry point for the consolidation script."""
    parser = argparse.ArgumentParser(
        description='Consolidate model training results into a single DataFrame'
    )
    parser.add_argument(
        '--models-dir',
        default='models',
        help='Directory containing trained models (default: models)'
    )
    parser.add_argument(
        '--output',
        default='data/model_results.pkl',
        help='Output path for consolidated pickle file (default: data/model_results.pkl)'
    )
    parser.add_argument(
        '--save-csv',
        action='store_true',
        help='Also save as CSV for debugging'
    )
    args = parser.parse_args()

    try:
        consolidate_model_results(args.models_dir, args.output, save_csv=args.save_csv)
        return 0
    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == '__main__':
    sys.exit(main())
```
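For reference, the script can be run directly (`python consolidate_model_results.py --save-csv`) or imported. A minimal sketch of programmatic use, assuming trained models already exist under `models/`:

```python
from consolidate_model_results import consolidate_model_results

# Rebuild data/model_results.pkl and also write a CSV copy for inspection
df = consolidate_model_results(
    models_dir="models",
    output_path="data/model_results.pkl",
    save_csv=True,
)

# Sanity check: each of the 8 authors should have results from 10 seeds
print(df.groupby("train_author")["seed"].nunique())
```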
