#!/usr/bin/env python
"""
Consolidate model training results from individual loss_logs.csv files into a single DataFrame.

This script reads all loss_logs.csv files from the models/ directory and combines them
into a single pandas DataFrame, adding model configuration information. The consolidated
results are saved as data/model_results.pkl for use by visualization scripts.
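
Typical usage (the filename "consolidate_results.py" below is only a placeholder for
whatever this script is saved as; the flags match the argparse options in main()):

    python consolidate_results.py --models-dir models --output data/model_results.pkl --save-csv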
8+ """
9+
10+ import os
11+ import json
12+ import pandas as pd
13+ from pathlib import Path
14+ from tqdm import tqdm
15+
16+
def consolidate_model_results(models_dir='models', output_path='data/model_results.pkl', save_csv=False):
    """
    Consolidate all model training results into a single DataFrame.

    Args:
        models_dir: Directory containing trained models
        output_path: Path to save consolidated pickle file
        save_csv: Also save as CSV for debugging (default: False)

    Returns:
        Consolidated DataFrame with all model results
    """
    models_path = Path(models_dir)

    if not models_path.exists():
        raise FileNotFoundError(f"Models directory not found: {models_dir}")

    all_results = []

    # Find all model directories
    model_dirs = sorted([d for d in models_path.iterdir() if d.is_dir()])

    if not model_dirs:
        raise ValueError(f"No model directories found in {models_dir}")

    print(f"Found {len(model_dirs)} model directories to consolidate")

    for model_dir in tqdm(model_dirs, desc="Consolidating models"):
        # Parse model directory name
        dir_name = model_dir.name
        parts = dir_name.split('_')

        # Extract author and seed from directory name
        # Format: {author}_tokenizer={tokenizer}_seed={seed}
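        # e.g. a hypothetical directory name "smith_tokenizer=bpe_seed=0" would parse
        # to author="smith", tokenizer="bpe", seed=0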
        author = parts[0]

        # Find tokenizer and seed
        tokenizer = None
        seed = None
        for part in parts[1:]:
            if part.startswith('tokenizer='):
                tokenizer = part.split('=')[1]
            elif part.startswith('seed='):
                seed = int(part.split('=')[1])

        if tokenizer is None or seed is None:
            print(f"Warning: Could not parse directory name: {dir_name}")
            continue

        # Read loss logs
        loss_logs_path = model_dir / 'loss_logs.csv'
        if not loss_logs_path.exists():
            print(f"Warning: No loss_logs.csv found in {model_dir}")
            continue

        # Read the CSV file
        df = pd.read_csv(loss_logs_path)

        # Add model metadata
        df['model_name'] = dir_name
        df['author'] = author
        df['tokenizer'] = tokenizer
        df['checkpoint_path'] = str(model_dir)

        # Read config files if they exist
        config_path = model_dir / 'config.json'
        if config_path.exists():
            with open(config_path, 'r') as f:
                config = json.load(f)
            df['model_config'] = json.dumps(config)
        else:
            df['model_config'] = None

        gen_config_path = model_dir / 'generation_config.json'
        if gen_config_path.exists():
            with open(gen_config_path, 'r') as f:
                gen_config = json.load(f)
            df['generation_config'] = json.dumps(gen_config)
        else:
            df['generation_config'] = None

        all_results.append(df)

    # Combine all dataframes
    if not all_results:
        raise ValueError("No usable loss_logs.csv files found in any model directory")
    consolidated_df = pd.concat(all_results, ignore_index=True)

    # Ensure column order matches expected format
    expected_columns = [
        'seed', 'train_author', 'epochs_completed', 'loss_dataset',
        'loss_value', 'model_name', 'author', 'tokenizer',
        'model_config', 'generation_config', 'checkpoint_path'
    ]
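    # The 'seed', 'train_author', 'epochs_completed', 'loss_dataset' and 'loss_value'
    # columns are assumed to come from loss_logs.csv itself; the remaining columns are
    # the metadata added above.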

    # Reorder columns if they all exist
    available_columns = [col for col in expected_columns if col in consolidated_df.columns]
    consolidated_df = consolidated_df[available_columns]

    # Save as pickle
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    consolidated_df.to_pickle(output_path)

    print("\nConsolidation complete!")
    print(f"Total records: {len(consolidated_df)}")
    print(f"Unique models: {consolidated_df['model_name'].nunique()}")
    print(f"Saved to: {output_path}")

    # Optionally save as CSV for debugging/inspection
    if save_csv:
        csv_path = output_path.with_suffix('.csv')
        consolidated_df.to_csv(csv_path, index=False)
        print(f"Also saved CSV for inspection: {csv_path}")

    # Print summary statistics
    print("\nSummary by author:")
    summary = consolidated_df.groupby('train_author')['seed'].nunique()
    for author, num_seeds in summary.items():
        print(f"  {author}: {num_seeds} seeds")

    return consolidated_df


def main():
    """Main entry point for the consolidation script."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Consolidate model training results into a single DataFrame'
    )
    parser.add_argument(
        '--models-dir',
        default='models',
        help='Directory containing trained models (default: models)'
    )
    parser.add_argument(
        '--output',
        default='data/model_results.pkl',
        help='Output path for consolidated pickle file (default: data/model_results.pkl)'
    )
    parser.add_argument(
        '--save-csv',
        action='store_true',
        help='Also save as CSV for debugging'
    )

    args = parser.parse_args()

    try:
        consolidate_model_results(args.models_dir, args.output, save_csv=args.save_csv)
        return 0
    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == '__main__':
    import sys
    sys.exit(main())