@@ -107,6 +107,7 @@ cat > /tmp/check_hf_status.py << 'ENDPYTHON'
107107#!/usr/bin/env python
108108"""Check HuggingFace training status."""
109109
110+ import sys
110111import pandas as pd
111112import numpy as np
112113from pathlib import Path
@@ -136,11 +137,15 @@ def check_author_status(author):
136137 model_dir = Path(f'models/{author}_tokenizer=gpt2_seed=0')
137138 loss_log = model_dir / 'loss_logs.csv'
138139
140+ print(f"[DEBUG] Checking {author}: {loss_log}", file=sys.stderr)
141+
139142 if not loss_log.exists():
143+ print(f"[DEBUG] Loss log not found for {author}", file=sys.stderr)
140144 return None
141145
142146 try:
143147 df = pd.read_csv(loss_log)
148+ print(f"[DEBUG] {author}: {len(df)} rows in loss log", file=sys.stderr)
144149 if len(df) == 0:
145150 return None
146151
@@ -155,21 +160,26 @@ def check_author_status(author):
155160
156161 # Check if we're in HF training (loss < PAPER_LOSS)
157162 hf_rows = df[(df['loss_dataset'] == 'train') & (df['loss_value'] < PAPER_LOSS)]
163+ print(f"[DEBUG] {author}: HF rows (loss < {PAPER_LOSS}): {len(hf_rows)}", file=sys.stderr)
164+
158165 if len(hf_rows) == 0:
159166 # Not yet started HF training
167+ print(f"[DEBUG] {author}: Returning None - no HF training yet", file=sys.stderr)
160168 return None
161169
162170 # Find when HF training started
163- hf_start_epoch = hf_rows.iloc[0]['epochs_completed']
164- epochs_since_start = max_epoch - hf_start_epoch
171+ hf_start_epoch = int(hf_rows.iloc[0]['epochs_completed'])
172+ epochs_since_start = int(max_epoch - hf_start_epoch)
173+ print(f"[DEBUG] {author}: HF start epoch {hf_start_epoch}, epochs since: {epochs_since_start}", file=sys.stderr)
165174
166175 # Estimate elapsed time (rough: 10 sec/epoch with eval skipped)
167- elapsed = timedelta(seconds=epochs_since_start * 10)
176+ elapsed = timedelta(seconds=int(epochs_since_start * 10))
177+ print(f"[DEBUG] {author}: About to return status dict", file=sys.stderr)
168178
169179 # Check if complete
170180 is_complete = current_loss <= TARGET_LOSS
171181
172- return {
182+ result = {
173183 'current_epoch': max_epoch,
174184 'current_loss': current_loss,
175185 'target_loss': TARGET_LOSS,
@@ -179,7 +189,11 @@ def check_author_status(author):
179189 'elapsed': elapsed
180190 }
181191
192+ print(f"[DEBUG] {author}: Returning status - epoch {max_epoch}, loss {current_loss:.4f}", file=sys.stderr)
193+ return result
194+
182195 except Exception as e:
196+ print(f"[DEBUG] {author}: Exception: {e}", file=sys.stderr)
183197 return None
184198
185199# Print report
0 commit comments