Skip to content

Commit babcf0a

Browse files
committed
Fix numpy.int64 to int conversion for timedelta
- Convert epochs to Python int before passing to timedelta
- Fixes: unsupported type for timedelta seconds component
- Critical bug preventing status display

Ref: #38
1 parent f55fcba commit babcf0a

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

check_hf_status.sh

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,7 @@ cat > /tmp/check_hf_status.py << 'ENDPYTHON'
107107
#!/usr/bin/env python
108108
"""Check HuggingFace training status."""
109109
110+
import sys
110111
import pandas as pd
111112
import numpy as np
112113
from pathlib import Path
@@ -136,11 +137,15 @@ def check_author_status(author):
136137
model_dir = Path(f'models/{author}_tokenizer=gpt2_seed=0')
137138
loss_log = model_dir / 'loss_logs.csv'
138139
140+
print(f"[DEBUG] Checking {author}: {loss_log}", file=sys.stderr)
141+
139142
if not loss_log.exists():
143+
print(f"[DEBUG] Loss log not found for {author}", file=sys.stderr)
140144
return None
141145
142146
try:
143147
df = pd.read_csv(loss_log)
148+
print(f"[DEBUG] {author}: {len(df)} rows in loss log", file=sys.stderr)
144149
if len(df) == 0:
145150
return None
146151
@@ -155,21 +160,26 @@ def check_author_status(author):
155160
156161
# Check if we're in HF training (loss < PAPER_LOSS)
157162
hf_rows = df[(df['loss_dataset'] == 'train') & (df['loss_value'] < PAPER_LOSS)]
163+
print(f"[DEBUG] {author}: HF rows (loss < {PAPER_LOSS}): {len(hf_rows)}", file=sys.stderr)
164+
158165
if len(hf_rows) == 0:
159166
# Not yet started HF training
167+
print(f"[DEBUG] {author}: Returning None - no HF training yet", file=sys.stderr)
160168
return None
161169
162170
# Find when HF training started
163-
hf_start_epoch = hf_rows.iloc[0]['epochs_completed']
164-
epochs_since_start = max_epoch - hf_start_epoch
171+
hf_start_epoch = int(hf_rows.iloc[0]['epochs_completed'])
172+
epochs_since_start = int(max_epoch - hf_start_epoch)
173+
print(f"[DEBUG] {author}: HF start epoch {hf_start_epoch}, epochs since: {epochs_since_start}", file=sys.stderr)
165174
166175
# Estimate elapsed time (rough: 10 sec/epoch with eval skipped)
167-
elapsed = timedelta(seconds=epochs_since_start * 10)
176+
elapsed = timedelta(seconds=int(epochs_since_start * 10))
177+
print(f"[DEBUG] {author}: About to return status dict", file=sys.stderr)
168178
169179
# Check if complete
170180
is_complete = current_loss <= TARGET_LOSS
171181
172-
return {
182+
result = {
173183
'current_epoch': max_epoch,
174184
'current_loss': current_loss,
175185
'target_loss': TARGET_LOSS,
@@ -179,7 +189,11 @@ def check_author_status(author):
179189
'elapsed': elapsed
180190
}
181191
192+
print(f"[DEBUG] {author}: Returning status - epoch {max_epoch}, loss {current_loss:.4f}", file=sys.stderr)
193+
return result
194+
182195
except Exception as e:
196+
print(f"[DEBUG] {author}: Exception: {e}", file=sys.stderr)
183197
return None
184198
185199
# Print report

0 commit comments

Comments
 (0)