Skip to content

Commit 638a753

Browse files
authored
fixes bug where process isn't completed by the time the process gets read (#675)
1 parent c495035 commit 638a753

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

src/instructlab/training/main_ds.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import logging
77
import os
88
import subprocess
9+
import sys
910
import time
1011
import warnings
1112

@@ -645,11 +646,17 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
645646
if "process" not in locals() or process is None:
646647
return
647648

648-
failure = process.poll() != 0
649+
# wait for the process to exit so we can properly read the exit code
650+
process.wait(timeout=60)
651+
process_code = process.poll()
652+
failure = process_code != 0
653+
649654
if not failure:
650655
logger.info("Operation completed successfully! 🎉")
651656
else:
652-
logger.error("Training subprocess has not exited yet. Sending SIGTERM.")
657+
logger.error(
658+
f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}"
659+
)
653660

654661
process.terminate()
655662
try:

0 commit comments

Comments
 (0)