From bca48d456a8e45ce0964584524b5cc633a151cee Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Thu, 18 Dec 2025 21:23:33 +0000 Subject: [PATCH] fix bug where the training subprocess has not exited by the time its exit code is read --- src/instructlab/training/main_ds.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 4d7ef81a..d08afc91 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -6,6 +6,7 @@ import logging import os import subprocess +import sys import time import warnings @@ -645,11 +646,17 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: if "process" not in locals() or process is None: return - failure = process.poll() != 0 + # wait for the process to exit so we can properly read the exit code + process.wait(timeout=60) + process_code = process.poll() + failure = process_code != 0 + if not failure: logger.info("Operation completed successfully! 🎉") else: - logger.error("Training subprocess has not exited yet. Sending SIGTERM.") + logger.error( + f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}" + ) process.terminate() try: