instructlab · RobotSail · Dec 18, 2025 · Dec 18, 2025 · coderabbitai · Dec 18, 2025
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
@@ -6,6 +6,7 @@
 import logging
 import os
 import subprocess
+import sys
 import time
 import warnings
 
@@ -645,11 +646,17 @@
         if "process" not in locals() or process is None:
             return
 
-        failure = process.poll() != 0
+        # wait for the process to exit so we can properly read the exit code
+        process.wait(timeout=60)
+        process_code = process.poll()
+        failure = process_code != 0
-        # wait for the process to exit so we can properly read the exit code
-        process.wait(timeout=60)
-        process_code = process.poll()
-        failure = process_code != 0
+        # wait for the process to exit so we can properly read the exit code
+        try:
+            process.wait(timeout=60)
+            process_code = process.poll()
+            failure = process_code != 0
+        except subprocess.TimeoutExpired:
+            logger.error("Training subprocess did not exit within 60 seconds.")
+            process_code = None
+            failure = True
-        # wait for the process to exit so we can properly read the exit code
-        process.wait(timeout=60)
-        process_code = process.poll()
-        failure = process_code != 0
+        # wait for the process to exit so we can properly read the exit code
+        try:
+            process.wait(timeout=60)
+            process_code = process.poll()
+            failure = process_code != 0
+        except subprocess.TimeoutExpired:
+            logger.error("Training subprocess did not exit within 60 seconds.")
+            process_code = None
+            failure = True
+
         if not failure:
             logger.info("Operation completed successfully! 🎉")
         else:
-            logger.error("Training subprocess has not exited yet. Sending SIGTERM.")
+            logger.error(
+                f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}"
+            )
-            logger.error(
-                f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}"
-            )
+            logger.error(
+                f"Training subprocess exited with non-zero code: {process_code}"
+            )
-            logger.error(
-                f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}"
-            )
+            logger.error(
+                f"Training subprocess exited with non-zero code: {process_code}"
+            )
 
         process.terminate()
         try: