Enable crash dump of input image (#1921)

grzegorz-roboflow · rvirani1 · PawelPeczek-Roboflow · web-flow · commit 239a3171a30a · 2026-01-22T12:18:23.000+01:00
* Pella debug

* only change instance segmentation

* handle debug on execution engine level

* remove error enforcement

* add -rc10 to jetson 5.1.1 ci

* port av fix

* Revert .github

* revert requirements

* reformat

* INFERENCE_DEBUG_OUTPUT_DIR must exist if specified

* Store crash info only if INFERENCE_DEBUG_OUTPUT_DIR specified

* revert utils

* code review

* fix

---------

Co-authored-by: Riaz Virani &lt;riaz@roboflow.com&gt;
Co-authored-by: Paweł Pęczek &lt;146137186+PawelPeczek-Roboflow@users.noreply.github.com&gt;
diff --git a/inference/core/env.py b/inference/core/env.py
@@ -356,6 +356,7 @@
 
 # Model cache directory, default is "/tmp/cache"
 MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", "/tmp/cache")
+INFERENCE_DEBUG_OUTPUT_DIR = os.environ.get("INFERENCE_DEBUG_OUTPUT_DIR")
 
 # Model ID, default is None
 MODEL_ID = os.getenv("MODEL_ID")
diff --git a/inference/core/workflows/execution_engine/v1/executor/core.py b/inference/core/workflows/execution_engine/v1/executor/core.py
@@ -1,14 +1,21 @@
+import os
+import traceback
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from functools import partial
 from typing import Any, Callable, Dict, List, Optional, Set
+from uuid import uuid4
+
+import cv2
+import numpy as np
 
 try:
     from inference_sdk.config import execution_id
 except ImportError:
     execution_id = None
 
 from inference.core import logger
+from inference.core.env import INFERENCE_DEBUG_OUTPUT_DIR
 from inference.core.workflows.errors import StepExecutionError, WorkflowError
 from inference.core.workflows.execution_engine.profiling.core import (
     NullWorkflowsProfiler,
@@ -37,6 +44,31 @@
 from inference.usage_tracking.collector import usage_collector
 
 
+def _store_crash_info(
+    image: np.ndarray,
+    exception: Optional[Exception] = None,
+) -> None:
+    if image is None or not INFERENCE_DEBUG_OUTPUT_DIR:
+        logger.error("Failed attempt to store crash info")
+        return
+    try:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        file_name = f"image_{timestamp}_{uuid4().hex[:5]}"
+        os.makedirs(INFERENCE_DEBUG_OUTPUT_DIR, exist_ok=True)
+        if exception is not None:
+            traceback_str = traceback.format_exc()
+            with open(
+                os.path.join(INFERENCE_DEBUG_OUTPUT_DIR, f"{file_name}.txt"), "w"
+            ) as f:
+                f.write(str(exception))
+                f.write("\n")
+                f.write(traceback_str)
+        image_path = os.path.join(INFERENCE_DEBUG_OUTPUT_DIR, f"{file_name}.jpg")
+        cv2.imwrite(image_path, image)
+    except Exception as e:
+        logger.error(f"Failed to store crash info: {e}")
+
+
 @usage_collector("workflows")
 @execution_phase(
     name="workflow_execution",
@@ -243,7 +275,17 @@ def run_simd_step_in_batch_mode(
             # no inputs - discarded either by conditional exec or by not accepting empty
             outputs = []
         else:
-            outputs = step_instance.run(**step_input.parameters)
+            try:
+                outputs = step_instance.run(**step_input.parameters)
+            except Exception as exc:
+                if INFERENCE_DEBUG_OUTPUT_DIR:
+                    _store_crash_info(
+                        image=execution_data_manager._runtime_parameters["image"][
+                            0
+                        ].numpy_image,
+                        exception=exc,
+                    )
+                raise exc
     with profiler.profile_execution_phase(
         name="step_output_registration",
         categories=["execution_engine_operation"],