Skip to content

Commit 239a317

Browse files
grzegorz-roboflow, rvirani1, PawelPeczek-Roboflow
authored
Enable crash dump of input image (#1921)
* Pella debug * only change instance segmentation * handle debug on execution engine level * remove error enforcement * add -rc10 to jetson 5.1.1 ci * port av fix * Revert .github * revert requirements * reformat * INFERENCE_DEBUG_OUTPUT_DIR must exist if specified * Store crash info only if INFERENCE_DEBUG_OUTPUT_DIR specified * revert utils * code review * fix --------- Co-authored-by: Riaz Virani <riaz@roboflow.com> Co-authored-by: Paweł Pęczek <146137186+PawelPeczek-Roboflow@users.noreply.github.com>
1 parent 3bbbc65 commit 239a317

File tree

2 files changed

+44
-1
lines changed
  • inference/core

2 files changed

+44
-1
lines changed

inference/core/env.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@
356356

357357
# Model cache directory, default is "/tmp/cache"
358358
MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", "/tmp/cache")
359+
# Directory for crash debug output (failing input image and traceback), default is None
INFERENCE_DEBUG_OUTPUT_DIR = os.environ.get("INFERENCE_DEBUG_OUTPUT_DIR")
359360

360361
# Model ID, default is None
361362
MODEL_ID = os.getenv("MODEL_ID")

inference/core/workflows/execution_engine/v1/executor/core.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1+
import os
2+
import traceback
13
from concurrent.futures import ThreadPoolExecutor
24
from datetime import datetime
35
from functools import partial
46
from typing import Any, Callable, Dict, List, Optional, Set
7+
from uuid import uuid4
8+
9+
import cv2
10+
import numpy as np
511

612
try:
713
from inference_sdk.config import execution_id
814
except ImportError:
915
execution_id = None
1016

1117
from inference.core import logger
18+
from inference.core.env import INFERENCE_DEBUG_OUTPUT_DIR
1219
from inference.core.workflows.errors import StepExecutionError, WorkflowError
1320
from inference.core.workflows.execution_engine.profiling.core import (
1421
NullWorkflowsProfiler,
@@ -37,6 +44,31 @@
3744
from inference.usage_tracking.collector import usage_collector
3845

3946

47+
def _store_crash_info(
    image: np.ndarray,
    exception: Optional[Exception] = None,
) -> None:
    """Dump the input image (and optional error details) of a crashed run.

    Files are written into ``INFERENCE_DEBUG_OUTPUT_DIR`` under a shared base
    name ``image_<timestamp>_<5 hex chars>``: a ``.jpg`` with the image and,
    when ``exception`` is given, a ``.txt`` with the exception message
    followed by its traceback.

    This is a best-effort diagnostic helper: it never raises. Any failure is
    reported through ``logger.error`` so that it cannot mask the error that
    triggered the dump.

    Args:
        image: Image to persist; passed as-is to ``cv2.imwrite``.
        exception: Exception that caused the crash, if available.
    """
    if image is None or not INFERENCE_DEBUG_OUTPUT_DIR:
        logger.error("Failed attempt to store crash info")
        return
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        # Random suffix avoids collisions between dumps created within the
        # same microsecond (e.g. from parallel workers).
        file_name = f"image_{timestamp}_{uuid4().hex[:5]}"
        os.makedirs(INFERENCE_DEBUG_OUTPUT_DIR, exist_ok=True)
        if exception is not None:
            # Format the traceback from the exception object itself rather
            # than from the interpreter's "currently handled" exception, so
            # the dump is correct even when called outside an except block.
            traceback_str = "".join(
                traceback.format_exception(
                    type(exception), exception, exception.__traceback__
                )
            )
            # Explicit UTF-8 so a non-ASCII error message cannot abort the
            # dump on hosts with a restrictive default locale encoding.
            with open(
                os.path.join(INFERENCE_DEBUG_OUTPUT_DIR, f"{file_name}.txt"),
                "w",
                encoding="utf-8",
            ) as f:
                f.write(str(exception))
                f.write("\n")
                f.write(traceback_str)
        image_path = os.path.join(INFERENCE_DEBUG_OUTPUT_DIR, f"{file_name}.jpg")
        # cv2.imwrite signals failure through its boolean result instead of
        # raising, so check it explicitly.
        if not cv2.imwrite(image_path, image):
            logger.error(
                f"Failed to store crash info: could not write image to {image_path}"
            )
    except Exception as e:
        logger.error(f"Failed to store crash info: {e}")
70+
71+
4072
@usage_collector("workflows")
4173
@execution_phase(
4274
name="workflow_execution",
@@ -243,7 +275,17 @@ def run_simd_step_in_batch_mode(
243275
# no inputs - discarded either by conditional exec or by not accepting empty
244276
outputs = []
245277
else:
246-
outputs = step_instance.run(**step_input.parameters)
278+
try:
279+
outputs = step_instance.run(**step_input.parameters)
280+
except Exception as exc:
281+
if INFERENCE_DEBUG_OUTPUT_DIR:
282+
_store_crash_info(
283+
image=execution_data_manager._runtime_parameters["image"][
284+
0
285+
].numpy_image,
286+
exception=exc,
287+
)
288+
raise exc
247289
with profiler.profile_execution_phase(
248290
name="step_output_registration",
249291
categories=["execution_engine_operation"],

0 commit comments

Comments
 (0)