Commit fae6079
1680 fix run engine filehandle leak (#1682)

* Properly clean up the run engine after use
* Make all tests use the shared fixture instead of creating their own run engines
* Additional patches to ensure filehandles are not leaked in tests
* Utility to detect which tests are leaking
* Clean up dangling signal observers
* Fix the event loop scope of _ensure_running_bluesky_event_loop to be consistent with mx-bluesky

1 parent 7f2d3e3 · commit fae6079

8 files changed: +122, -43 lines
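The central change, visible across the test diffs below, is that tests now receive a shared, session-scoped run engine via the `run_engine` fixture instead of constructing their own `RunEngine`, each of which spins up an event loop whose filehandles were never released. A condensed before/after sketch of the migration (names condensed from tests/devices/test_gridscan.py; the device fixture is assumed):

import bluesky.plan_stubs as bps
from bluesky import RunEngine

# Before: a throwaway engine per test; its event loop leaks filehandles.
async def test_expected_images_old(zebra_fast_grid_scan):
    run_engine = RunEngine(call_returns_result=True)
    result = run_engine(bps.rd(zebra_fast_grid_scan.expected_images))

# After: the fixture from src/dodal/testing/fixtures/run_engine.py is
# injected, reused across tests, and reset between them.
async def test_expected_images_new(zebra_fast_grid_scan, run_engine: RunEngine):
    result = run_engine(bps.rd(zebra_fast_grid_scan.expected_images))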

src/dodal/testing/fixtures/run_engine.py

Lines changed: 77 additions & 8 deletions
@@ -3,32 +3,79 @@
 """

 import asyncio
+import os
+import threading
 import time
-from collections.abc import Mapping
+from collections.abc import AsyncGenerator, Mapping

 import pytest
+import pytest_asyncio
+from _pytest.fixtures import FixtureRequest
 from bluesky.run_engine import RunEngine
 from bluesky.simulators import RunEngineSimulator

 _run_engine = RunEngine()

+_ENABLE_FILEHANDLE_LEAK_CHECKS = (
+    os.getenv("DODAL_ENABLE_FILEHANDLE_LEAK_CHECKS", "").lower() == "true"
+)

-@pytest.fixture(scope="session", autouse=True)
-async def _ensure_running_bluesky_event_loop():
+
+@pytest_asyncio.fixture(scope="session", loop_scope="session", autouse=True)
+async def _ensure_running_bluesky_event_loop(_global_run_engine):
     # make sure the event loop is thoroughly up and running before we try to create
     # any ophyd_async devices which might need it
     timeout = time.monotonic() + 1
-    while not _run_engine.loop.is_running():
+    while not _global_run_engine.loop.is_running():
         await asyncio.sleep(0)
         if time.monotonic() > timeout:
            raise TimeoutError("This really shouldn't happen but just in case...")


 @pytest.fixture()
-def run_engine():
-    global _run_engine
-    _run_engine.reset()
-    return _run_engine
+async def run_engine(_global_run_engine: RunEngine) -> AsyncGenerator[RunEngine, None]:
+    try:
+        yield _global_run_engine
+    finally:
+        _global_run_engine.reset()
+
+
+@pytest_asyncio.fixture(scope="session", loop_scope="session")
+async def _global_run_engine() -> AsyncGenerator[RunEngine, None]:
+    """
+    Obtain a run engine, with its own event loop and thread.
+
+    On closure of the scope, the run engine is stopped and the event loop closed
+    in order to release all resources it consumes.
+    """
+    run_engine = RunEngine({}, call_returns_result=True)
+    yield run_engine
+    try:
+        run_engine.halt()
+    except Exception as e:
+        # Ignore the exception thrown if the run engine is already halted.
+        print(f"Got exception while halting RunEngine {e}")
+    finally:
+
+        async def get_event_loop_thread():
+            """Get the thread which the run engine created for the event loop."""
+            return threading.current_thread()
+
+        fut = asyncio.run_coroutine_threadsafe(get_event_loop_thread(), run_engine.loop)
+        while not fut.done():
+            # It's not clear why this is necessary, given we are
+            # on a completely different thread and event loop,
+            # but without it our future never seems to be populated with a result
+            # despite the coro getting executed.
+            await asyncio.sleep(0)
+        # Terminate the event loop so that we can join() the thread.
+        run_engine.loop.call_soon_threadsafe(run_engine.loop.stop)
+        run_engine_thread = fut.result()
+        run_engine_thread.join()
+        # Close the event loop to release its filehandles; this cannot be
+        # done while the loop is still running.
+        run_engine.loop.close()
+        del run_engine


 @pytest.fixture
@@ -47,3 +94,25 @@ def append_and_print(name, doc):

     run_engine.subscribe(append_and_print)
     return docs
+
+
+@pytest.fixture(autouse=_ENABLE_FILEHANDLE_LEAK_CHECKS)
+def check_for_filehandle_leaks(request: FixtureRequest):
+    """
+    Test fixture that can be enabled in order to check for leaked filehandles
+    (typically caused by a rogue RunEngine instance).
+
+    Note that this fixture is not enabled by default because it imposes
+    significant overhead. When a leak is suspected, usually from seeing a
+    PytestUnraisableExceptionWarning, enable it via autouse and run the full
+    test suite.
+    """
+    pid = os.getpid()
+    _baseline_n_open_files = len(os.listdir(f"/proc/{pid}/fd"))
+    try:
+        yield
+    finally:
+        _n_open_files = len(os.listdir(f"/proc/{pid}/fd"))
+        assert _n_open_files == _baseline_n_open_files, (
+            f"Function {request.function.__name__} leaked some filehandles"
+        )
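The teardown in `_global_run_engine` has to discover the loop's thread from inside the loop itself, because `RunEngine` creates that thread internally and does not expose it. A minimal standalone sketch of the same shutdown sequence, using a plain event loop in place of the engine's:

import asyncio
import threading

loop = asyncio.new_event_loop()
thread = threading.Thread(target=loop.run_forever, daemon=True)
thread.start()

async def which_thread() -> threading.Thread:
    # Executes on the loop's own thread, so it returns the thread to join.
    return threading.current_thread()

fut = asyncio.run_coroutine_threadsafe(which_thread(), loop)
loop_thread = fut.result(timeout=1)
loop.call_soon_threadsafe(loop.stop)  # makes run_forever() return
loop_thread.join()
loop.close()  # releases the loop's filehandles; illegal while still running
assert loop_thread is thread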

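The leak check works by counting entries in /proc/<pid>/fd, which on Linux holds one symlink per filehandle the process has open; it is switched on by running the suite with DODAL_ENABLE_FILEHANDLE_LEAK_CHECKS=true. A standalone illustration of the counting technique (Linux only):

import os

def count_open_fds() -> int:
    # One /proc/<pid>/fd entry per filehandle currently held by the process.
    return len(os.listdir(f"/proc/{os.getpid()}/fd"))

baseline = count_open_fds()
f = open("/dev/null")
assert count_open_fds() == baseline + 1  # the open file shows up here
f.close()
assert count_open_fds() == baseline  # and is released on close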
tests/common/test_watcher_utils.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,7 @@ async def test_log_on_percentage_complete_value_error_on_bad_input():
         match="Percent interval on class _LogOnPercentageProgressWatcher must be a positive number, but received 0",
     ):
         log_on_percentage_complete(status, "", 0)
+    test_watchable.complete_event.set()  # Ensure the signal observer exits


 async def test_log_on_percentage_complete_for_already_updating_status():
@@ -75,3 +76,4 @@ async def do_log():
         log_on_percentage_complete(status, "")

     await asyncio.gather(update_signal(), do_log())
+    test_watchable.complete_event.set()  # Ensure the signal observer exits

tests/devices/oav/image_recognition/test_pin_tip_detect.py

Lines changed: 0 additions & 4 deletions
@@ -1,4 +1,3 @@
-import asyncio
 from unittest.mock import MagicMock, patch

 import numpy as np
@@ -10,9 +9,6 @@
 )
 from dodal.devices.oav.pin_image_recognition.utils import NONE_VALUE, SampleLocation

-EVENT_LOOP = asyncio.new_event_loop()
-
-
 DEVICE_NAME = "pin_tip_detection"
 TRIGGERED_TIP_READING = DEVICE_NAME + "-triggered_tip"
 TRIGGERED_TOP_EDGE_READING = DEVICE_NAME + "-triggered_top_edge"
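The deleted module-level EVENT_LOOP is exactly the kind of leak this commit targets: merely constructing a loop allocates filehandles (on Linux, an epoll descriptor plus a self-pipe socket pair) that only close() releases, and a loop created at import time is never closed. A small demonstration (Linux only):

import asyncio
import os

def n_fds() -> int:
    return len(os.listdir("/proc/self/fd"))

before = n_fds()
loop = asyncio.new_event_loop()
assert n_fds() > before  # the idle loop is already holding filehandles
loop.close()
assert n_fds() == before  # released only by an explicit close()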

tests/devices/oav/test_oav_utils.py

Lines changed: 13 additions & 13 deletions
@@ -40,6 +40,13 @@ async def smargon() -> AsyncGenerator[Smargon]:
     yield smargon


+@pytest.fixture
+async def mock_pin_tip_detect() -> PinTipDetection:
+    async with init_devices(mock=True):
+        mock_pin_tip_detect = PinTipDetection("")
+    return mock_pin_tip_detect
+
+
 @pytest.mark.parametrize(
     "h, v, expected_x, expected_y",
     [
@@ -78,10 +85,10 @@ async def test_values_for_move_so_that_beam_is_at_pixel(
     expected_xyz: tuple,
     oav: OAV,
     smargon: Smargon,
+    run_engine: RunEngine,
 ):
     set_mock_value(oav.zoom_controller.level, zoom_level)
     set_mock_value(smargon.omega.user_readback, angle)
-    run_engine = RunEngine(call_returns_result=True)
     pos = run_engine(
         get_move_required_so_that_beam_is_at_pixel(smargon, pixel_to_move_to, oav)
     ).plan_result  # type: ignore
@@ -90,26 +97,20 @@


 async def test_given_tip_found_when_wait_for_tip_to_be_found_called_then_tip_immediately_returned(
-    run_engine,
+    run_engine: RunEngine, mock_pin_tip_detect: PinTipDetection
 ):
-    async with init_devices(mock=True):
-        mock_pin_tip_detect = PinTipDetection("")
-
-    await mock_pin_tip_detect.connect(mock=True)
     mock_pin_tip_detect._get_tip_and_edge_data = AsyncMock(
         return_value=SampleLocation(100, 100, np.array([]), np.array([]))
     )
-    run_engine = RunEngine(call_returns_result=True)
     result = run_engine(wait_for_tip_to_be_found(mock_pin_tip_detect))
     assert result.plan_result == (100, 100)  # type: ignore
     mock_pin_tip_detect._get_tip_and_edge_data.assert_called_once()


-async def test_given_no_tip_when_wait_for_tip_to_be_found_called_then_exception_thrown():
-    async with init_devices(mock=True):
-        mock_pin_tip_detect = PinTipDetection("")
-
-    await mock_pin_tip_detect.connect(mock=True)
+async def test_given_no_tip_when_wait_for_tip_to_be_found_called_then_exception_thrown(
+    run_engine: RunEngine,
+    mock_pin_tip_detect: PinTipDetection,
+):
     await mock_pin_tip_detect.validity_timeout.set(0.2)
     mock_pin_tip_detect._get_tip_and_edge_data = AsyncMock(
         return_value=SampleLocation(
@@ -119,6 +120,5 @@ async def test_given_no_tip_when_wait_for_tip_to_be_found_called_then_exception_
             np.array([]),
         )
     )
-    run_engine = RunEngine(call_returns_result=True)
     with pytest.raises(PinNotFoundError):
         run_engine(wait_for_tip_to_be_found(mock_pin_tip_detect))

tests/devices/test_gridscan.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,15 @@ async def test_waits_for_running_motion(grid_scan: FastGridScanCommon):
139139
],
140140
)
141141
async def test_given_different_step_numbers_then_expected_images_correct(
142-
zebra_fast_grid_scan: ZebraFastGridScanThreeD, steps, expected_images
142+
run_engine: RunEngine,
143+
zebra_fast_grid_scan: ZebraFastGridScanThreeD,
144+
steps: tuple[int, int, int],
145+
expected_images: int,
143146
):
144147
set_mock_value(zebra_fast_grid_scan.x_steps, steps[0])
145148
set_mock_value(zebra_fast_grid_scan.y_steps, steps[1])
146149
set_mock_value(zebra_fast_grid_scan.z_steps, steps[2])
147150

148-
run_engine = RunEngine(call_returns_result=True)
149-
150151
result = run_engine(bps.rd(zebra_fast_grid_scan.expected_images))
151152

152153
assert result.plan_result == expected_images # type: ignore
@@ -161,13 +162,14 @@ async def test_given_different_step_numbers_then_expected_images_correct(
     ],
 )
 async def test_given_different_2d_step_numbers_then_expected_images_correct(
-    zebra_fast_grid_scan_2d: ZebraFastGridScanTwoD, steps, expected_images
+    zebra_fast_grid_scan_2d: ZebraFastGridScanTwoD,
+    steps: tuple[int, int],
+    expected_images: int,
+    run_engine: RunEngine,
 ):
     set_mock_value(zebra_fast_grid_scan_2d.x_steps, steps[0])
     set_mock_value(zebra_fast_grid_scan_2d.y_steps, steps[1])

-    run_engine = RunEngine(call_returns_result=True)
-
     result = run_engine(bps.rd(zebra_fast_grid_scan_2d.expected_images))

     assert result.plan_result == expected_images  # type: ignore

tests/plan_stubs/test_motor_util_plans.py

Lines changed: 7 additions & 9 deletions
@@ -70,10 +70,8 @@ def test_given_types_of_device_when_home_and_reset_wrapper_called_then_motors_an


 def test_given_a_device_when_check_and_cache_values_then_motor_values_returned(
-    my_device,
+    my_device: DeviceWithOnlyMotors, run_engine: RunEngine
 ):
-    run_engine = RunEngine(call_returns_result=True)
-
     for i, motor in enumerate(my_device.motors, start=1):
         set_mock_value(motor.user_readback, i * 100)

@@ -124,10 +122,12 @@
     ],
 )
 def test_given_a_device_where_one_move_too_small_when_check_and_cache_values_then_other_positions_returned(
-    my_device: DeviceWithOnlyMotors, initial, min, new_position: float
+    my_device: DeviceWithOnlyMotors,
+    initial: float,
+    min: float,
+    new_position: float,
+    run_engine: RunEngine,
 ):
-    run_engine = RunEngine(call_returns_result=True)
-
     set_mock_value(my_device.x.user_readback, initial)
     set_mock_value(my_device.y.user_readback, 200)

@@ -144,10 +144,8 @@ def test_given_a_device_where_one_move_too_small_when_check_and_cache_values_the


 def test_given_a_device_where_all_moves_too_small_when_check_and_cache_values_then_no_positions_returned(
-    my_device,
+    my_device: DeviceWithOnlyMotors, run_engine: RunEngine
 ):
-    run_engine = RunEngine(call_returns_result=True)
-
     set_mock_value(my_device.x.user_readback, 10)
     set_mock_value(my_device.y.user_readback, 20)

tests/plans/test_save_panda.py

Lines changed: 8 additions & 3 deletions
@@ -2,13 +2,18 @@
 from unittest.mock import MagicMock, patch

 import pytest
-from bluesky.simulators import RunEngineSimulator
+from bluesky import RunEngine

 from dodal.plans.save_panda import _save_panda, main


-def test_save_panda():
-    sim_run_engine = RunEngineSimulator()
+@pytest.fixture(autouse=True)
+def patch_run_engine_in_save_panda_to_avoid_leaks(run_engine: RunEngine):
+    with patch("dodal.plans.save_panda.RunEngine", return_value=run_engine):
+        yield
+
+
+def test_save_panda(sim_run_engine):
     panda = MagicMock()
     directory = "test"
     filename = "file.yml"

tests/test_cli.py

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,7 @@
 from unittest.mock import patch

 import pytest
+from bluesky import RunEngine
 from click.testing import CliRunner, Result
 from ophyd.device import DEFAULT_CONNECTION_TIMEOUT
 from ophyd_async.core import (
@@ -19,6 +20,12 @@
 EXAMPLE_BEAMLINE = "i22"


+@pytest.fixture(autouse=True)
+def patch_run_engine_in_cli_to_avoid_leaks(run_engine: RunEngine):
+    with patch("dodal.cli.RunEngine", return_value=run_engine):
+        yield
+
+
 @pytest.fixture
 def runner():
     return CliRunner()
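tests/plans/test_save_panda.py and tests/test_cli.py above share one pattern: code under test that constructs its own `RunEngine` internally is redirected to the shared fixture via `unittest.mock.patch`. A minimal sketch of the pattern against a hypothetical module `mypkg.tool`:

from unittest.mock import patch

import pytest
from bluesky import RunEngine


@pytest.fixture(autouse=True)
def reuse_shared_run_engine(run_engine: RunEngine):
    # Every `RunEngine(...)` call inside mypkg.tool (hypothetical) now hands
    # back the shared session-scoped engine, so the test spawns no extra
    # event loop whose filehandles could leak.
    with patch("mypkg.tool.RunEngine", return_value=run_engine):
        yield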
