fix(robot-server, api): fix command history accumulation across protocol runs (#19108)

mjhuff · SyntaxColoring · sfoster1 · web-flow · commit e7e9c39abde7 · 2025-08-04T10:26:37.000-04:00
Closes [RQA-3917](https://opentrons.atlassian.net/browse/RQA-3917)

# Overview

Although the protocol engine is eventually dereferenced during the protocol run lifecycle, there exists a circular reference between command history and additional objects that isn't well captured by the gc lib or memray call stack analysis. Fully clearing the run's `CommandHistory` before dereferencing the run orchestrator eliminates a memory leak.

## Test Plan and Hands on Testing

- See ticket for the wonderful script made by @SyntaxColoring, which was modified to run a step-intensive protocol 25 times in a simulated environment. The following outputs were generated with memray, passing no additional flags when generating the bin file. To convert the bin file to an HTML file, the following was run: `memray flamegraph --split-threads --temporal /path/to/file`.

### Twenty-Five Simulated Protocol Runs (with PR only)

<img width="1451" height="411" alt="Screenshot 2025-08-01 at 4 20 37 PM" src="https://github.com/user-attachments/assets/577ea481-8106-43e9-9ff8-8f41e3dbcae3" />

Note the total memory usage (which may vary by robot). Compare with `edge` output, below. We expect to see some increase in total heap usage up to a point as various caching occurs. After run 17, there is no further apparent memory increase.

### Twenty-Five Simulated Protocol Runs (`edge` prior to any recent memory fixes, without PR)

<img width="1475" height="380" alt="Screenshot 2025-08-01 at 10 48 33 PM" src="https://github.com/user-attachments/assets/95c8b516-011e-4f8c-a890-505ad57bd1e2" />

Note that after the 25th run, total `opentrons-robot-server` heap allocation is substantially greater than in the above case.

### Twenty-Five Simulated Protocol Runs (with PR), No LRU Caching, #19107 Cherry-Pick Included

<img width="1464" height="358" alt="Screenshot 2025-08-01 at 4 28 56 PM" src="https://github.com/user-attachments/assets/df9ad20c-ddfd-48c4-8b15-f3f350d68a5a" />

Effectively no increase in memory utilization after initialization and the completion of the second protocol run.

### Two Real Protocol Runs (with PR), No LRU Caching Included

<img width="1459" height="370" alt="Screenshot 2025-08-01 at 4 30 29 PM" src="https://github.com/user-attachments/assets/da3f2fa8-294e-4639-a551-423e86ed375f" />

The various spikes during the run are because of camera captures via HTTP.

### Six Real Protocol Runs (with PR, #19107, #19110, #19109, #19071)

<img width="1458" height="361" alt="Screenshot 2025-08-01 at 11 03 41 PM" src="https://github.com/user-attachments/assets/5bd18f3f-4a78-4540-8502-b5ec0a5b2e9f" />

Runs lasted between 10 and 40 minutes. The end-of-run memory for run 2 is 504MB, which is equal to the end-of-run memory for run 6. The memray HTML analysis file is too large to attach directly on GitHub, but it's included in the ticket.

## Changelog

- Fixed command history accumulating in memory across protocol runs.

## Risk assessment

low - we are clearing state immediately before we dereference the run orchestrator, at which point we don't expect this state to be available, anyway.

[RQA-3917]: https://opentrons.atlassian.net/browse/RQA-3917?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

---------

Co-authored-by: Max Marrone <max@opentrons.com>
Co-authored-by: Seth Foster <seth@opentrons.com>
diff --git a/api/src/opentrons/protocol_engine/protocol_engine.py b/api/src/opentrons/protocol_engine/protocol_engine.py
@@ -645,6 +645,10 @@ def set_error_recovery_policy(self, policy: ErrorRecoveryPolicy) -> None:
         """Replace the run's error recovery policy with a new one."""
         self._action_dispatcher.dispatch(SetErrorRecoveryPolicyAction(policy))
 
+    def clear_command_history(self) -> None:
+        """Clear command history."""
+        self._state_store.clear_command_history()
+
 
 # TODO(tz, 7-12-23): move this to shared data when we don't rely on ErrorOccurrence
 def code_in_error_tree(
diff --git a/api/src/opentrons/protocol_engine/state/command_history.py b/api/src/opentrons/protocol_engine/state/command_history.py
@@ -255,6 +255,19 @@ def set_command_failed(self, command: Command) -> None:
         self._set_most_recently_completed_command_id(command.id)
         self._all_failed_command_ids.append(command.id)
 
+    # TODO(jh, 08-01-25) Although protocol engine is garbage collected, command history persists in memory between protocol runs.
+    # Explicitly clearing all history before dereferencing protocol engine and the run's run orchestrator eliminates
+    # memory accumulation. Investigate further.
+    def clear(self) -> None:
+        """Clear state."""
+        self._commands_by_id.clear()
+        self._all_command_ids.clear()
+        self._all_failed_command_ids.clear()
+        self._all_command_ids_but_fixit_command_ids.clear()
+        self._queued_command_ids.clear()
+        self._queued_setup_command_ids.clear()
+        self._queued_fixit_command_ids.clear()
+
     def _add(self, command_id: str, command_entry: CommandEntry) -> None:
         """Create or update a command entry."""
         if command_id not in self._commands_by_id:
diff --git a/api/src/opentrons/protocol_engine/state/commands.py b/api/src/opentrons/protocol_engine/state/commands.py
@@ -301,6 +301,10 @@ def handle_action(self, action: Action) -> None:
             case _:
                 pass
 
+    def clear_history(self) -> None:
+        """Clears CommandHistory state."""
+        self._state.command_history.clear()
+
     def _handle_queue_command_action(self, action: QueueCommandAction) -> None:
         # TODO(mc, 2021-06-22): mypy has trouble with this automatic
         # request > command mapping, figure out how to type precisely
diff --git a/api/src/opentrons/protocol_engine/state/state.py b/api/src/opentrons/protocol_engine/state/state.py
@@ -317,6 +317,10 @@ def predicate() -> _ReturnT:
 
         return await self._wait_for(condition=predicate, truthiness_to_wait_for=True)
 
+    def clear_command_history(self) -> None:
+        """Clear CommandHistory state."""
+        self._command_store.clear_history()
+
     async def wait_for_not(
         self,
         condition: Callable[_ParamsT, _ReturnT],
diff --git a/api/src/opentrons/protocol_runner/run_orchestrator.py b/api/src/opentrons/protocol_runner/run_orchestrator.py
@@ -490,3 +490,7 @@ def _map_parse_mode_to_python_parse_mode(parse_mode: ParseMode) -> PythonParseMo
             return PythonParseMode.ALLOW_LEGACY_METADATA_AND_REQUIREMENTS
         else:
             raise UnknownProtocolParseMode()
+
+    def clear_command_history(self) -> None:
+        """Force cleanup of command history."""
+        self._protocol_engine.clear_command_history()
diff --git a/robot-server/robot_server/runs/run_orchestrator_store.py b/robot-server/robot_server/runs/run_orchestrator_store.py
@@ -295,7 +295,9 @@ async def clear(self) -> RunResult:
         run_time_parameters = self.run_orchestrator.get_run_time_parameters()
         command_annotations = self.run_orchestrator.get_command_annotations()
 
-        self._run_orchestrator = None
+        if self._run_orchestrator is not None:
+            self._run_orchestrator.clear_command_history()
+            self._run_orchestrator = None
 
         return RunResult(
             state_summary=run_data,
diff --git a/robot-server/tests/runs/test_run_orchestrator_store.py b/robot-server/tests/runs/test_run_orchestrator_store.py
@@ -215,7 +215,12 @@ async def test_clear_engine(subject: RunOrchestratorStore) -> None:
         notify_publishers=mock_notify_publishers,
     )
     assert subject._run_orchestrator is not None
+    engine = subject._run_orchestrator._protocol_engine
+    engine.state_view.state.commands.command_history._queued_command_ids.add("1231")
     result = await subject.clear()
+    assert (
+        len(engine.state_view.state.commands.command_history._queued_command_ids) == 0
+    )
 
     assert subject.current_run_id is None
     assert isinstance(result, RunResult)