
Commit 86ee6e3

seanzhougoogle authored and copybara-github committed
fix: Close runners after running eval
this fixes #2196
PiperOrigin-RevId: 808618368
1 parent bf4ff31 commit 86ee6e3

File tree

3 files changed: +207 −42 lines


src/google/adk/evaluation/evaluation_generator.py

Lines changed: 41 additions & 42 deletions
@@ -170,54 +170,53 @@ async def _generate_inferences_from_root_agent(
     if not artifact_service:
       artifact_service = InMemoryArtifactService()
 
-    runner = Runner(
-        app_name=app_name,
-        agent=root_agent,
-        artifact_service=artifact_service,
-        session_service=session_service,
-        memory_service=memory_service,
-    )
-
     # Reset agent state for each query
     if callable(reset_func):
       reset_func()
 
     response_invocations = []
 
-    for invocation in invocations:
-      final_response = None
-      user_content = invocation.user_content
-      tool_uses = []
-      invocation_id = ""
-
-      async with Aclosing(
-          runner.run_async(
-              user_id=user_id, session_id=session_id, new_message=user_content
-          )
-      ) as agen:
-        async for event in agen:
-          invocation_id = (
-              event.invocation_id if not invocation_id else invocation_id
-          )
-
-          if (
-              event.is_final_response()
-              and event.content
-              and event.content.parts
-          ):
-            final_response = event.content
-          elif event.get_function_calls():
-            for call in event.get_function_calls():
-              tool_uses.append(call)
-
-      response_invocations.append(
-          Invocation(
-              invocation_id=invocation_id,
-              user_content=user_content,
-              final_response=final_response,
-              intermediate_data=IntermediateData(tool_uses=tool_uses),
-          )
-      )
+    async with Runner(
+        app_name=app_name,
+        agent=root_agent,
+        artifact_service=artifact_service,
+        session_service=session_service,
+        memory_service=memory_service,
+    ) as runner:
+      for invocation in invocations:
+        final_response = None
+        user_content = invocation.user_content
+        tool_uses = []
+        invocation_id = ""
+
+        async with Aclosing(
+            runner.run_async(
+                user_id=user_id, session_id=session_id, new_message=user_content
+            )
+        ) as agen:
+          async for event in agen:
+            invocation_id = (
+                event.invocation_id if not invocation_id else invocation_id
+            )
+
+            if (
+                event.is_final_response()
+                and event.content
+                and event.content.parts
+            ):
+              final_response = event.content
+            elif event.get_function_calls():
+              for call in event.get_function_calls():
+                tool_uses.append(call)
+
+        response_invocations.append(
+            Invocation(
+                invocation_id=invocation_id,
+                user_content=user_content,
+                final_response=final_response,
+                intermediate_data=IntermediateData(tool_uses=tool_uses),
+            )
+        )
 
     return response_invocations

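For context, the new async-with form above is roughly equivalent to the try/finally sketch below. This is not part of the commit; it only spells out what Runner.__aexit__ does (it simply awaits close(), as the runners.py diff further down shows):

runner = Runner(
    app_name=app_name,
    agent=root_agent,
    artifact_service=artifact_service,
    session_service=session_service,
    memory_service=memory_service,
)
try:
  # Run inference for each invocation, exactly as in the loop above.
  ...
finally:
  # Guaranteed to run even if inference raises; this is what closes the
  # agent's MCP toolsets and avoids the leaked-runner behavior from #2196.
  await runner.close()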
src/google/adk/runners.py

Lines changed: 22 additions & 0 deletions
@@ -725,13 +725,35 @@ async def _cleanup_toolsets(self, toolsets_to_close: set[BaseToolset]):
         logger.info('Successfully closed toolset: %s', type(toolset).__name__)
       except asyncio.TimeoutError:
         logger.warning('Toolset %s cleanup timed out', type(toolset).__name__)
+      except asyncio.CancelledError as e:
+        # Handle cancel scope issues in Python 3.10 and 3.11 with anyio
+        #
+        # Root cause: MCP library uses anyio.CancelScope() in RequestResponder.__enter__()
+        # and __exit__() methods. When asyncio.wait_for() creates a new task for cleanup,
+        # the cancel scope is entered in one task context but exited in another.
+        #
+        # Python 3.12+ fixes: Enhanced task context management (Task.get_context()),
+        # improved context propagation across task boundaries, and better cancellation
+        # handling prevent the cross-task cancel scope violation.
+        logger.warning(
+            'Toolset %s cleanup cancelled: %s', type(toolset).__name__, e
+        )
       except Exception as e:
         logger.error('Error closing toolset %s: %s', type(toolset).__name__, e)
 
   async def close(self):
     """Closes the runner."""
     await self._cleanup_toolsets(self._collect_toolset(self.agent))
 
+  async def __aenter__(self):
+    """Async context manager entry."""
+    return self
+
+  async def __aexit__(self, exc_type, exc_val, exc_tb):
+    """Async context manager exit."""
+    await self.close()
+    return False  # Don't suppress exceptions from the async with block
+
 
 class InMemoryRunner(Runner):
   """An in-memory Runner for testing and development.

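The comment added above attributes the CancelledError to toolset.close() running inside a task created by asyncio.wait_for(). The standalone sketch below illustrates that effect; it is not code from the commit. On Python 3.10/3.11, wait_for wraps a plain coroutine in its own Task, while 3.12+ reworked wait_for on top of asyncio.timeout, so the printed result can differ by version:

import asyncio


async def report_task():
  # Returns the task this coroutine actually runs in.
  return asyncio.current_task()


async def main():
  caller = asyncio.current_task()
  # Anything entered earlier in `caller` (e.g. an anyio CancelScope inside an
  # MCP session) would be exited from a different task here on 3.10/3.11.
  inner = await asyncio.wait_for(report_task(), timeout=1.0)
  print("same task:", inner is caller)


asyncio.run(main())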
tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 144 additions & 0 deletions
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
+import sys
 from unittest import mock
 
 from google.adk.agents.llm_agent import LlmAgent
@@ -21,6 +23,7 @@
 from google.adk.evaluation.base_eval_service import InferenceConfig
 from google.adk.evaluation.base_eval_service import InferenceRequest
 from google.adk.evaluation.base_eval_service import InferenceResult
+from google.adk.evaluation.base_eval_service import InferenceStatus
 from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import EvalMetricResult
@@ -361,3 +364,144 @@ def test_generate_final_eval_status_doesn_t_throw_on(eval_service):
       metric_name="metric1", threshold=0.5, eval_status=status
   )
   eval_service._generate_final_eval_status([eval_metric_result])
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(
+    sys.version_info < (3, 10), reason="MCP tool requires Python 3.10+"
+)
+async def test_mcp_stdio_agent_no_runtime_error():
+  """Test that LocalEvalService can handle MCP stdio agents without RuntimeError.
+
+  This is a regression test for GitHub issue #2196:
+  "RuntimeError: Attempted to exit cancel scope in a different task than it was entered in"
+
+  The fix ensures that Runner.close() is called to properly cleanup MCP connections.
+  """
+  import tempfile
+
+  from google.adk.evaluation.local_eval_service import LocalEvalService
+  from google.adk.tools.mcp_tool.mcp_session_manager import StdioConnectionParams
+  from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset
+  from mcp import StdioServerParameters
+
+  # Mock LLM responses to avoid real API calls
+  from tests.unittests.testing_utils import MockModel
+
+  mock_responses = [
+      genai_types.Content(
+          parts=[genai_types.Part(text="Mocked response from test agent")]
+      )
+  ]
+  mock_model = MockModel.create(responses=mock_responses)
+
+  # Create a test agent with MCP stdio toolset and mocked model
+  test_dir = tempfile.mkdtemp()
+  try:
+    agent = LlmAgent(
+        model=mock_model,
+        name="test_mcp_agent",
+        instruction="Test agent for MCP stdio regression test.",
+        tools=[
+            MCPToolset(
+                connection_params=StdioConnectionParams(
+                    server_params=StdioServerParameters(
+                        command="npx",
+                        args=[
+                            "-y",
+                            "@modelcontextprotocol/server-filesystem",
+                            test_dir,
+                        ],
+                    ),
+                    timeout=5,
+                ),
+                tool_filter=["read_file", "list_directory"],
+            )
+        ],
+    )
+
+    # Create a mock eval sets manager that returns an eval case
+    mock_eval_sets_manager = mock.create_autospec(EvalSetsManager)
+    test_eval_case = EvalCase(
+        eval_id="test_mcp_case",
+        conversation=[
+            Invocation(
+                user_content=genai_types.Content(
+                    parts=[genai_types.Part(text="List directory contents")]
+                ),
+                expected_response="",
+            )
+        ],
+    )
+    mock_eval_sets_manager.get_eval_case.return_value = test_eval_case
+    eval_set = EvalSet(
+        eval_set_id="test_set",
+        eval_cases=[test_eval_case],
+    )
+    mock_eval_sets_manager.get_eval_set.return_value = eval_set
+
+    # Create LocalEvalService with MCP agent
+    eval_service = LocalEvalService(
+        root_agent=agent,
+        eval_sets_manager=mock_eval_sets_manager,
+    )
+
+    # Create inference request to actually trigger the code path with the fix
+    inference_request = InferenceRequest(
+        app_name="test_app",
+        eval_set_id="test_set",
+        inference_config=InferenceConfig(parallelism=1),
+    )
+
+    # The main test: actually call perform_inference which will trigger
+    # _generate_inferences_from_root_agent where the fix is located
+
+    # Note: In Python 3.10 and 3.11, there may be asyncio.CancelledError during cleanup
+    # due to anyio cancel scope context violations when MCP toolsets are cleaned up
+    # via asyncio.wait_for() in different task contexts. Python 3.12+ enhanced task
+    # context management (Task.get_context(), improved context propagation) resolves this.
+
+    try:
+      results = []
+      async for result in eval_service.perform_inference(inference_request):
+        results.append(result)
+        # We should get at least one result since we mocked the LLM
+        break
+
+      # Test passes if we get here without the cancel scope RuntimeError
+      # With mocked model, we should get successful inference results
+      assert len(results) >= 1
+
+    except RuntimeError as e:
+      # If we get a RuntimeError about cancel scope, the fix isn't working
+      if "cancel scope" in str(e) and "different task" in str(e):
+        pytest.fail(f"MCP stdio RuntimeError regression detected: {e}")
+      else:
+        # Other RuntimeErrors might be acceptable
+        pass
+    except asyncio.CancelledError as e:
+      # In Python 3.10 and 3.11, anyio cancel scope context violations may manifest as CancelledError
+      # when MCP RequestResponder.__exit__() is called in a different task than __enter__()
+      if (
+          hasattr(e, "args")
+          and len(e.args) > 0
+          and "cancel scope" in str(e.args[0])
+      ):
+        pytest.fail(f"MCP stdio cancel scope error regression detected: {e}")
+      else:
+        # Re-raise other CancelledErrors
+        raise
+    except Exception as e:
+      # Check if this is the specific cancel scope error we're testing for
+      if "cancel scope" in str(e) and "different task" in str(e):
+        pytest.fail(f"MCP stdio RuntimeError regression detected: {e}")
+      # Other exceptions are acceptable for this test
+
+    # The main goal is to ensure the test completes without the specific
+    # RuntimeError about cancel scopes. If we reach here, the fix is working.
+
+  finally:
+    # Cleanup
+    import shutil
+
+    shutil.rmtree(test_dir, ignore_errors=True)
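To run only this regression test locally, one option is the small driver below. This is a hypothetical invocation sketch, not part of the commit; it assumes pytest and pytest-asyncio are installed and that npx can fetch @modelcontextprotocol/server-filesystem:

import sys

import pytest

if __name__ == "__main__":
  # pytest.main returns an exit code; passing it to sys.exit mirrors the CLI.
  sys.exit(
      pytest.main([
          "tests/unittests/evaluation/test_local_eval_service.py",
          "-k", "test_mcp_stdio_agent_no_runtime_error",
          "-v",
      ])
  )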
