OpenHands
diff --git a/‎.github/workflows/qa-changes-by-openhands.yml‎
Lines changed: 54 additions & 0 deletions b/‎.github/workflows/qa-changes-by-openhands.yml‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎.github/workflows/qa-changes-evaluation.yml‎
Lines changed: 87 additions & 0 deletions b/‎.github/workflows/qa-changes-evaluation.yml‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎openhands-agent-server/AGENTS.md‎
Lines changed: 10 additions & 0 deletions b/‎openhands-agent-server/AGENTS.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎openhands-agent-server/openhands/agent_server/conversation_router.py‎
Lines changed: 22 additions & 0 deletions b/‎openhands-agent-server/openhands/agent_server/conversation_router.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎openhands-agent-server/openhands/agent_server/event_service.py‎
Lines changed: 23 additions & 0 deletions b/‎openhands-agent-server/openhands/agent_server/event_service.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎openhands-agent-server/openhands/agent_server/models.py‎
Lines changed: 16 additions & 0 deletions b/‎openhands-agent-server/openhands/agent_server/models.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎openhands-tools/openhands/tools/terminal/constants.py‎
Lines changed: 14 additions & 8 deletions b/‎openhands-tools/openhands/tools/terminal/constants.py‎
Lines changed: 14 additions & 8 deletions
diff --git a/‎openhands-tools/openhands/tools/terminal/definition.py‎
Lines changed: 12 additions & 0 deletions b/‎openhands-tools/openhands/tools/terminal/definition.py‎
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,54 @@
+---
+# EXPERIMENTAL: Automated QA validation of PR changes using OpenHands.
+#
+# Unlike pr-review (which reads diffs and posts code-review comments),
+# this workflow actually runs the code — setting up the environment,
+# executing tests, exercising changed behavior, and posting a structured
+# QA report as a PR comment.
+#
+# This is an early experiment; expect rough edges.  The plugin source is
+# pinned to the extensions feature branch while we iterate.
+name: QA Changes by OpenHands [experimental]
+
+on:
+    pull_request:
+        types: [opened, ready_for_review, labeled, review_requested]
+
+permissions:
+    contents: read
+    pull-requests: write
+    issues: write
+
+jobs:
+    qa-changes:
+        # Only run for same-repo PRs (secrets aren't available for forks).
+        # Trigger conditions mirror pr-review, but use the 'qa-this' label
+        # and a review request for openhands-agent or all-hands-bot.
+        if: |
+            github.event.pull_request.head.repo.full_name == github.repository && (
+                (github.event.action == 'opened' && github.event.pull_request.draft == false && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') ||
+                (github.event.action == 'ready_for_review' && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') ||
+                github.event.label.name == 'qa-this' ||
+                github.event.requested_reviewer.login == 'openhands-agent' ||
+                github.event.requested_reviewer.login == 'all-hands-bot'
+            )
+        concurrency:
+            group: qa-changes-${{ github.event.pull_request.number }}
+            cancel-in-progress: true
+        runs-on: ubuntu-24.04
+        timeout-minutes: 30
+        steps:
+            - name: Run QA Changes
+              # EXPERIMENTAL: pointing at feature branch while iterating
+              uses: OpenHands/extensions/plugins/qa-changes@feat/qa-changes-plugin
+              with:
+                  llm-model: litellm_proxy/claude-sonnet-4-5-20250929
+                  llm-base-url: https://llm-proxy.app.all-hands.dev
+                  max-budget: '10.0'
+                  timeout-minutes: '30'
+                  max-iterations: '500'
+                  # EXPERIMENTAL: use the feature branch of extensions
+                  extensions-version: feat/qa-changes-plugin
+                  llm-api-key: ${{ secrets.LLM_API_KEY }}
+                  github-token: ${{ secrets.PAT_TOKEN }}
+                  lmnr-api-key: ${{ secrets.LMNR_SKILLS_API_KEY }}
@@ -0,0 +1,87 @@
+---
+name: QA Changes Evaluation [experimental]
+
+# This workflow evaluates how well QA validation performed.
+# It runs when a PR is closed to assess QA effectiveness.
+#
+# Security note: pull_request_target is safe here because this workflow
+# never checks out or executes PR code. It only:
+# 1. Downloads artifacts produced by a trusted workflow run
+# 2. Runs evaluation scripts from the extensions repo (feat/qa-changes-plugin for now, main once merged)
+
+on:
+    pull_request_target:
+        types: [closed]
+
+permissions:
+    contents: read
+    pull-requests: read
+
+jobs:
+    evaluate:
+        runs-on: ubuntu-24.04
+        env:
+            PR_NUMBER: ${{ github.event.pull_request.number }}
+            REPO_NAME: ${{ github.repository }}
+            PR_MERGED: ${{ github.event.pull_request.merged }}
+
+        steps:
+            - name: Download QA trace artifact
+              id: download-trace
+              uses: dawidd6/action-download-artifact@v19
+              continue-on-error: true
+              with:
+                  workflow: qa-changes-by-openhands.yml
+                  name: qa-changes-trace-${{ github.event.pull_request.number }}
+                  path: trace-info
+                  search_artifacts: true
+                  if_no_artifact_found: warn
+
+            - name: Check if trace file exists
+              id: check-trace
+              run: |
+                  if [ -f "trace-info/laminar_trace_info.json" ]; then
+                    echo "trace_exists=true" >> $GITHUB_OUTPUT
+                    echo "Found trace file for PR #$PR_NUMBER"
+                  else
+                    echo "trace_exists=false" >> $GITHUB_OUTPUT
+                    echo "No trace file found for PR #$PR_NUMBER - skipping evaluation"
+                  fi
+
+            # EXPERIMENTAL: pinned to feature branch while qa-changes plugin is in development.
+            # Remove 'ref:' (to check out the default branch) once the plugin is merged.
+            - name: Checkout extensions repository
+              if: steps.check-trace.outputs.trace_exists == 'true'
+              uses: actions/checkout@v6
+              with:
+                  repository: OpenHands/extensions
+                  ref: feat/qa-changes-plugin
+                  path: extensions
+
+            - name: Set up Python
+              if: steps.check-trace.outputs.trace_exists == 'true'
+              uses: actions/setup-python@v6
+              with:
+                  python-version: '3.12'
+
+            - name: Install dependencies
+              if: steps.check-trace.outputs.trace_exists == 'true'
+              run: pip install lmnr
+
+            - name: Run evaluation
+              if: steps.check-trace.outputs.trace_exists == 'true'
+              env:
+                  # Script expects LMNR_PROJECT_API_KEY; org secret is named LMNR_SKILLS_API_KEY
+                  LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_SKILLS_API_KEY }}
+                  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+              run: |
+                  python extensions/plugins/qa-changes/scripts/evaluate_qa_changes.py \
+                      --trace-file trace-info/laminar_trace_info.json
+
+            - name: Upload evaluation logs
+              uses: actions/upload-artifact@v7
+              if: always() && steps.check-trace.outputs.trace_exists == 'true'
+              with:
+                  name: qa-changes-evaluation-${{ github.event.pull_request.number }}
+                  path: '*.log'
+                  retention-days: 30
@@ -14,6 +14,16 @@ This package lives in the monorepo root. Typical commands (run from repo root):
 When adding non-Python files (JS, templates, etc.) loaded at runtime, add them to `openhands-agent-server/openhands/agent_server/agent-server.spec` using `collect_data_files`.
 
 
+## Live server integration tests
+
+Small endpoint additions or changes to server behaviour should be covered by a
+test in `tests/cross/test_remote_conversation_live_server.py`.  These tests spin
+up a real FastAPI server with a patched LLM and exercise the full HTTP / WebSocket
+stack end-to-end.  Add or extend a test there whenever the change is localised
+enough that a single new test function (or a few assertions added to an existing
+test) captures the expected behaviour.
+
+
 ## Concurrency / async safety
 
 - `ConversationState` uses a synchronous `FIFOLock`. In async agent-server code, never do `with conversation._state` directly on the event loop when the conversation may be running.
 
@@ -12,6 +12,7 @@
 )
 from openhands.agent_server.dependencies import get_conversation_service
 from openhands.agent_server.models import (
+    AgentResponseResult,
     AskAgentRequest,
     AskAgentResponse,
     ConversationInfo,
@@ -113,6 +114,27 @@ async def get_conversation(
     return conversation
 
 
+@conversation_router.get(
+    "/{conversation_id}/agent_final_response",
+    responses={404: {"description": "Conversation not found"}},
+)
+async def get_conversation_agent_final_response(
+    conversation_id: UUID,
+    conversation_service: ConversationService = Depends(get_conversation_service),
+) -> AgentResponseResult:
+    """Get the agent's final response for a conversation.
+
+    Returns the text of the last agent finish message (FinishAction) or
+    the last agent text response (MessageEvent). Returns an empty string
+    if the agent has not produced a final response yet.
+
+    Args:
+        conversation_id: ID of the conversation to inspect.
+        conversation_service: Service used to look up the conversation's
+            event service; resolved via ``Depends(get_conversation_service)``.
+
+    Returns:
+        An ``AgentResponseResult`` wrapping the response text.
+
+    Raises:
+        HTTPException: 404 when no event service is found for
+            ``conversation_id``.
+    """
+    event_service = await conversation_service.get_event_service(conversation_id)
+    if event_service is None:
+        raise HTTPException(status.HTTP_404_NOT_FOUND)
+    # NOTE(review): the event service's get_agent_final_response() raises
+    # ValueError("inactive_service") when it has no conversation; that error
+    # is not translated to an HTTP status here -- confirm this is intended.
+    response = await event_service.get_agent_final_response()
+    return AgentResponseResult(response=response)
+
+
 @conversation_router.get("")
 async def batch_get_conversations(
     ids: Annotated[list[UUID], Query()],
 
@@ -13,6 +13,7 @@
 from openhands.agent_server.pub_sub import PubSub, Subscriber
 from openhands.sdk import LLM, AgentBase, Event, Message, get_logger
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
+from openhands.sdk.conversation.response_utils import get_agent_final_response
 from openhands.sdk.conversation.secret_registry import SecretValue
 from openhands.sdk.conversation.state import (
     ConversationExecutionStatus,
@@ -705,6 +706,28 @@ async def condense(self) -> None:
         loop = asyncio.get_running_loop()
         return await loop.run_in_executor(None, self._conversation.condense)
 
+    def _get_agent_final_response_sync(self) -> str:
+        """Extract the agent's final response from the conversation events.
+
+        Reads directly from the EventLog without acquiring the state lock.
+        EventLog reads are safe without the FIFOLock because events are
+        append-only and immutable once written.
+
+        Returns:
+            The value produced by ``get_agent_final_response`` for the
+            conversation's events.
+
+        Raises:
+            ValueError: ``"inactive_service"`` when this service has no
+                conversation.
+        """
+        if not self._conversation:
+            raise ValueError("inactive_service")
+        return get_agent_final_response(self._conversation._state.events)
+
+    async def get_agent_final_response(self) -> str:
+        """Extract the agent's final response from the conversation events.
+
+        Returns the text from the last FinishAction or agent MessageEvent,
+        or empty string if no final response is found.
+
+        Raises:
+            ValueError: ``"inactive_service"`` when this service has no
+                conversation.
+        """
+        if not self._conversation:
+            raise ValueError("inactive_service")
+        # Run the synchronous event read in the loop's default executor
+        # (executor=None) so it does not execute on the event loop itself.
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._get_agent_final_response_sync)
+
     async def get_state(self) -> ConversationState:
         if not self._conversation:
             raise ValueError("inactive_service")
 
@@ -459,6 +459,22 @@ class AskAgentResponse(BaseModel):
     response: str = Field(description="The agent's response to the question")
 
 
+class AgentResponseResult(BaseModel):
+    """The agent's final response for a conversation.
+
+    Contains the text of the last agent finish message or text response.
+    Empty string if the agent has not produced a final response yet.
+
+    Returned by the ``GET /{conversation_id}/agent_final_response`` route.
+    """
+
+    # Required field: no default is declared, so callers must always pass
+    # a string (possibly empty).
+    response: str = Field(
+        description=(
+            "The agent's final response text. Extracted from either a "
+            "FinishAction message or the last agent MessageEvent. "
+            "Empty string if no final response is available."
+        )
+    )
+
+
 class BashEventBase(DiscriminatedUnionMixin, ABC):
     """Base class for all bash event types"""
 
 
@@ -1,24 +1,25 @@
 import re
+from typing import Final
 
 
-CMD_OUTPUT_PS1_BEGIN = "\n###PS1JSON###\n"
-CMD_OUTPUT_PS1_END = "\n###PS1END###"
+CMD_OUTPUT_PS1_BEGIN: Final[str] = "\n###PS1JSON###\n"
+CMD_OUTPUT_PS1_END: Final[str] = "\n###PS1END###"
 # Regex to match PS1 metadata blocks. Uses negative lookahead to handle corruption
 # scenarios where concurrent output causes nested ###PS1JSON### markers. This ensures
 # we match only the LAST ###PS1JSON### before each ###PS1END###.
-CMD_OUTPUT_METADATA_PS1_REGEX = re.compile(
+CMD_OUTPUT_METADATA_PS1_REGEX: Final[re.Pattern[str]] = re.compile(
     rf"^{CMD_OUTPUT_PS1_BEGIN.strip()}((?:(?!{CMD_OUTPUT_PS1_BEGIN.strip()}).)*?){CMD_OUTPUT_PS1_END.strip()}",
     re.DOTALL | re.MULTILINE,
 )
 
 # Default max size for command output content
 # to prevent too large observations from being saved in the stream
 # This matches the default max_message_chars in LLM class
-MAX_CMD_OUTPUT_SIZE: int = 30000
+MAX_CMD_OUTPUT_SIZE: Final[int] = 30000
 
 
 # Common timeout message that can be used across different timeout scenarios
-TIMEOUT_MESSAGE_TEMPLATE = (
+TIMEOUT_MESSAGE_TEMPLATE: Final[str] = (
     "You may wait longer to see additional output by sending empty command '', "
     "send other commands to interact with the current process, send keys "
     '("C-c", "C-z", "C-d") '
@@ -27,8 +28,13 @@
 )
 
 # How long to wait with no new output before considering it a no-change timeout
-NO_CHANGE_TIMEOUT_SECONDS = 30
+NO_CHANGE_TIMEOUT_SECONDS: Final[int] = 30
 
 # How often to poll for new output in seconds
-POLL_INTERVAL = 0.5
-HISTORY_LIMIT = 10_000
+POLL_INTERVAL: Final[float] = 0.5
+HISTORY_LIMIT: Final[int] = 10_000
+
+# Tmux session dimensions (columns x rows).
+# Large values ensure output is not wrapped or truncated by the virtual terminal.
+TMUX_SESSION_WIDTH: Final[int] = 1000
+TMUX_SESSION_HEIGHT: Final[int] = 1000
@@ -14,6 +14,7 @@
 from openhands.sdk.llm import ImageContent, TextContent
 from openhands.sdk.tool import (
     Action,
+    DeclaredResources,
     Observation,
     ToolAnnotations,
     ToolDefinition,
@@ -234,6 +235,17 @@ def visualize(self) -> Text:
 class TerminalTool(ToolDefinition[TerminalAction, TerminalObservation]):
     """A ToolDefinition subclass that automatically initializes a TerminalExecutor with auto-detection."""  # noqa: E501
 
+    def declared_resources(self, action: Action) -> DeclaredResources:  # noqa: ARG002
+        """Declare which shared resources a terminal call needs.
+
+        Args:
+            action: The action about to run; not consulted by this
+                implementation.
+
+        Returns:
+            ``DeclaredResources`` with no keys when the executor reports a
+            truthy ``is_pooled``; otherwise one with the single key
+            ``"terminal:session"``. ``declared`` is ``True`` in both cases.
+        """
+        # When using the tmux backend, TmuxPanePool handles concurrency
+        # internally via pane-level isolation — opt out of framework
+        # serialization so parallel calls are allowed.
+        # When using the subprocess backend there is only a single
+        # session, so we declare a resource key to serialize terminal
+        # calls against each other without blocking unrelated tools.
+        # getattr with a False default: executors that do not define
+        # ``is_pooled`` take the serialized path.
+        if getattr(self.executor, "is_pooled", False):
+            return DeclaredResources(keys=(), declared=True)
+        return DeclaredResources(keys=("terminal:session",), declared=True)
+
     @classmethod
     def create(
         cls,