Skip to content

Commit 89171a2

Browse files
Merge branch 'main' into fix/tool-call-compat-shim
2 parents 7b19b87 + b826597 commit 89171a2

21 files changed

+1533
-98
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
---
2+
# EXPERIMENTAL: Automated QA validation of PR changes using OpenHands.
3+
#
4+
# Unlike pr-review (which reads diffs and posts code-review comments),
5+
# this workflow actually runs the code — setting up the environment,
6+
# executing tests, exercising changed behavior, and posting a structured
7+
# QA report as a PR comment.
8+
#
9+
# This is an early experiment; expect rough edges. The plugin source is
10+
# pinned to the extensions feature branch while we iterate.
11+
name: QA Changes by OpenHands [experimental]
12+
13+
on:
14+
pull_request:
15+
types: [opened, ready_for_review, labeled, review_requested]
16+
17+
permissions:
18+
contents: read
19+
pull-requests: write
20+
issues: write
21+
22+
jobs:
23+
qa-changes:
24+
# Only run for same-repo PRs (secrets aren't available for forks).
25+
# Trigger conditions mirror pr-review, but use the 'qa-this' label
26+
# and openhands-agent reviewer request.
27+
if: |
28+
github.event.pull_request.head.repo.full_name == github.repository && (
29+
(github.event.action == 'opened' && github.event.pull_request.draft == false && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') ||
30+
(github.event.action == 'ready_for_review' && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') ||
31+
github.event.label.name == 'qa-this' ||
32+
github.event.requested_reviewer.login == 'openhands-agent' ||
33+
github.event.requested_reviewer.login == 'all-hands-bot'
34+
)
35+
concurrency:
36+
group: qa-changes-${{ github.event.pull_request.number }}
37+
cancel-in-progress: true
38+
runs-on: ubuntu-24.04
39+
timeout-minutes: 30
40+
steps:
41+
- name: Run QA Changes
42+
# EXPERIMENTAL: pointing at feature branch while iterating
43+
uses: OpenHands/extensions/plugins/qa-changes@feat/qa-changes-plugin
44+
with:
45+
llm-model: litellm_proxy/claude-sonnet-4-5-20250929
46+
llm-base-url: https://llm-proxy.app.all-hands.dev
47+
max-budget: '10.0'
48+
timeout-minutes: '30'
49+
max-iterations: '500'
50+
# EXPERIMENTAL: use the feature branch of extensions
51+
extensions-version: feat/qa-changes-plugin
52+
llm-api-key: ${{ secrets.LLM_API_KEY }}
53+
github-token: ${{ secrets.PAT_TOKEN }}
54+
lmnr-api-key: ${{ secrets.LMNR_SKILLS_API_KEY }}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
---
2+
name: QA Changes Evaluation [experimental]
3+
4+
# This workflow evaluates how well QA validation performed.
5+
# It runs when a PR is closed to assess QA effectiveness.
6+
#
7+
# Security note: pull_request_target is safe here because this workflow
8+
# never checks out or executes PR code. It only:
9+
# 1. Downloads artifacts produced by a trusted workflow run
10+
# 2. Runs evaluation scripts from the extensions repo (main/pinned branch)
11+
12+
on:
13+
pull_request_target:
14+
types: [closed]
15+
16+
permissions:
17+
contents: read
18+
pull-requests: read
19+
20+
jobs:
21+
evaluate:
22+
runs-on: ubuntu-24.04
23+
env:
24+
PR_NUMBER: ${{ github.event.pull_request.number }}
25+
REPO_NAME: ${{ github.repository }}
26+
PR_MERGED: ${{ github.event.pull_request.merged }}
27+
28+
steps:
29+
- name: Download QA trace artifact
30+
id: download-trace
31+
uses: dawidd6/action-download-artifact@v19
32+
continue-on-error: true
33+
with:
34+
workflow: qa-changes-by-openhands.yml
35+
name: qa-changes-trace-${{ github.event.pull_request.number }}
36+
path: trace-info
37+
search_artifacts: true
38+
if_no_artifact_found: warn
39+
40+
- name: Check if trace file exists
41+
id: check-trace
42+
run: |
43+
if [ -f "trace-info/laminar_trace_info.json" ]; then
44+
echo "trace_exists=true" >> $GITHUB_OUTPUT
45+
echo "Found trace file for PR #$PR_NUMBER"
46+
else
47+
echo "trace_exists=false" >> $GITHUB_OUTPUT
48+
echo "No trace file found for PR #$PR_NUMBER - skipping evaluation"
49+
fi
50+
51+
# EXPERIMENTAL: pinned to feature branch while qa-changes plugin is in development.
52+
# Switch to @main (and remove ref:) once the plugin is merged.
53+
- name: Checkout extensions repository
54+
if: steps.check-trace.outputs.trace_exists == 'true'
55+
uses: actions/checkout@v6
56+
with:
57+
repository: OpenHands/extensions
58+
ref: feat/qa-changes-plugin
59+
path: extensions
60+
61+
- name: Set up Python
62+
if: steps.check-trace.outputs.trace_exists == 'true'
63+
uses: actions/setup-python@v6
64+
with:
65+
python-version: '3.12'
66+
67+
- name: Install dependencies
68+
if: steps.check-trace.outputs.trace_exists == 'true'
69+
run: pip install lmnr
70+
71+
- name: Run evaluation
72+
if: steps.check-trace.outputs.trace_exists == 'true'
73+
env:
74+
# Script expects LMNR_PROJECT_API_KEY; org secret is named LMNR_SKILLS_API_KEY
75+
LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_SKILLS_API_KEY }}
76+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
77+
run: |
78+
python extensions/plugins/qa-changes/scripts/evaluate_qa_changes.py \
79+
--trace-file trace-info/laminar_trace_info.json
80+
81+
- name: Upload evaluation logs
82+
uses: actions/upload-artifact@v7
83+
if: always() && steps.check-trace.outputs.trace_exists == 'true'
84+
with:
85+
name: qa-changes-evaluation-${{ github.event.pull_request.number }}
86+
path: '*.log'
87+
retention-days: 30

openhands-agent-server/AGENTS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,16 @@ This package lives in the monorepo root. Typical commands (run from repo root):
1414
When adding non-Python files (JS, templates, etc.) loaded at runtime, add them to `openhands-agent-server/openhands/agent_server/agent-server.spec` using `collect_data_files`.
1515

1616

17+
## Live server integration tests
18+
19+
Small endpoint additions or changes to server behaviour should be covered by a
20+
test in `tests/cross/test_remote_conversation_live_server.py`. These tests spin
21+
up a real FastAPI server with a patched LLM and exercise the full HTTP / WebSocket
22+
stack end-to-end. Add or extend a test there whenever the change is localised
23+
enough that a single new test function (or a few assertions added to an existing
24+
test) captures the expected behaviour.
25+
26+
1727
## Concurrency / async safety
1828

1929
- `ConversationState` uses a synchronous `FIFOLock`. In async agent-server code, never do `with conversation._state` directly on the event loop when the conversation may be running.

openhands-agent-server/openhands/agent_server/conversation_router.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
)
1313
from openhands.agent_server.dependencies import get_conversation_service
1414
from openhands.agent_server.models import (
15+
AgentResponseResult,
1516
AskAgentRequest,
1617
AskAgentResponse,
1718
ConversationInfo,
@@ -113,6 +114,27 @@ async def get_conversation(
113114
return conversation
114115

115116

117+
@conversation_router.get(
118+
"/{conversation_id}/agent_final_response",
119+
responses={404: {"description": "Conversation not found"}},
120+
)
121+
async def get_conversation_agent_final_response(
122+
conversation_id: UUID,
123+
conversation_service: ConversationService = Depends(get_conversation_service),
124+
) -> AgentResponseResult:
125+
"""Get the agent's final response for a conversation.
126+
127+
Returns the text of the last agent finish message (FinishAction) or
128+
the last agent text response (MessageEvent). Returns an empty string
129+
if the agent has not produced a final response yet.
130+
"""
131+
event_service = await conversation_service.get_event_service(conversation_id)
132+
if event_service is None:
133+
raise HTTPException(status.HTTP_404_NOT_FOUND)
134+
response = await event_service.get_agent_final_response()
135+
return AgentResponseResult(response=response)
136+
137+
116138
@conversation_router.get("")
117139
async def batch_get_conversations(
118140
ids: Annotated[list[UUID], Query()],

openhands-agent-server/openhands/agent_server/event_service.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from openhands.agent_server.pub_sub import PubSub, Subscriber
1414
from openhands.sdk import LLM, AgentBase, Event, Message, get_logger
1515
from openhands.sdk.conversation.impl.local_conversation import LocalConversation
16+
from openhands.sdk.conversation.response_utils import get_agent_final_response
1617
from openhands.sdk.conversation.secret_registry import SecretValue
1718
from openhands.sdk.conversation.state import (
1819
ConversationExecutionStatus,
@@ -705,6 +706,28 @@ async def condense(self) -> None:
705706
loop = asyncio.get_running_loop()
706707
return await loop.run_in_executor(None, self._conversation.condense)
707708

709+
def _get_agent_final_response_sync(self) -> str:
710+
"""Extract the agent's final response from the conversation events.
711+
712+
Reads directly from the EventLog without acquiring the state lock.
713+
EventLog reads are safe without the FIFOLock because events are
714+
append-only and immutable once written.
715+
"""
716+
if not self._conversation:
717+
raise ValueError("inactive_service")
718+
return get_agent_final_response(self._conversation._state.events)
719+
720+
async def get_agent_final_response(self) -> str:
721+
"""Extract the agent's final response from the conversation events.
722+
723+
Returns the text from the last FinishAction or agent MessageEvent,
724+
or empty string if no final response is found.
725+
"""
726+
if not self._conversation:
727+
raise ValueError("inactive_service")
728+
loop = asyncio.get_running_loop()
729+
return await loop.run_in_executor(None, self._get_agent_final_response_sync)
730+
708731
async def get_state(self) -> ConversationState:
709732
if not self._conversation:
710733
raise ValueError("inactive_service")

openhands-agent-server/openhands/agent_server/models.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,22 @@ class AskAgentResponse(BaseModel):
459459
response: str = Field(description="The agent's response to the question")
460460

461461

462+
class AgentResponseResult(BaseModel):
463+
"""The agent's final response for a conversation.
464+
465+
Contains the text of the last agent finish message or text response.
466+
Empty string if the agent has not produced a final response yet.
467+
"""
468+
469+
response: str = Field(
470+
description=(
471+
"The agent's final response text. Extracted from either a "
472+
"FinishAction message or the last agent MessageEvent. "
473+
"Empty string if no final response is available."
474+
)
475+
)
476+
477+
462478
class BashEventBase(DiscriminatedUnionMixin, ABC):
463479
"""Base class for all bash event types"""
464480

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,25 @@
11
import re
2+
from typing import Final
23

34

4-
CMD_OUTPUT_PS1_BEGIN = "\n###PS1JSON###\n"
5-
CMD_OUTPUT_PS1_END = "\n###PS1END###"
5+
CMD_OUTPUT_PS1_BEGIN: Final[str] = "\n###PS1JSON###\n"
6+
CMD_OUTPUT_PS1_END: Final[str] = "\n###PS1END###"
67
# Regex to match PS1 metadata blocks. Uses negative lookahead to handle corruption
78
# scenarios where concurrent output causes nested ###PS1JSON### markers. This ensures
89
# we match only the LAST ###PS1JSON### before each ###PS1END###.
9-
CMD_OUTPUT_METADATA_PS1_REGEX = re.compile(
10+
CMD_OUTPUT_METADATA_PS1_REGEX: Final[re.Pattern[str]] = re.compile(
1011
rf"^{CMD_OUTPUT_PS1_BEGIN.strip()}((?:(?!{CMD_OUTPUT_PS1_BEGIN.strip()}).)*?){CMD_OUTPUT_PS1_END.strip()}",
1112
re.DOTALL | re.MULTILINE,
1213
)
1314

1415
# Default max size for command output content
1516
# to prevent too large observations from being saved in the stream
1617
# This matches the default max_message_chars in LLM class
17-
MAX_CMD_OUTPUT_SIZE: int = 30000
18+
MAX_CMD_OUTPUT_SIZE: Final[int] = 30000
1819

1920

2021
# Common timeout message that can be used across different timeout scenarios
21-
TIMEOUT_MESSAGE_TEMPLATE = (
22+
TIMEOUT_MESSAGE_TEMPLATE: Final[str] = (
2223
"You may wait longer to see additional output by sending empty command '', "
2324
"send other commands to interact with the current process, send keys "
2425
'("C-c", "C-z", "C-d") '
@@ -27,8 +28,13 @@
2728
)
2829

2930
# How long to wait with no new output before considering it a no-change timeout
30-
NO_CHANGE_TIMEOUT_SECONDS = 30
31+
NO_CHANGE_TIMEOUT_SECONDS: Final[int] = 30
3132

3233
# How often to poll for new output in seconds
33-
POLL_INTERVAL = 0.5
34-
HISTORY_LIMIT = 10_000
34+
POLL_INTERVAL: Final[float] = 0.5
35+
HISTORY_LIMIT: Final[int] = 10_000
36+
37+
# Tmux session dimensions (columns x rows).
38+
# Large values ensure output is not wrapped or truncated by the virtual terminal.
39+
TMUX_SESSION_WIDTH: Final[int] = 1000
40+
TMUX_SESSION_HEIGHT: Final[int] = 1000

openhands-tools/openhands/tools/terminal/definition.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from openhands.sdk.llm import ImageContent, TextContent
1515
from openhands.sdk.tool import (
1616
Action,
17+
DeclaredResources,
1718
Observation,
1819
ToolAnnotations,
1920
ToolDefinition,
@@ -234,6 +235,17 @@ def visualize(self) -> Text:
234235
class TerminalTool(ToolDefinition[TerminalAction, TerminalObservation]):
235236
"""A ToolDefinition subclass that automatically initializes a TerminalExecutor with auto-detection.""" # noqa: E501
236237

238+
def declared_resources(self, action: Action) -> DeclaredResources: # noqa: ARG002
239+
# When using the tmux backend, TmuxPanePool handles concurrency
240+
# internally via pane-level isolation — opt out of framework
241+
# serialization so parallel calls are allowed.
242+
# When using the subprocess backend there is only a single
243+
# session, so we declare a resource key to serialize terminal
244+
# calls against each other without blocking unrelated tools.
245+
if getattr(self.executor, "is_pooled", False):
246+
return DeclaredResources(keys=(), declared=True)
247+
return DeclaredResources(keys=("terminal:session",), declared=True)
248+
237249
@classmethod
238250
def create(
239251
cls,

0 commit comments

Comments
 (0)