diff --git a/plugins/pr-review/README.md b/plugins/pr-review/README.md index 312255e..f75e86d 100644 --- a/plugins/pr-review/README.md +++ b/plugins/pr-review/README.md @@ -24,6 +24,7 @@ Then configure the required secrets (see [Installation](#installation) below). - **A/B Testing**: Support for testing multiple LLM models - **Review Context Awareness**: Considers previous reviews and unresolved threads - **Evidence Enforcement**: Optional check that PR descriptions include concrete end-to-end proof the code works, not just test output +- **Sub-Agent Delegation** *(Experimental)*: Split large PR reviews across multiple sub-agents, one per file, then consolidate findings (see [Known Limitations](#known-limitations-sub-agent-delegation)) - **Observability**: Optional Laminar integration for tracing and evaluation ## Plugin Contents @@ -141,12 +142,24 @@ PR reviews are automatically triggered when: | `llm-base-url` | No | `''` | Custom LLM endpoint URL | | `review-style` | No | `roasted` | **[DEPRECATED]** Previously chose between `standard` and `roasted` review styles. Now ignored — the styles have been merged into a single unified skill. | | `require-evidence` | No | `'false'` | Require the reviewer to enforce an `Evidence` section in the PR description with end-to-end proof: screenshots/videos for frontend work, commands and runtime output for backend or scripts, and an agent conversation link when applicable. Test output alone does not qualify. | +| `use-sub-agents` | No | `'false'` | **(Experimental)** Enable sub-agent delegation for file-level reviews. The main agent acts as a coordinator that delegates per-file review work to `file_reviewer` sub-agents via the SDK TaskToolSet, then consolidates findings into a single PR review. Useful for large PRs with many changed files. | | `extensions-repo` | No | `OpenHands/extensions` | Extensions repository | | `extensions-version` | No | `main` | Git ref (tag, branch, or SHA) | | `llm-api-key` | Yes | - | LLM API key | | `github-token` | Yes | - | GitHub token for API access | | `lmnr-api-key` | No | `''` | Laminar API key for observability | +## Known Limitations: Sub-Agent Delegation + +The `use-sub-agents` feature is **experimental** and has the following known constraints: + +- **LLM-driven JSON parsing**: The coordinator agent relies on the LLM to parse and merge JSON responses from sub-agents. There is no code-level validation of sub-agent output, so malformed responses may cause incomplete reviews. +- **Potential information loss during consolidation**: When merging findings from multiple sub-agents, the coordinator may lose or deduplicate findings imperfectly, especially for cross-file issues. +- **No integration tests yet**: Current test coverage verifies prompt formatting only. End-to-end validation of the delegation flow requires manual workflow testing. +- **Sub-agents have read-only tools**: File reviewer sub-agents have access to `terminal` and `file_editor` for inspecting full source files and surrounding context, but they cannot query the GitHub API or post reviews — only the coordinator handles GitHub interaction. + +These limitations are acceptable for an opt-in experimental feature and will be addressed as the feature matures. + ## A/B Testing Multiple Models Test different LLM models by providing a comma-separated list: diff --git a/plugins/pr-review/action.yml b/plugins/pr-review/action.yml index 78dabdf..898eed4 100644 --- a/plugins/pr-review/action.yml +++ b/plugins/pr-review/action.yml @@ -27,6 +27,13 @@ inputs: description: "When true, require the reviewer to check the PR description for an Evidence section proving the code works end-to-end (screenshots/videos for frontend changes; commands and runtime output for backend, CLI, or script changes; conversation link when agent-generated). Test output alone does not count." required: false default: 'false' + use-sub-agents: + description: > + Enable sub-agent delegation for file-level reviews (experimental). + When true, the agent gets the TaskToolSet and decides at runtime + whether to delegate based on diff size and complexity. + required: false + default: 'false' extensions-repo: description: GitHub repository for extensions (owner/repo) required: false @@ -125,6 +132,7 @@ runs: LLM_BASE_URL: ${{ inputs.llm-base-url }} REVIEW_STYLE: ${{ inputs.review-style }} REQUIRE_EVIDENCE: ${{ inputs.require-evidence }} + USE_SUB_AGENTS: ${{ inputs.use-sub-agents }} LLM_API_KEY: ${{ inputs.llm-api-key }} GITHUB_TOKEN: ${{ inputs.github-token }} LMNR_PROJECT_API_KEY: ${{ inputs.lmnr-api-key }} diff --git a/plugins/pr-review/scripts/agent_script.py b/plugins/pr-review/scripts/agent_script.py index f926f7f..d356493 100644 --- a/plugins/pr-review/scripts/agent_script.py +++ b/plugins/pr-review/scripts/agent_script.py @@ -32,6 +32,11 @@ REPO_NAME: Repository name in format owner/repo (required) REQUIRE_EVIDENCE: Whether to require PR description evidence showing the code works ('true'/'false', default: 'false') + USE_SUB_AGENTS: Enable sub-agent delegation for file-level reviews + ('true'/'false', default: 'false'). When enabled, the main agent acts + as a coordinator that delegates per-file review work to + file_reviewer sub-agents via the TaskToolSet, then consolidates + findings into a single GitHub PR review. For setup instructions, usage examples, and GitHub Actions integration, see README.md in this directory. @@ -50,18 +55,29 @@ from typing import Any from lmnr import Laminar -from openhands.sdk import LLM, Agent, AgentContext, Conversation, get_logger +from openhands.sdk import ( + LLM, + Agent, + AgentContext, + Conversation, + Tool, + get_logger, + register_agent, +) +from openhands.sdk.context import Skill from openhands.sdk.context.skills import load_project_skills from openhands.sdk.conversation import get_agent_final_response from openhands.sdk.git.utils import run_git_command from openhands.sdk.plugin import PluginSource +from openhands.tools.delegate import DelegationVisualizer from openhands.tools.preset.default import get_default_condenser, get_default_tools +from openhands.tools.task import TaskToolSet # Add the script directory to Python path so we can import prompt.py script_dir = Path(__file__).parent sys.path.insert(0, str(script_dir)) -from prompt import format_prompt # noqa: E402 +from prompt import FILE_REVIEWER_SKILL, format_prompt # noqa: E402 logger = get_logger(__name__) @@ -728,6 +744,7 @@ def validate_environment() -> dict[str, Any]: "model": os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), "base_url": os.getenv("LLM_BASE_URL"), "require_evidence": _get_bool_env("REQUIRE_EVIDENCE"), + "use_sub_agents": _get_bool_env("USE_SUB_AGENTS"), "pr_info": { "number": os.getenv("PR_NUMBER"), "title": os.getenv("PR_TITLE"), @@ -763,6 +780,47 @@ def fetch_pr_context(pr_number: str) -> tuple[str, str, str]: return pr_diff, commit_id, review_context +def _create_file_reviewer_agent(llm: LLM) -> Agent: + """Factory for file_reviewer sub-agents used during delegation. + + Each sub-agent receives a skill that defines its review persona and + expected output format. It has read-only terminal and file_editor + access so it can inspect surrounding code context in the PR repo, + but the coordinator handles all GitHub API interaction. + """ + skills = [ + Skill( + name="file_review_instructions", + content=FILE_REVIEWER_SKILL, + trigger=None, + ), + ] + return Agent( + llm=llm, + tools=[ + Tool(name="terminal"), + Tool(name="file_editor"), + ], + agent_context=AgentContext(skills=skills), + ) + + +def _register_sub_agents() -> None: + """Register the file_reviewer agent type. + + TaskToolSet auto-registers on import, so no explicit + ``register_tool()`` call is needed. + """ + register_agent( + name="file_reviewer", + factory_func=_create_file_reviewer_agent, + description=( + "Reviews one or more files from a PR diff and returns structured " + "findings as a JSON array." + ), + ) + + def create_conversation( config: dict[str, Any], secrets: dict[str, str], @@ -773,6 +831,9 @@ def create_conversation( handles wiring skills, MCP config, and hooks automatically. Project-specific skills from the workspace are loaded separately. + When ``config["use_sub_agents"]`` is True the coordinator agent is + given the TaskToolSet so it can delegate to file_reviewer sub-agents. + Args: config: Configuration dictionary from validate_environment() secrets: Secrets to mask in output @@ -804,9 +865,17 @@ def create_conversation( skills=project_skills, ) + tools = get_default_tools(enable_browser=False) + + use_sub_agents = config.get("use_sub_agents", False) + if use_sub_agents: + _register_sub_agents() + tools.append(Tool(name=TaskToolSet.name)) + logger.info("Sub-agent delegation enabled — TaskToolSet added") + agent = Agent( llm=llm, - tools=get_default_tools(enable_browser=False), + tools=tools, agent_context=agent_context, system_prompt_kwargs={"cli_mode": True}, condenser=get_default_condenser( @@ -816,12 +885,18 @@ def create_conversation( # The plugin directory is the parent of the scripts/ directory plugin_dir = script_dir.parent # plugins/pr-review/ - return Conversation( - agent=agent, - workspace=cwd, - secrets=secrets, - plugins=[PluginSource(source=str(plugin_dir))], - ) + conversation_kwargs: dict[str, Any] = { + "agent": agent, + "workspace": cwd, + "secrets": secrets, + "plugins": [PluginSource(source=str(plugin_dir))], + } + if use_sub_agents: + conversation_kwargs["visualizer"] = DelegationVisualizer( + name="PR Review Coordinator" + ) + + return Conversation(**conversation_kwargs) def run_review( @@ -930,9 +1005,11 @@ def main(): config = validate_environment() pr_info = config["pr_info"] require_evidence = config["require_evidence"] + use_sub_agents = config["use_sub_agents"] logger.info(f"Reviewing PR #{pr_info['number']}: {pr_info['title']}") logger.info(f"Require PR evidence: {require_evidence}") + logger.info(f"Sub-agent delegation: {use_sub_agents}") try: pr_diff, commit_id, review_context = fetch_pr_context(pr_info["number"]) @@ -952,6 +1029,7 @@ def main(): diff=pr_diff, review_context=review_context, require_evidence=require_evidence, + use_sub_agents=use_sub_agents, ) secrets = {} diff --git a/plugins/pr-review/scripts/prompt.py b/plugins/pr-review/scripts/prompt.py index 1a3d2e6..4f33629 100644 --- a/plugins/pr-review/scripts/prompt.py +++ b/plugins/pr-review/scripts/prompt.py @@ -11,6 +11,10 @@ - {pr_number} - The PR number - {commit_id} - The HEAD commit SHA - {review_context} - Previous review comments and thread resolution status + +When sub-agent delegation is enabled (``use_sub_agents=True``), a short +delegation suffix is appended to the base prompt giving the agent the +option to delegate file-level reviews via the TaskToolSet. """ # Template for when there is review context available @@ -75,6 +79,78 @@ Analyze the changes and post your review using the GitHub API. """ +# Appended to PROMPT when use_sub_agents=True. Gives the main agent the +# option to delegate via the TaskToolSet without duplicating the base prompt. +_DELEGATION_SUFFIX = """ +## Sub-agent Delegation + +You have access to the **task** tool for delegating file-level reviews to +`file_reviewer` sub-agents. Use it when the diff is large — roughly 4+ files +or 500+ changed lines. For smaller diffs, just review directly. + +When delegating, split the diff by file (or small group of related files) and +call the task tool with `subagent_type: "file_reviewer"`. Each sub-agent will +return a JSON array of findings. Merge them, de-duplicate, drop noise, and +post a single consolidated review via the GitHub API. +""" + +# Skill content injected into each file_reviewer sub-agent. +# Defines the review persona, available tools, and — most importantly — the +# exact JSON schema the sub-agent must return. +FILE_REVIEWER_SKILL = """\ +You are a **file-level code reviewer** sub-agent. + +## Your Task + +You will receive a diff for one or more files from a pull request. +Review the changes and return structured findings. + +## Tools + +You have `terminal` and `file_editor` so you can inspect the full source +files for surrounding context — use `cat`, `grep`, or `file_editor view` +when the diff alone is not enough to judge an issue. + +## Review Style + +Be direct, pragmatic, and thorough. Focus on correctness, security, +simplicity, and maintainability. Call out real problems; skip trivial noise. + +## Output Format + +Return a JSON array wrapped in a ```json fenced code block. +Each element must have exactly these fields: + +| Field | Type | Description | +|------------|--------|-------------| +| `path` | string | File path exactly as shown in the diff header (e.g. `src/utils.py`) | +| `line` | int | Line number in the **new** file where the issue occurs | +| `severity` | string | One of: `"critical"`, `"major"`, `"minor"`, `"nit"` | +| `body` | string | Concise description of the issue, including a suggested fix | + +### Severity guide +- **critical** — bug, security vulnerability, or data loss +- **major** — incorrect logic, missing error handling, performance issue +- **minor** — style, readability, or minor correctness concern +- **nit** — cosmetic or trivial preference + +### Example + +```json +[ + {{"path": "src/utils.py", "line": 42, "severity": "major", "body": "Unchecked `None` return — add a guard before accessing `.value`."}}, + {{"path": "src/utils.py", "line": 78, "severity": "nit", "body": "Unused import `os`."}} +] +``` + +If you find no issues, return: +```json +[] +``` + +When you are done, call the `finish` tool with the JSON array as the message. +""" + def format_prompt( skill_trigger: str, @@ -88,6 +164,7 @@ def format_prompt( diff: str, review_context: str = "", require_evidence: bool = False, + use_sub_agents: bool = False, ) -> str: """Format the PR review prompt with all parameters. @@ -105,6 +182,9 @@ def format_prompt( the review context section is omitted from the prompt. require_evidence: Whether to instruct the reviewer to enforce PR description evidence showing the code works. + use_sub_agents: When True, the agent gets the TaskToolSet and decides + at runtime whether to delegate file-level reviews to + sub-agents based on diff size and complexity. Returns: Formatted prompt string @@ -121,7 +201,7 @@ def format_prompt( _EVIDENCE_REQUIREMENT_SECTION if require_evidence else "" ) - return PROMPT.format( + prompt = PROMPT.format( skill_trigger=skill_trigger, title=title, body=body, @@ -134,3 +214,8 @@ def format_prompt( evidence_requirements_section=evidence_requirements_section, diff=diff, ) + + if use_sub_agents: + prompt += _DELEGATION_SUFFIX + + return prompt diff --git a/tests/test_pr_review_prompt.py b/tests/test_pr_review_prompt.py index 5ab6aee..80c16a5 100644 --- a/tests/test_pr_review_prompt.py +++ b/tests/test_pr_review_prompt.py @@ -18,7 +18,9 @@ def _load_prompt_module(): return module -def _format_prompt(*, require_evidence: bool) -> str: +def _format_prompt( + *, require_evidence: bool, use_sub_agents: bool = False +) -> str: module = _load_prompt_module() return module.format_prompt( skill_trigger="/codereview", @@ -32,6 +34,7 @@ def _format_prompt(*, require_evidence: bool) -> str: diff="diff --git a/file b/file", review_context="", require_evidence=require_evidence, + use_sub_agents=use_sub_agents, ) @@ -71,3 +74,59 @@ def test_format_prompt_includes_evidence_requirements_when_enabled(): assert "real code path end-to-end" in prompt assert "unit test output" in prompt assert "https://app.all-hands.dev/conversations/{conversation_id}" in prompt + + +# --- Sub-agent delegation prompt tests --- + + +def test_format_prompt_uses_standard_prompt_by_default(): + prompt = _format_prompt(require_evidence=False, use_sub_agents=False) + + # Standard prompt should NOT mention delegation or sub-agents + assert "review coordinator" not in prompt + assert "TaskToolSet" not in prompt + assert "file_reviewer" not in prompt + # Standard prompt should contain the normal review instruction + assert "Analyze the changes and post your review" in prompt + + +def test_format_prompt_appends_delegation_suffix_when_enabled(): + prompt = _format_prompt(require_evidence=False, use_sub_agents=True) + + # Should still include the base prompt content + assert "Add evidence enforcement" in prompt + assert "OpenHands/extensions" in prompt + assert "abc123" in prompt + assert "diff --git a/file b/file" in prompt + assert "Analyze the changes and post your review" in prompt + # Delegation suffix appended + assert "Sub-agent Delegation" in prompt + assert "file_reviewer" in prompt + assert "task" in prompt.lower() + + +def test_delegation_suffix_with_evidence(): + prompt = _format_prompt(require_evidence=True, use_sub_agents=True) + + assert "Sub-agent Delegation" in prompt + assert "## PR Description Evidence Requirement" in prompt + + +def test_file_reviewer_skill_content(): + module = _load_prompt_module() + content = module.FILE_REVIEWER_SKILL + + assert "file-level code reviewer" in content + # Unified review style + assert "pragmatic" in content + # JSON schema documented + assert "path" in content + assert "line" in content + assert "severity" in content + assert "body" in content + assert "critical" in content + # Tool access documented + assert "terminal" in content + assert "file_editor" in content + # Sub-agent returns results via finish tool + assert "finish" in content diff --git a/tests/test_pr_review_review_context.py b/tests/test_pr_review_review_context.py index 6fead29..b999dfb 100644 --- a/tests/test_pr_review_review_context.py +++ b/tests/test_pr_review_review_context.py @@ -53,11 +53,18 @@ def set_trace_metadata(metadata): lmnr.Laminar = _Laminar sys.modules["lmnr"] = lmnr + class _Stub: + """Generic stub that accepts any arguments.""" + def __init__(self, *args, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + sdk = types.ModuleType("openhands.sdk") - sdk.LLM = object - sdk.Agent = object - sdk.AgentContext = object - sdk.Conversation = object + sdk.LLM = _Stub + sdk.Agent = _Stub + sdk.AgentContext = _Stub + sdk.Conversation = _Stub + sdk.Tool = _Stub class _Logger: def info(self, *args, **kwargs): @@ -75,6 +82,15 @@ def debug(self, *args, **kwargs): sdk.get_logger = lambda name: _Logger() sys.modules["openhands.sdk"] = sdk + class _Skill: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + sdk_context = _ensure_package("openhands.sdk.context") + sdk_context.Skill = _Skill + sys.modules["openhands.sdk.context"] = sdk_context + context_skills = types.ModuleType("openhands.sdk.context.skills") context_skills.load_project_skills = lambda cwd: [] sys.modules["openhands.sdk.context.skills"] = context_skills @@ -87,11 +103,35 @@ def debug(self, *args, **kwargs): git_utils.run_git_command = lambda command, repo_dir: "deadbeef" sys.modules["openhands.sdk.git.utils"] = git_utils + sdk_plugin = types.ModuleType("openhands.sdk.plugin") + sdk_plugin.PluginSource = object + sys.modules["openhands.sdk.plugin"] = sdk_plugin + + tools_delegate = types.ModuleType("openhands.tools.delegate") + tools_delegate.DelegationVisualizer = object + sys.modules["openhands.tools.delegate"] = tools_delegate + + # register_agent lives in openhands.sdk, not openhands.tools.delegate + sdk.register_agent = lambda **kwargs: None + + tools_task = types.ModuleType("openhands.tools.task") + + class _TaskToolSet: + name = "TaskToolSet" + + tools_task.TaskToolSet = _TaskToolSet + sys.modules["openhands.tools.task"] = tools_task + tools_preset = types.ModuleType("openhands.tools.preset.default") tools_preset.get_default_condenser = lambda llm: None tools_preset.get_default_tools = lambda enable_browser=False: [] sys.modules["openhands.tools.preset.default"] = tools_preset + # Clear any cached 'prompt' module so agent_script.py picks up the + # correct prompt.py from its own scripts/ directory (not the one from + # another plugin like release-notes). + sys.modules.pop("prompt", None) + script_path = ( Path(__file__).parent.parent / "plugins" @@ -163,3 +203,19 @@ def test_format_thread_includes_rendered_suggestion_text_in_review_context(): assert "- Do **NOT** approve the PR." in formatted assert "Dependabot ignores the freshness guardrail" in formatted assert "```suggestion" not in formatted + + +def test_register_sub_agents_completes_without_error(): + """Smoke test: _register_sub_agents() runs without raising.""" + module = _load_agent_script_module() + # _register_sub_agents calls register_agent (stubbed as a no-op) + module._register_sub_agents() + + +def test_create_file_reviewer_agent_factory_is_callable(): + """Smoke test: _create_file_reviewer_agent accepts an LLM and is callable.""" + module = _load_agent_script_module() + # The factory should be callable; with our stubs LLM is just `object` + result = module._create_file_reviewer_agent(object()) + # Agent stub is `object`, so the factory should return *something* + assert result is not None