Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions .github/workflows/run-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ on:
required: false
default: main
type: string
extensions_branch:
description: Extensions repo branch to use (for testing feature branches with skills/plugins)
required: false
default: main
type: string
instance_ids:
description: >-
Comma-separated instance IDs to evaluate.
Expand Down Expand Up @@ -157,6 +162,7 @@ jobs:
echo "reason: ${{ github.event.inputs.reason || 'N/A' }}"
echo "eval_branch: ${{ github.event.inputs.eval_branch || 'main' }}"
echo "benchmarks_branch: ${{ github.event.inputs.benchmarks_branch || 'main' }}"
echo "extensions_branch: ${{ github.event.inputs.extensions_branch || 'main' }}"
echo "instance_ids: ${{ github.event.inputs.instance_ids || 'N/A' }}"
echo "num_infer_workers: ${{ github.event.inputs.num_infer_workers || '(default)' }}"
echo "num_eval_workers: ${{ github.event.inputs.num_eval_workers || '(default)' }}"
Expand Down Expand Up @@ -341,6 +347,7 @@ jobs:
EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}
EVAL_BRANCH: ${{ github.event.inputs.eval_branch || 'main' }}
BENCHMARKS_BRANCH: ${{ github.event.inputs.benchmarks_branch || 'main' }}
EXTENSIONS_BRANCH: ${{ github.event.inputs.extensions_branch || 'main' }}
BENCHMARK: ${{ github.event.inputs.benchmark || 'swebench' }}
TRIGGER_REASON: ${{ github.event.inputs.reason }}
PR_NUMBER: ${{ steps.params.outputs.pr_number }}
Expand All @@ -357,7 +364,7 @@ jobs:
# Normalize instance_ids: strip all spaces
INSTANCE_IDS=$(printf '%s' "$INSTANCE_IDS" | tr -d ' ')

echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, tool preset: $TOOL_PRESET)"
echo "Dispatching evaluation workflow with SDK commit: $SDK_SHA (benchmark: $BENCHMARK, eval branch: $EVAL_BRANCH, benchmarks branch: $BENCHMARKS_BRANCH, extensions branch: $EXTENSIONS_BRANCH, tool preset: $TOOL_PRESET)"
PAYLOAD=$(jq -n \
--arg sdk "$SDK_SHA" \
--arg sdk_run_id "${{ github.run_id }}" \
Expand All @@ -367,6 +374,7 @@ jobs:
--arg reason "$TRIGGER_REASON" \
--arg pr "$PR_NUMBER" \
--arg benchmarks "$BENCHMARKS_BRANCH" \
--arg extensions "$EXTENSIONS_BRANCH" \
--arg benchmark "$BENCHMARK" \
--arg instance_ids "$INSTANCE_IDS" \
--arg num_infer_workers "$NUM_INFER_WORKERS" \
Expand All @@ -377,7 +385,7 @@ jobs:
--arg agent_type "$AGENT_TYPE" \
--arg partial_archive_url "$PARTIAL_ARCHIVE_URL" \
--arg triggered_by "$TRIGGERED_BY" \
'{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}')
'{ref: $ref, inputs: {sdk_commit: $sdk, sdk_workflow_run_id: $sdk_run_id, eval_limit: $eval_limit, models_json: ($models | tostring), trigger_reason: $reason, pr_number: $pr, benchmarks_branch: $benchmarks, extensions_branch: $extensions, benchmark: $benchmark, instance_ids: $instance_ids, num_infer_workers: $num_infer_workers, num_eval_workers: $num_eval_workers, enable_conversation_event_logging: $enable_conversation_event_logging, max_retries: $max_retries, tool_preset: $tool_preset, agent_type: $agent_type, partial_archive_url: $partial_archive_url, triggered_by: $triggered_by}}')
RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \
-H "Authorization: token $PAT_TOKEN" \
-H "Accept: application/vnd.github+json" \
Expand Down
5 changes: 4 additions & 1 deletion openhands-sdk/openhands/sdk/context/skills/skill.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import io
import json
import os
import re
from pathlib import Path
from typing import Annotated, ClassVar, Literal, Union
Expand Down Expand Up @@ -891,7 +892,9 @@ def load_project_skills(work_dir: str | Path) -> list[Skill]:

# Public skills repository configuration
PUBLIC_SKILLS_REPO = "https://github.com/OpenHands/extensions"
PUBLIC_SKILLS_BRANCH = "main"
# Allow overriding the branch via EXTENSIONS_REF environment variable
# (used by evaluation/benchmarks workflows to test feature branches)
PUBLIC_SKILLS_BRANCH = os.environ.get("EXTENSIONS_REF", "main")
DEFAULT_MARKETPLACE_PATH = "marketplaces/default.json"


Expand Down
91 changes: 91 additions & 0 deletions tests/sdk/context/skill/test_extensions_ref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Tests for EXTENSIONS_REF environment variable support.

These tests use subprocess to run each test in an isolated Python process,
avoiding module state pollution that would affect other tests.
"""

import subprocess
import sys


def _run_in_subprocess(test_code: str, env_extra: dict | None = None) -> None:
"""Run test code in a subprocess with the given environment variables."""
import os

env = os.environ.copy()
if env_extra:
env.update(env_extra)

result = subprocess.run(
[sys.executable, "-c", test_code],
env=env,
capture_output=True,
text=True,
)
if result.returncode != 0:
raise AssertionError(
f"Subprocess test failed:\nstdout: {result.stdout}\nstderr: {result.stderr}"
)


def test_extensions_ref_default():
    """Without EXTENSIONS_REF in the environment, the branch constant is 'main'."""
    # The snippet deletes any inherited EXTENSIONS_REF before the module is
    # imported, since the constant is evaluated at import time.
    _run_in_subprocess(
        """
import os
if "EXTENSIONS_REF" in os.environ:
    del os.environ["EXTENSIONS_REF"]
from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
assert PUBLIC_SKILLS_BRANCH == "main", (
    f"Expected 'main' but got '{PUBLIC_SKILLS_BRANCH}'"
)
"""
    )


def test_extensions_ref_custom_branch():
    """Setting EXTENSIONS_REF makes PUBLIC_SKILLS_BRANCH use that value."""
    snippet = """
from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
assert PUBLIC_SKILLS_BRANCH == "feature-branch", (
    f"Expected 'feature-branch' but got '{PUBLIC_SKILLS_BRANCH}'"
)
"""
    _run_in_subprocess(snippet, env_extra={"EXTENSIONS_REF": "feature-branch"})


def test_extensions_ref_with_load_public_skills():
    """load_public_skills should respect EXTENSIONS_REF environment variable."""
    # Runs in a subprocess because PUBLIC_SKILLS_BRANCH is bound at module
    # import time (os.environ.get in skill.py) — the env var must be set
    # before the module is first imported in the child process.
    # update_skills_repository is mocked so no network/git access happens.
    code = """
from unittest import mock
from openhands.sdk.context.skills.skill import (
    PUBLIC_SKILLS_BRANCH,
    load_public_skills,
)
assert PUBLIC_SKILLS_BRANCH == "test-branch", (
    f"Expected 'test-branch' but got '{PUBLIC_SKILLS_BRANCH}'"
)
with mock.patch(
    "openhands.sdk.context.skills.skill.update_skills_repository"
) as mock_update:
    mock_update.return_value = None
    load_public_skills()
    mock_update.assert_called_once()
    call_args = mock_update.call_args
    # branch is 2nd positional arg: (repo_url, branch, cache_dir)
    assert call_args[0][1] == "test-branch", (
        f"Expected branch='test-branch' but got {call_args[0][1]}"
    )
"""
    _run_in_subprocess(code, {"EXTENSIONS_REF": "test-branch"})


def test_extensions_ref_empty_string():
    """An empty EXTENSIONS_REF is passed through verbatim — it does NOT fall
    back to 'main'.

    PUBLIC_SKILLS_BRANCH uses ``os.environ.get("EXTENSIONS_REF", "main")``,
    whose default applies only when the variable is *unset*; an empty value
    therefore yields an empty branch name. This test pins that (arguably
    surprising) behavior — if skill.py ever switches to ``... or "main"``,
    update this assertion to expect 'main'.
    """
    code = """
from openhands.sdk.context.skills.skill import PUBLIC_SKILLS_BRANCH
# Empty string returns empty string per os.environ.get behavior
assert PUBLIC_SKILLS_BRANCH == "", (
    f"Expected '' but got '{PUBLIC_SKILLS_BRANCH}'"
)
"""
    _run_in_subprocess(code, {"EXTENSIONS_REF": ""})
Loading