Commit dd81d22
feat(oculus): Implement initial oculus evals framework. (#4715)
1 parent a16a397 commit dd81d22

18 files changed: +4765 -0 lines changed

servers/oculus/.env.example

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
ANTHROPIC_API_KEY=your-api-key-here
OPENAI_API_KEY=your-api-key-here
TURBOPUFFER_API_KEY=your-api-key-here
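
These keys are read from the process environment at runtime: the CLI entry point added in this commit calls load_dotenv(), and the Anthropic and OpenAI SDKs pick up ANTHROPIC_API_KEY and OPENAI_API_KEY from the environment by default. A minimal sketch of that flow (not part of the commit):

    # Sketch only: how the .env values are typically consumed.
    import os

    from dotenv import load_dotenv

    load_dotenv()  # populates the process environment from .env

    # The Anthropic/OpenAI clients read their keys from the environment automatically;
    # direct access works the same way.
    api_key = os.environ["ANTHROPIC_API_KEY"]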

servers/oculus/.gitignore

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Poetry
poetry.lock

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.mypy_cache/
.dmypy.json
dmypy.json

# Evaluation outputs
suites/*/answers/
suites/*/evals/
suites/*/results_*.json

# Environment variables
.env
.env.local
!.env.example

# OS
.DS_Store
Thumbs.db

servers/oculus/logging.conf

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
[loggers]
keys=root

[handlers]
keys=console

[formatters]
keys=simple

[logger_root]
level=WARNING
handlers=console

[handler_console]
class=StreamHandler
level=WARNING
formatter=simple
args=(sys.stdout,)

[formatter_simple]
format=%(levelname)s: %(message)s
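
Nothing in this commit shows where logging.conf is applied; presumably it is loaded with the standard library's fileConfig, roughly as sketched below (assumption, not part of the commit):

    # Sketch only: applying logging.conf via the stdlib.
    import logging
    import logging.config

    logging.config.fileConfig("logging.conf", disable_existing_loggers=False)
    logging.getLogger("oculus").warning("logging configured")  # WARNING and above go to stdout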

servers/oculus/makefile

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
install:
	poetry install

test:
	poetry run pytest -sv

typecheck:
	poetry run mypy src/

lint:
	poetry run ruff check src/

format:
	poetry run ruff format src/
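
Each target is a thin wrapper around the corresponding Poetry command, so `make install` followed by `make test` should exercise the suite end to end, assuming Poetry is available on the PATH.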

servers/oculus/poetry.lock

Lines changed: 3853 additions & 0 deletions
Some generated files are not rendered by default.

servers/oculus/pyproject.toml

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
[tool.poetry]
name = "oculus"
version = "0.1.0"
description = "Ask Fern Evaluation Pipeline"
authors = []
packages = [{include = "oculus", from = "src"}]

[[tool.poetry.source]]
name = "fern"
url = "https://pypi.buildwithfern.com"
priority = "supplemental"

[tool.poetry.dependencies]
python = "^3.11"
anthropic = "^0.57.1"
pydantic = "^2.8.0"
pyyaml = "^6.0.1"
python-dotenv = "^1.0.0"
fern-ai = {path = "../fai", develop = true}

[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"
mypy = "^1.8.0"
ruff = "^0.3.0"

[tool.poetry.scripts]
oculus = "oculus.__main__:main"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.ruff]
line-length = 120
target-version = "py311"

[tool.ruff.lint]
select = ["E", "F", "I", "UP"]

[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_configs = true
mypy_path = "src"
explicit_package_bases = true
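
Because of the [tool.poetry.scripts] entry, `poetry install` exposes an `oculus` console script that resolves to the `main` function in the CLI module that follows.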

servers/oculus/src/oculus/__main__.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import argparse
import sys
from pathlib import Path

from dotenv import load_dotenv

from oculus.framework.runner import EvaluationRunner
from oculus.integrations.fai_integration import create_fai_answer_function

load_dotenv()


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Run Ask Fern evaluations using LLM-as-a-judge",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  oculus --suite retrieval_quality --domain buildwithfern.com
  oculus --suite answer_quality --domain docs.cohere.com --run-id experiment_1
  oculus --suite test --domain example.com --model command-a-03-2025
  oculus --suite test --domain example.com --no-skip-existing
        """,
    )

    parser.add_argument("--suite", type=str, required=True, help="Name of the evaluation suite")
    parser.add_argument("--domain", type=str, required=True, help="Documentation domain to query")
    parser.add_argument("--suite-path", type=Path, default=None, help="Base path to suites directory")
    parser.add_argument("--run-id", type=str, default=None, help="Unique run identifier")
    parser.add_argument(
        "--model",
        type=str,
        default="claude-4-sonnet-20250514",
        choices=["claude-4-sonnet-20250514", "command-a-03-2025"],
        help="Model to use for answer generation",
    )
    parser.add_argument("--judge-model", type=str, default="claude-opus-4-20250514", help="Claude model for judging")
    parser.add_argument("--max-workers", type=int, default=16, help="Number of parallel workers")
    parser.add_argument("--no-skip-existing", action="store_true", help="Re-generate existing answers/evaluations")
    parser.add_argument("--output-dir", type=Path, default=None, help="Directory to save results")

    args = parser.parse_args()

    if args.suite_path:
        suite_base = args.suite_path
    else:
        suite_base = Path.cwd() / "suites"

    suite_path = suite_base / args.suite

    if not suite_path.exists():
        print(f"Error: Suite directory not found: {suite_path}", file=sys.stderr)
        print(f"\nExpected structure:", file=sys.stderr)
        print(f"  {suite_path}/", file=sys.stderr)
        print(f"    questions/", file=sys.stderr)
        print(f"      question_0.json", file=sys.stderr)
        return 1

    questions_dir = suite_path / "questions"
    if not questions_dir.exists() or not any(questions_dir.glob("*.json")):
        print(f"Error: No questions found in {questions_dir}", file=sys.stderr)
        return 1

    try:
        print(f"Initializing FAI integration for domain: {args.domain}")
        answer_fn = create_fai_answer_function(domain=args.domain, model=args.model)

        runner = EvaluationRunner(
            suite_name=args.suite,
            suite_path=suite_path,
            run_id=args.run_id,
            max_workers=args.max_workers,
        )

        result = runner.run(
            answer_fn=answer_fn,
            model_name=args.model,
            judge_model=args.judge_model,
            skip_existing=not args.no_skip_existing,
        )

        if args.output_dir:
            from oculus.utils.file_utils import save_json

            args.output_dir.mkdir(parents=True, exist_ok=True)
            output_path = args.output_dir / f"results_{result.run_id}.json"
            save_json(output_path, result.model_dump())
            print(f"\nAdditional output saved to: {output_path}")

        return 0

    except ImportError as e:
        print(f"\nError: Failed to import required modules", file=sys.stderr)
        print(f"{e}", file=sys.stderr)
        print(f"\nMake sure:", file=sys.stderr)
        print(f"  1. FAI dependencies are installed (poetry install in servers/fai)", file=sys.stderr)
        print(f"  2. PYTHONPATH includes the FAI source directory", file=sys.stderr)
        return 1

    except Exception as e:
        print(f"\nError: Evaluation failed", file=sys.stderr)
        print(f"{e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
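
The CLI only checks that suites/<suite>/questions/ contains *.json files; the on-disk schema is handled by EvaluationRunner, which is not part of this excerpt. Going by the Question model added later in this commit, a question file can presumably be produced along these lines (hypothetical content and field usage):

    # Hypothetical sketch: writing a question file in the layout the CLI looks for.
    import json
    from pathlib import Path

    question = {
        "question": "How do I authenticate requests to the example API?",  # hypothetical
        "ground_truth": "Requests use a bearer token in the Authorization header.",  # hypothetical
        "metadata": {"source": "sketch"},
    }

    path = Path("suites/test/questions/question_0.json")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(question, indent=2))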
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
import time
from typing import Any, Optional, Type, TypeVar, cast

from anthropic import Anthropic
from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)


def generate_with_claude(
    response_type: Type[T],
    prompt_template: str,
    model: str = "claude-opus-4-20250514",
    max_tokens: int = 1000,
    max_retries: int = 3,
    **kwargs: str,
) -> Optional[T]:
    anthropic_client = Anthropic()
    formatted_prompt = prompt_template.format(**kwargs)

    tools = [
        {
            "name": "build_response_result",
            "description": "Build the structured response object.",
            "input_schema": response_type.model_json_schema(),
        }
    ]

    tries = 0
    while tries < max_retries:
        try:
            response = anthropic_client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": formatted_prompt}],
                tools=cast(Any, tools),
                tool_choice=cast(Any, {"type": "tool", "name": "build_response_result"}),
            )

            function_call = response.content[0].input  # type: ignore[union-attr]
            parsed_response = response_type(**cast(dict[str, Any], function_call))
            return parsed_response

        except Exception as e:
            tries += 1
            if tries >= max_retries:
                print(f"Failed after {max_retries} attempts: {e}")
                return None
            time.sleep(0.5 * tries)

    return None


def evaluate_answer(
    question: str,
    answer: str,
    ground_truth: str,
    model: str = "claude-opus-4-20250514",
) -> Optional[Any]:
    from oculus.framework.models import EVALUATION_PROMPT_TEMPLATE, EvaluationResponse

    return generate_with_claude(
        response_type=EvaluationResponse,
        prompt_template=EVALUATION_PROMPT_TEMPLATE,
        model=model,
        question=question,
        answer=answer,
        ground_truth=ground_truth,
    )
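
A usage sketch for generate_with_claude: keyword arguments are substituted into the prompt template via str.format, and the reply is forced through the build_response_result tool so it validates against the given Pydantic model (None is returned after max_retries failures). The response type and prompt below are hypothetical, not part of the commit:

    # Usage sketch; assumes generate_with_claude from the module above is in scope
    # and ANTHROPIC_API_KEY is set in the environment.
    from pydantic import BaseModel


    class SentimentResult(BaseModel):  # hypothetical response type
        label: str
        confidence: float


    result = generate_with_claude(
        response_type=SentimentResult,
        prompt_template="Classify the sentiment of the following feedback: {text}",
        max_tokens=200,
        text="The docs were easy to follow.",
    )
    if result is not None:
        print(result.label, result.confidence)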

servers/oculus/src/oculus/framework/models.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
from pydantic import BaseModel, Field


class EvaluationResponse(BaseModel):
    is_correct: bool
    reason: str


EVALUATION_PROMPT_TEMPLATE = """You are evaluating the correctness of an AI assistant's answer to a technical question about API documentation.

Question: {question}

Ground Truth / Expected Information:
{ground_truth}

AI Assistant's Answer:
{answer}

Evaluate whether the AI assistant's answer is correct and complete based on the ground truth. The answer should:
1. Accurately represent the information in the ground truth
2. Not include significant hallucinations or incorrect information
3. Address the core question being asked

If the answer is mostly correct with minor issues, mark it as correct. Only mark as incorrect if there are significant errors or omissions.

Provide your evaluation with a brief reason."""


class Question(BaseModel):
    question: str
    ground_truth: str
    metadata: dict[str, str] = Field(default_factory=dict)


class Answer(BaseModel):
    question: str
    answer: str
    model: str
    metadata: dict[str, str] = Field(default_factory=dict)


class Evaluation(BaseModel):
    question: str
    answer: str
    ground_truth: str
    is_correct: bool
    reason: str
    metadata: dict[str, str] = Field(default_factory=dict)


class EvaluationRun(BaseModel):
    run_id: str
    timestamp: str
    suite: str
    results: list[Evaluation]
    metrics: "EvaluationMetrics"


class EvaluationMetrics(BaseModel):
    total_questions: int
    total_correct: int
    accuracy: float
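
The aggregation that fills EvaluationMetrics lives in oculus.framework.runner and is not shown here; given the field names, it presumably reduces the per-question Evaluation list roughly as in this sketch (assumption, using the models defined above):

    # Sketch only: turning per-question judgements into run-level metrics.
    evaluations = [
        Evaluation(question="q1", answer="a1", ground_truth="g1", is_correct=True, reason="matches"),
        Evaluation(question="q2", answer="a2", ground_truth="g2", is_correct=False, reason="missing detail"),
    ]

    total_correct = sum(e.is_correct for e in evaluations)
    metrics = EvaluationMetrics(
        total_questions=len(evaluations),
        total_correct=total_correct,
        accuracy=total_correct / len(evaluations) if evaluations else 0.0,
    )
    print(metrics.accuracy)  # 0.5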
