Skip to content

Commit a5c7e65

Browse files
committed
✨ feat: comprehensive evaluation system overhaul
- Restructure Makefile with dedicated evaluation commands
- Migrate from conftest.py to config.py for shared evaluation configuration
- Implement dual evaluation approach: graph trajectory + multi-turn chat simulation
- Add comprehensive documentation with latest results and troubleshooting
- Configure linting exceptions for evaluation scripts (allow print statements)
- Consolidate evaluation utilities and scoring systems
1 parent de9ed4f commit a5c7e65

File tree

8 files changed

+1746
-336
lines changed

8 files changed

+1746
-336
lines changed

Makefile

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: all format lint test test_unit test_integration test_e2e test_all test_evaluations test_watch test_watch_unit test_watch_integration test_watch_e2e test_profile extended_tests dev dev_ui
1+
.PHONY: all format lint test test_unit test_integration test_e2e test_all evals eval_graph eval_multiturn eval_graph_qwen eval_graph_glm eval_multiturn_polite eval_multiturn_hacker test_watch test_watch_unit test_watch_integration test_watch_e2e test_profile extended_tests dev dev_ui
22

33
# Default target executed when no arguments are given to make.
44
all: help
@@ -20,15 +20,41 @@ test_integration:
2020
test_e2e:
2121
uv run python -m pytest tests/e2e_tests/
2222

23-
test_evaluations:
24-
uv run python -m pytest tests/evaluations/ -v
25-
26-
test_eval_graph:
27-
uv run python -m pytest -n auto tests/evaluations/graph.py -v
28-
2923
test_all:
3024
uv run python -m pytest tests/
3125

26+
######################
27+
# EVALUATIONS
28+
######################
29+
30+
# Comprehensive evaluation suite
31+
evals: eval_graph eval_multiturn
32+
33+
# Graph trajectory evaluation (scenario-specific LLM-as-judge).
34+
eval_graph:
35+
cd tests/evaluations && uv run python graph.py --verbose
36+
37+
# Multi-turn chat evaluation (role-persona simulations).
38+
eval_multiturn:
39+
cd tests/evaluations && uv run python multiturn.py --verbose
40+
41+
# Run specific evaluation scenarios: one model (--model) or one persona (--persona) per target.
42+
eval_graph_qwen:
43+
cd tests/evaluations && uv run python graph.py --model siliconflow:Qwen/Qwen3-8B --verbose
44+
45+
eval_graph_glm:
46+
cd tests/evaluations && uv run python graph.py --model siliconflow:THUDM/GLM-4-9B-0414 --verbose
47+
48+
eval_multiturn_polite:
49+
cd tests/evaluations && uv run python multiturn.py --persona polite --verbose
50+
51+
eval_multiturn_hacker:
52+
cd tests/evaluations && uv run python multiturn.py --persona hacker --verbose
53+
54+
######################
55+
# WATCH MODES
56+
######################
57+
3258
# Watch mode for tests
3359
test_watch: test_watch_unit
3460

@@ -115,14 +141,21 @@ help:
115141
@echo 'test_unit - run unit tests only'
116142
@echo 'test_integration - run integration tests only'
117143
@echo 'test_e2e - run e2e tests only'
118-
@echo 'test_evaluations - run graph trajectory evaluation tests'
119-
@echo 'test_eval_graph - run graph evaluations in parallel (fast)'
120144
@echo 'test_all - run all tests (unit + integration + e2e)'
121145
@echo 'test_watch - run unit tests in watch mode'
122146
@echo 'test_watch_unit - run unit tests in watch mode'
123147
@echo 'test_watch_integration - run integration tests in watch mode'
124148
@echo 'test_watch_e2e - run e2e tests in watch mode'
125149
@echo ''
150+
@echo 'EVALUATIONS:'
151+
@echo 'evals - run comprehensive evaluation suite (all models)'
152+
@echo 'eval_graph - run graph trajectory evaluations (LLM-as-judge)'
153+
@echo 'eval_multiturn - run multi-turn chat evaluations (role-persona)'
154+
@echo 'eval_graph_qwen - run graph evaluation with Qwen/Qwen3-8B model'
155+
@echo 'eval_graph_glm - run graph evaluation with THUDM/GLM-4-9B model'
156+
@echo 'eval_multiturn_polite - run multiturn with polite persona only'
157+
@echo 'eval_multiturn_hacker - run multiturn with hacker persona only'
158+
@echo ''
126159
@echo 'CODE QUALITY:'
127160
@echo 'format - run code formatters'
128161
@echo 'lint - run linters (ruff + mypy on src/)'

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ lint.ignore = [
6161
]
6262
[tool.ruff.lint.per-file-ignores]
6363
"tests/*" = ["D", "UP"]
64+
"tests/evaluations/*" = ["D", "UP", "T201"] # Allow print statements in evaluation scripts
6465
[tool.ruff.lint.pydocstyle]
6566
convention = "google"
6667

0 commit comments

Comments
 (0)