|
| 1 | +import click |
| 2 | + |
| 3 | +from veadk.utils.logger import get_logger |
| 4 | + |
| 5 | +logger = get_logger(__name__) |
| 6 | + |
| 7 | + |
@click.command()
@click.option(
    "--agent-dir",
    default=".",
    help="To-be-evaluated agent directory. Must export `root_agent` in `agent.py`",
)
@click.option(
    "--agent-a2a-url",
    default=None,
    help="To-be-evaluated agent URL. The agent should be deployed as A2A mode.",
)
@click.option(
    "--evalset-file",
    required=True,
    help="Google ADK formatted evalset file path",
)
@click.option(
    "--evaluator",
    type=click.Choice(["adk", "deepeval"], case_sensitive=False),
    help="Evaluator type, choose `adk` or `deepeval`",
)
@click.option(
    "--judge-model-name",
    default="doubao-1-5-pro-256k-250115",
    help="Judge model name, default is `doubao-1-5-pro-256k-250115`. Useless under `adk` evaluator.",
)
@click.option(
    "--volcengine-access-key",
    default=None,
    help="Volcengine access key for using Volcengine models",
)
@click.option(
    "--volcengine-secret-key",
    default=None,
    help="Volcengine secret key for using Volcengine models",
)
def eval(
    agent_dir: str,
    agent_a2a_url: str,
    evalset_file: str,
    evaluator: str,
    judge_model_name: str,
    volcengine_access_key: str,
    volcengine_secret_key: str,
) -> None:
    """Evaluate an agent against an evalset file with the chosen evaluator.

    \f
    The agent is either loaded from a local directory (``--agent-dir``) or
    wrapped as a remote A2A agent (``--agent-a2a-url``); when both are given
    the URL takes precedence.

    Raises:
        click.UsageError: If no evaluator was selected.
        ImportError: If the evaluation dependencies are not installed.
        ValueError: If neither an agent directory nor an A2A URL is provided.
    """
    # NOTE: the name `eval` shadows the builtin; it is kept because it is the
    # public command name.
    import asyncio
    import os
    from pathlib import Path

    # Without an evaluator there is nothing to run; fail loudly instead of
    # loading the agent and exiting silently.
    if not evaluator:
        raise click.UsageError(
            "Option `--evaluator` is required, choose `adk` or `deepeval`."
        )

    from google.adk.cli.utils.agent_loader import AgentLoader

    from veadk.a2a.remote_ve_agent import RemoteVeAgent
    from veadk.config import getenv, settings
    from veadk.prompts.prompt_evaluator import eval_principle_prompt

    try:
        from deepeval.metrics import GEval, ToolCorrectnessMetric
        from deepeval.test_case import LLMTestCaseParams

        from veadk.evaluation.adk_evaluator import ADKEvaluator
        from veadk.evaluation.deepeval_evaluator import DeepevalEvaluator
    except ImportError as err:
        # Chain the original error so the actually-missing module stays visible.
        raise ImportError(
            "Please install veadk with `[evaluation]` extras, e.g., `pip install veadk-python[eval]`"
        ) from err

    # ====== prepare agent instance ======
    if not agent_dir and not agent_a2a_url:
        raise ValueError(
            "Option `--agent-dir` or `--agent-a2a-url` should be provided one of them."
        )

    if agent_a2a_url:
        if agent_dir:
            logger.warning(
                "`--agent-dir` and `--agent-a2a-url` are both provided, will use `--agent-a2a-url`."
            )
        agent_instance = RemoteVeAgent(name="a2a_agent", url=agent_a2a_url)
        logger.info(f"Loaded agent from {agent_a2a_url}")
    else:
        # Normalise first: for the default "." (or ".."), `Path(...).name` is
        # "" (or "..") and `.parent` is ".", which would make the loader look
        # for an agent with an empty/invalid name.
        agent_path = Path(os.path.abspath(agent_dir))
        agent_instance = AgentLoader(str(agent_path.parent.resolve())).load_agent(
            agent_path.name
        )
        logger.info(f"Loaded agent from {agent_dir}, agent name: {agent_instance.name}")

    # ====== prepare envs ======
    # Values already present in the environment win over the CLI options.
    if volcengine_access_key and "VOLCENGINE_ACCESS_KEY" not in os.environ:
        os.environ["VOLCENGINE_ACCESS_KEY"] = volcengine_access_key
    if volcengine_secret_key and "VOLCENGINE_SECRET_KEY" not in os.environ:
        os.environ["VOLCENGINE_SECRET_KEY"] = volcengine_secret_key

    # ====== prepare evaluator instance ======
    if evaluator == "adk":
        # The judge model only matters for deepeval; warn, but never let its
        # value decide whether the ADK evaluation runs at all.
        if judge_model_name:
            logger.warning(
                "Using Google ADK evaluator, `--judge-model-name` will be ignored."
            )
        evaluator_instance = ADKEvaluator(agent=agent_instance)

        asyncio.run(evaluator_instance.evaluate(eval_set_file_path=evalset_file))

    elif evaluator == "deepeval":
        # NOTE(review): the resolved keys are not used below in this function;
        # the lookups are kept because `getenv`'s behaviour (e.g. raising on a
        # missing variable) is not visible here -- confirm before removing.
        if not volcengine_access_key:
            volcengine_access_key = getenv("VOLCENGINE_ACCESS_KEY")
        if not volcengine_secret_key:
            volcengine_secret_key = getenv("VOLCENGINE_SECRET_KEY")

        evaluator_instance = DeepevalEvaluator(
            agent=agent_instance,
            judge_model_api_key=settings.model.api_key,
            judge_model_name=judge_model_name,
        )

        judge_model = evaluator_instance.judge_model

        metrics = [
            GEval(
                threshold=0.8,
                name="Base Evaluation",
                criteria=eval_principle_prompt,
                evaluation_params=[
                    LLMTestCaseParams.INPUT,
                    LLMTestCaseParams.ACTUAL_OUTPUT,
                    LLMTestCaseParams.EXPECTED_OUTPUT,
                ],
                model=judge_model,
            ),
            ToolCorrectnessMetric(threshold=0.5),
        ]

        asyncio.run(
            evaluator_instance.evaluate(
                eval_set_file_path=evalset_file, metrics=metrics
            )
        )
0 commit comments