Merge pull request #176 from volcengine/feat/cli-eval

floritange · web-flow · commit 06281a5f9359 · 2025-09-18T15:11:55.000+08:00
feat(eval): support do eval from cli
diff --git a/docs/content/90.cli/1.overview.md b/docs/content/90.cli/1.overview.md
@@ -5,4 +5,12 @@ navigation:
   icon: i-lucide-heading-1
 ---
 
-VeADK 提供了多类命令。
+VeADK 提供如下命令，方便您进行各类操作：
+
+| 命令 | 描述 | 说明 |
+| :-- | :-- | :-- |
+| `veadk init` | 生成可在 VeFaaS 中部署的项目脚手架 | 将会在您的目录中新增 `deploy.yaml` 文件 |
+| `veadk deploy` | 将某个项目部署到 VeFaaS 中 | |
+| `veadk prompt` | 优化智能体系统提示词 | 借助火山引擎 PromptPilot 产品 |
+| `veadk web` | 支持长短期记忆、知识库的前端调试界面 | 兼容 Google ADK web |
+| `veadk eval` | 支持不同后端的评测 | 评测后端包括 `adk` 与 `deepeval`，评测数据集源包括 Google ADK 评测集格式文件，以及 Tracing 文件 |
diff --git a/docs/content/90.cli/2.commands.md b/docs/content/90.cli/2.commands.md
@@ -50,3 +50,37 @@ veadk web --session_service_uri="mysql+pymysql://{user}:{password}@{host}/{datab
 ```
 
 它们能够自动读取执行命令目录中的`agent.py`文件，并加载`root_agent`全局变量。
+
+## 评测
+
+通过使用 `veadk eval` 来进行智能体评测，相关参数如下：
+
+::field-group
+  ::field{name="--agent-dir" type="string"}
+  待评测的 Agent 目录，默认为当前目录（`.`）。该目录需要符合 Google ADK 的项目结构，即包含一个名为 `agent.py` 且导出 `root_agent` 的文件。
+  ::
+
+  ::field{name="--agent-a2a-url" type="string"}
+  待评测的云端 Agent 的 URL，要求该 Agent 以 A2A 协议部署。
+  ::
+
+  ::field{name="--evalset-file" type="string"}
+  Google ADK 格式的评测集文件路径（必填）。
+  ::
+
+  ::field{name="--evaluator" type="string"}
+  评测器类别：`adk` 为 Google ADK 内置评测器，评测模型为自身；`deepeval` 为 DeepEval 评测器，评测模型可以通过 `--judge-model-name` 参数指定。
+  ::
+
+  ::field{name="--judge-model-name" type="string"}
+  评测模型名称，默认为 `doubao-1-5-pro-256k-250115`。该参数在 `--evaluator` 值为 `adk` 时无效。
+  ::
+
+  ::field{name="--volcengine-access-key" type="string"}
+  火山引擎 Access Key
+  ::
+
+  ::field{name="--volcengine-secret-key" type="string"}
+  火山引擎 Secret Key
+  ::
+::
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
@@ -62,60 +62,61 @@
 TRACE_SET_DATA = [
     {
         "name": "execute_tool get_city_weather",
-        "span_id": 5421848634094108689,
-        "trace_id": 115143748123782151752771111946932434777,
-        "start_time": 1754884672226444000,
-        "end_time": 1754884672226993000,
+        "span_id": 4497348974122733469,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158957162250000,
+        "end_time": 1758158957162426000,
         "attributes": {
             "gen_ai.tool.name": "get_city_weather",
-            "gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
-            "gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
-            "gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
+            "gen_ai.tool.input": '{"name": "get_city_weather", "description": "Retrieves the weather information of a given city. the args must in English", "parameters": {"city": "Beijing"}}',
+            "gen_ai.tool.output": '{"id": "call_w4bj25flpvs74zgyyiquqh5s", "name": "get_city_weather", "response": {"result": "Sunny, 25°C"}}',
         },
-        "parent_span_id": 7997784243558253239,
+        "parent_span_id": 574819447039686650,
     },
     {
         "name": "call_llm",
-        "span_id": 7997784243558253239,
-        "trace_id": 115143748123782151752771111946932434777,
+        "span_id": 574819447039686650,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807630000,
+        "end_time": 1758158957171304000,
         "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
+            "gen_ai.app.name": "veadk_default_app",
+            "gen_ai.user.id": "veadk_default_user",
+            "gen_ai.prompt.0.role": "user",
+            "gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
         },
-        "parent_span_id": 14844888006539887900,
+        "parent_span_id": 13789664766018020416,
     },
     {
         "name": "call_llm",
-        "span_id": 7789424022423491416,
-        "trace_id": 115143748123782151752771111946932434777,
+        "span_id": 9007934154052797946,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158957171713000,
+        "end_time": 1758158964035230000,
         "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
+            "gen_ai.app.name": "veadk_default_app",
+            "gen_ai.user.id": "veadk_default_user",
+            "gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
+            "gen_ai.completion.0.content": "The weather in Beijing is sunny with a temperature of 25°C.",
         },
-        "parent_span_id": 14844888006539887900,
+        "parent_span_id": 13789664766018020416,
     },
     {
         "name": "agent_run [chat_robot]",
-        "span_id": 14844888006539887900,
-        "trace_id": 115143748123782151752771111946932434777,
-        "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
-        },
-        "parent_span_id": 2943363177785645047,
+        "span_id": 13789664766018020416,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807350000,
+        "end_time": 1758158964035291000,
+        "attributes": {},
+        "parent_span_id": 5589459087402275636,
     },
     {
-        "name": "invocation [veadk_default_app]",
-        "span_id": 2943363177785645047,
-        "trace_id": 115143748123782151752771111946932434777,
-        "start_time": 1754884660687962000,
-        "end_time": 1754884676664833000,
-        "attributes": {
-            "input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
-            "user.id": "veadk_default_user",
-            "session.id": "veadk_example_session",
-            "output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
-        },
+        "name": "invocation",
+        "span_id": 5589459087402275636,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807233000,
+        "end_time": 1758158964035304000,
+        "attributes": {},
         "parent_span_id": None,
     },
 ]
@@ -154,8 +155,8 @@ def test_tracing_file_to_evalset():
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
     assert (
-        base_evaluator.invocation_list[0].invocations[0].invocation_id
-        == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
+        base_evaluator.invocation_list[0].invocations[0].expected_output
+        == "The weather in Beijing is sunny with a temperature of 25°C."
     )
 
     os.remove(tracing_file_path)
diff --git a/veadk/cli/cli.py b/veadk/cli/cli.py
@@ -16,10 +16,11 @@
 import click
 
 from veadk.cli.cli_deploy import deploy
+from veadk.cli.cli_eval import eval
 from veadk.cli.cli_init import init
+from veadk.cli.cli_pipeline import pipeline
 from veadk.cli.cli_prompt import prompt
 from veadk.cli.cli_web import web
-from veadk.cli.cli_pipeline import pipeline
 from veadk.version import VERSION
 
 
@@ -37,6 +38,7 @@ def veadk():
 veadk.add_command(prompt)
 veadk.add_command(web)
 veadk.add_command(pipeline)
+veadk.add_command(eval)
 
 if __name__ == "__main__":
     veadk()
diff --git a/veadk/cli/cli_eval.py b/veadk/cli/cli_eval.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+
+from veadk.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@click.command()
+@click.option(
+    "--agent-dir",
+    default=".",
+    help="To-be-evaluated agent directory. Must export `root_agent` in `agent.py`",
+)
+@click.option(
+    "--agent-a2a-url",
+    default=None,
+    help="To-be-evaluated agent URL. The agent should be deployed as A2A mode.",
+)
+@click.option(
+    "--evalset-file",
+    required=True,
+    help="Google ADK formatted evalset file path",
+)
+@click.option(
+    "--evaluator",
+    type=click.Choice(["adk", "deepeval"], case_sensitive=False),
+    help="Evaluator type, choose `adk` or `deepeval`",
+)
+@click.option(
+    "--judge-model-name",
+    default="doubao-1-5-pro-256k-250115",
+    help="Judge model name, default is `doubao-1-5-pro-256k-250115`. Useless under `adk` evaluator.",
+)
+@click.option(
+    "--volcengine-access-key",
+    default=None,
+    help="Volcengine access key for using Volcengine models",
+)
+@click.option(
+    "--volcengine-secret-key",
+    default=None,
+    help="Volcengine secret key for using Volcengine models",
+)
+def eval(
+    agent_dir: str,
+    agent_a2a_url: str,
+    evalset_file: str,
+    evaluator: str,
+    judge_model_name: str,
+    volcengine_access_key: str,
+    volcengine_secret_key: str,
+) -> None:
+    """Evaluate an agent on an evalset file with the `adk` or `deepeval` evaluator.
+    \f
+
+    Everything after the form feed above is hidden from `--help` by Click.
+
+    Args:
+        agent_dir: Local agent directory. Used only when `agent_a2a_url` is
+            not given; the agent is loaded through ADK's `AgentLoader`.
+        agent_a2a_url: URL of a remote agent. When given, a `RemoteVeAgent`
+            is evaluated and `agent_dir` is ignored. May be None.
+        evalset_file: Path handed to the evaluator as `eval_set_file_path`.
+        evaluator: Either "adk" or "deepeval". May be None when the option
+            is omitted on the command line.
+        judge_model_name: Judge model name passed to `DeepevalEvaluator`;
+            ignored by the `adk` evaluator.
+        volcengine_access_key: Exported as `VOLCENGINE_ACCESS_KEY` unless
+            that variable is already present in the environment.
+        volcengine_secret_key: Exported as `VOLCENGINE_SECRET_KEY` unless
+            that variable is already present in the environment.
+
+    Raises:
+        click.UsageError: If `--evaluator` was not provided.
+        ImportError: If the evaluation dependencies cannot be imported.
+        ValueError: If neither `--agent-dir` nor `--agent-a2a-url` is set.
+    """
+    # Without an evaluator there is nothing to run; fail loudly instead of
+    # loading the agent and then silently exiting.
+    if evaluator is None:
+        raise click.UsageError(
+            "Option `--evaluator` is required, choose `adk` or `deepeval`."
+        )
+
+    # Imports are kept local so that importing the CLI module stays cheap and
+    # does not require the optional evaluation dependencies.
+    import asyncio
+    import os
+    from pathlib import Path
+
+    from google.adk.cli.utils.agent_loader import AgentLoader
+
+    from veadk.a2a.remote_ve_agent import RemoteVeAgent
+    from veadk.config import getenv, settings
+    from veadk.prompts.prompt_evaluator import eval_principle_prompt
+
+    try:
+        from deepeval.metrics import GEval, ToolCorrectnessMetric
+        from deepeval.test_case import LLMTestCaseParams
+
+        from veadk.evaluation.adk_evaluator import ADKEvaluator
+        from veadk.evaluation.deepeval_evaluator import DeepevalEvaluator
+    except ImportError as err:
+        # NOTE(review): the message names both `[evaluation]` and `[eval]`;
+        # confirm which extras name the package actually defines.
+        raise ImportError(
+            "Please install veadk with `[evaluation]` extras, e.g., `pip install veadk-python[eval]`"
+        ) from err
+
+    # ====== prepare agent instance ======
+    if agent_a2a_url:
+        # NOTE: `--agent-dir` defaults to ".", so this warning is also emitted
+        # when the user passed only `--agent-a2a-url`.
+        if agent_dir:
+            logger.warning(
+                "`--agent-dir` and `--agent-a2a-url` are both provided, will use `--agent-a2a-url`."
+            )
+        agent_instance = RemoteVeAgent(name="a2a_agent", url=agent_a2a_url)
+        logger.info(f"Loaded agent from {agent_a2a_url}")
+    elif agent_dir:
+        # Resolve first: for relative inputs such as "." the unresolved path
+        # has an empty `.name`, which would ask the loader for an agent named "".
+        agent_path = Path(agent_dir).resolve()
+        agent_instance = AgentLoader(str(agent_path.parent)).load_agent(
+            agent_path.name
+        )
+        logger.info(f"Loaded agent from {agent_dir}, agent name: {agent_instance.name}")
+    else:
+        raise ValueError(
+            "Option `--agent-dir` or  `--agent-a2a-url` should be provided one of them."
+        )
+
+    # ====== prepare envs ======
+    # Values already present in the environment take precedence over the
+    # command-line options.
+    if volcengine_access_key and "VOLCENGINE_ACCESS_KEY" not in os.environ:
+        os.environ["VOLCENGINE_ACCESS_KEY"] = volcengine_access_key
+    if volcengine_secret_key and "VOLCENGINE_SECRET_KEY" not in os.environ:
+        os.environ["VOLCENGINE_SECRET_KEY"] = volcengine_secret_key
+
+    # ====== prepare evaluator instance ======
+    if evaluator == "adk":
+        if judge_model_name:
+            logger.warning(
+                "Using Google ADK evaluator, `--judge-model-name` will be ignored."
+            )
+        # The evaluation itself must not depend on `judge_model_name`.
+        evaluator_instance = ADKEvaluator(agent=agent_instance)
+
+        asyncio.run(evaluator_instance.evaluate(eval_set_file_path=evalset_file))
+    elif evaluator == "deepeval":
+        # The looked-up values are not used further in this function;
+        # presumably `getenv` validates that the variables exist -- TODO confirm.
+        if not volcengine_access_key:
+            volcengine_access_key = getenv("VOLCENGINE_ACCESS_KEY")
+        if not volcengine_secret_key:
+            volcengine_secret_key = getenv("VOLCENGINE_SECRET_KEY")
+
+        evaluator_instance = DeepevalEvaluator(
+            agent=agent_instance,
+            judge_model_api_key=settings.model.api_key,
+            judge_model_name=judge_model_name,
+        )
+
+        judge_model = evaluator_instance.judge_model
+
+        metrics = [
+            # LLM-judged comparison of input, actual output and expected output.
+            GEval(
+                threshold=0.8,
+                name="Base Evaluation",
+                criteria=eval_principle_prompt,
+                evaluation_params=[
+                    LLMTestCaseParams.INPUT,
+                    LLMTestCaseParams.ACTUAL_OUTPUT,
+                    LLMTestCaseParams.EXPECTED_OUTPUT,
+                ],
+                model=judge_model,
+            ),
+            ToolCorrectnessMetric(threshold=0.5),
+        ]
+
+        asyncio.run(
+            evaluator_instance.evaluate(
+                eval_set_file_path=evalset_file, metrics=metrics
+            )
+        )
diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py
diff --git a/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py b/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py