feat: parsing the tracing file into evaluation cases (#44)

BhAem · web-flow · commit 21a564ebeca6 · 2025-08-12T11:30:10.000+08:00
* feat: parsing the tracing file into evaluation cases

* feat: parsing the tracing file into evaluation cases

* fix: shorten tracing samples in the test

* fix: manage evalset and tracing file uniformly in one function

* fix: modify the function name

---------

Co-authored-by: wuqingfu.528 <wuqingfu.528@bytedance.com>
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
@@ -59,6 +59,67 @@
     ],
 }
 
+TRACE_SET_DATA = [
+    {
+        "name": "execute_tool get_city_weather",
+        "span_id": 5421848634094108689,
+        "trace_id": 115143748123782151752771111946932434777,
+        "start_time": 1754884672226444000,
+        "end_time": 1754884672226993000,
+        "attributes": {
+            "gen_ai.tool.name": "get_city_weather",
+            "gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
+            "gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
+            "gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
+        },
+        "parent_span_id": 7997784243558253239,
+    },
+    {
+        "name": "call_llm",
+        "span_id": 7997784243558253239,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 14844888006539887900,
+    },
+    {
+        "name": "call_llm",
+        "span_id": 7789424022423491416,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 14844888006539887900,
+    },
+    {
+        "name": "agent_run [chat_robot]",
+        "span_id": 14844888006539887900,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 2943363177785645047,
+    },
+    {
+        "name": "invocation [veadk_default_app]",
+        "span_id": 2943363177785645047,
+        "trace_id": 115143748123782151752771111946932434777,
+        "start_time": 1754884660687962000,
+        "end_time": 1754884676664833000,
+        "attributes": {
+            "input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
+            "user.id": "veadk_default_user",
+            "session.id": "veadk_example_session",
+            "output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
+        },
+        "parent_span_id": None,
+    },
+]
+
 
 def test_evaluator():
     base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
@@ -68,7 +129,7 @@ def test_evaluator():
     with open(eval_set_file_path, "w") as f:
         json.dump(EVAL_SET_DATA, f)
 
-    base_evaluator.generate_eval_data(eval_set_file_path=eval_set_file_path)
+    base_evaluator.generate_eval_data(file_path=eval_set_file_path)
 
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
@@ -78,3 +139,23 @@ def test_evaluator():
     )
 
     os.remove(eval_set_file_path)
+
+
+def test_tracing_file_to_evalset():
+    """Tracing spans dumped to a JSON file load as one eval case with one invocation."""
+    base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
+
+    # save data to file
+    tracing_file_path = "./tracing_for_test_evaluator.json"
+    with open(tracing_file_path, "w") as f:
+        json.dump(TRACE_SET_DATA, f)
+
+    try:
+        base_evaluator.generate_eval_data(file_path=tracing_file_path)
+        assert len(base_evaluator.invocation_list) == 1
+        invocations = base_evaluator.invocation_list[0].invocations
+        assert len(invocations) == 1
+        assert invocations[0].invocation_id == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
+    finally:
+        # remove the fixture even when an assertion above fails
+        os.remove(tracing_file_path)
diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import json
 import time
 import uuid
 from abc import abstractmethod
@@ -24,6 +25,8 @@
 from google.genai import types
 from pydantic import BaseModel
 
+from veadk.utils.misc import formatted_timestamp
+
 
 class InvocationTestData(BaseModel):
     invocation_id: str = ""
@@ -79,15 +82,139 @@ def __init__(
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []
 
-    def load_eval_set(self, eval_set_file: str) -> list[EvalSet]:
+    def _load_eval_set(self, eval_set_file: str) -> EvalSet:
         from .eval_set_file_loader import load_eval_set_from_file
 
         return load_eval_set_from_file(eval_set_file)
 
-    def generate_eval_data(self, eval_set_file_path: str):
+    def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
+        """Build an EvalSet with a single eval case from a JSON file of tracing spans.
+
+        Spans are grouped by trace_id. In each trace, spans named "execute_tool*"
+        become tool uses and spans named "invocation*" become conversation turns;
+        the turns of all traces are collected into one eval case.
+
+        Args:
+            tracing_file: Path to a JSON file containing a list of span dicts.
+        Returns:
+            An EvalSet with id/name "default" wrapping that single eval case.
+        Raises:
+            ValueError: If the file cannot be read or is not valid JSON.
+        """
+        try:
+            with open(tracing_file, "r", encoding="utf-8") as f:
+                tracing_data = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}") from e
+        except Exception as e:
+            raise ValueError(f"Error reading file {tracing_file}: {e}") from e
+
+        # Group spans by trace_id
+        trace_groups = {}
+        for span in tracing_data:
+            trace_groups.setdefault(span["trace_id"], []).append(span)
+
+        # Convert to evalset format
+        conversation = []
+        app_name, user_id = "", ""
+        creation_timestamp = 0
+        for spans in trace_groups.values():
+            # Extract tool_uses from spans with name starting with "execute_tool"
+            tool_uses = [
+                {
+                    "id": span["attributes"].get("gen_ai.tool.call.id"),
+                    "args": json.loads(
+                        span["attributes"].get("gcp.vertex.agent.tool_call_args", "{}")
+                    ),
+                    "name": span["attributes"].get("gen_ai.tool.name"),
+                }
+                for span in spans
+                if span["name"].startswith("execute_tool")
+            ]
+
+            # Extract conversation data from spans with name starting with "invocation"
+            for span in spans:
+                if not span["name"].startswith("invocation"):
+                    continue
+                attributes = span["attributes"]
+                # input.value and output.value hold JSON documents encoded as strings
+                input_value = json.loads(attributes.get("input.value", "{}"))
+                output_value = json.loads(attributes.get("output.value", "{}"))
+
+                # new_message is a JSON string nested in input.value; the fallback
+                # must be a string as well because json.loads rejects a dict
+                user_content = json.loads(input_value.get("new_message", "{}"))
+                output_parts = (output_value.get("content") or {}).get("parts") or [{}]
+
+                # final_response is a deep copy of user_content (same part layout)
+                # whose first part carries the text of the agent's answer
+                final_response = json.loads(json.dumps(user_content))
+                if not final_response.get("parts"):
+                    final_response["parts"] = [{}]
+                final_response["parts"][0]["text"] = output_parts[0].get("text")
+                final_response["role"] = None
+
+                # Spans lacking start_time fall back to 0 instead of raising KeyError
+                creation_timestamp = span.get("start_time", 0) / 1e9
+                conversation.append(
+                    {
+                        "invocation_id": output_value.get(
+                            "invocation_id", str(uuid.uuid4())
+                        ),
+                        "user_content": user_content,
+                        "final_response": final_response,
+                        "intermediate_data": {
+                            "tool_uses": tool_uses,
+                            "intermediate_responses": [],
+                        },
+                        "creation_timestamp": creation_timestamp,
+                    }
+                )
+                user_id = input_value.get("user_id", None)
+                # Strip only the "invocation" prefix; app names may contain the word
+                app_name = span["name"][len("invocation") :].strip().strip("[]")
+
+        eval_case = {
+            "eval_id": f"veadk_eval_{formatted_timestamp()}",
+            "conversation": conversation,
+            "session_input": {"app_name": app_name, "user_id": user_id, "state": {}},
+            "creation_timestamp": creation_timestamp,
+        }
+        return EvalSet(
+            eval_set_id="default",
+            name="default",
+            description=None,
+            eval_cases=[eval_case],
+            creation_timestamp=creation_timestamp,
+        )
+
+    def generate_eval_data(self, file_path: str):
+        """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
         eval_case_data_list: list[EvalCaseData] = []
 
-        eval_cases = self.load_eval_set(eval_set_file_path).eval_cases
+        try:
+            with open(file_path, "r") as f:
+                file_content = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if isinstance(file_content, dict) and "eval_cases" in file_content:
+            eval_cases = self._load_eval_set(file_path).eval_cases
+        elif (
+            isinstance(file_content, list)
+            and len(file_content) > 0
+            and all(
+                isinstance(span, dict) and "trace_id" in span for span in file_content
+            )
+        ):
+            eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
+        else:
+            raise ValueError(
+                f"Unsupported file format in {file_path}. Please provide a valid file."
+            )
+
         for eval_case in eval_cases:
             eval_case_data = EvalCaseData(invocations=[])
             self.agent_information_list.append(