fix: build eval set from tracing json

BhAem · BhAem · commit 54e0b696a805 · 2025-09-18T12:16:43.000+08:00
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
@@ -62,60 +62,61 @@
 TRACE_SET_DATA = [
     {
         "name": "execute_tool get_city_weather",
-        "span_id": 5421848634094108689,
-        "trace_id": 115143748123782151752771111946932434777,
-        "start_time": 1754884672226444000,
-        "end_time": 1754884672226993000,
+        "span_id": 4497348974122733469,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158957162250000,
+        "end_time": 1758158957162426000,
         "attributes": {
             "gen_ai.tool.name": "get_city_weather",
-            "gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
-            "gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
-            "gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
+            "gen_ai.tool.input": '{"name": "get_city_weather", "description": "Retrieves the weather information of a given city. the args must in English", "parameters": {"city": "Beijing"}}',
+            "gen_ai.tool.output": '{"id": "call_w4bj25flpvs74zgyyiquqh5s", "name": "get_city_weather", "response": {"result": "Sunny, 25°C"}}',
         },
-        "parent_span_id": 7997784243558253239,
+        "parent_span_id": 574819447039686650,
     },
     {
         "name": "call_llm",
-        "span_id": 7997784243558253239,
-        "trace_id": 115143748123782151752771111946932434777,
+        "span_id": 574819447039686650,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807630000,
+        "end_time": 1758158957171304000,
         "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
+            "gen_ai.app.name": "veadk_default_app",
+            "gen_ai.user.id": "veadk_default_user",
+            "gen_ai.prompt.0.role": "user",
+            "gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
         },
-        "parent_span_id": 14844888006539887900,
+        "parent_span_id": 13789664766018020416,
     },
     {
         "name": "call_llm",
-        "span_id": 7789424022423491416,
-        "trace_id": 115143748123782151752771111946932434777,
+        "span_id": 9007934154052797946,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158957171713000,
+        "end_time": 1758158964035230000,
         "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
+            "gen_ai.app.name": "veadk_default_app",
+            "gen_ai.user.id": "veadk_default_user",
+            "gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
+            "gen_ai.completion.0.content": "The weather in Beijing is sunny with a temperature of 25°C.",
         },
-        "parent_span_id": 14844888006539887900,
+        "parent_span_id": 13789664766018020416,
     },
     {
         "name": "agent_run [chat_robot]",
-        "span_id": 14844888006539887900,
-        "trace_id": 115143748123782151752771111946932434777,
-        "attributes": {
-            "session.id": "veadk_example_session",
-            "user.id": "veadk_default_user",
-        },
-        "parent_span_id": 2943363177785645047,
+        "span_id": 13789664766018020416,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807350000,
+        "end_time": 1758158964035291000,
+        "attributes": {},
+        "parent_span_id": 5589459087402275636,
     },
     {
-        "name": "invocation [veadk_default_app]",
-        "span_id": 2943363177785645047,
-        "trace_id": 115143748123782151752771111946932434777,
-        "start_time": 1754884660687962000,
-        "end_time": 1754884676664833000,
-        "attributes": {
-            "input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
-            "user.id": "veadk_default_user",
-            "session.id": "veadk_example_session",
-            "output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
-        },
+        "name": "invocation",
+        "span_id": 5589459087402275636,
+        "trace_id": 142655176138954930885272077198014871976,
+        "start_time": 1758158945807233000,
+        "end_time": 1758158964035304000,
+        "attributes": {},
         "parent_span_id": None,
     },
 ]
@@ -154,8 +155,8 @@ def test_tracing_file_to_evalset():
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
     assert (
-        base_evaluator.invocation_list[0].invocations[0].invocation_id
-        == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
+        base_evaluator.invocation_list[0].invocations[0].expected_output
+        == "The weather in Beijing is sunny with a temperature of 25°C."
     )
 
     os.remove(tracing_file_path)
diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py
@@ -120,56 +120,72 @@ def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
             # Extract tool_uses from spans with name starting with "execute_tool"
             for span in spans:
                 if span["name"].startswith("execute_tool"):
+                    # Extract tool parameters from gen_ai.tool.input
+                    tool_input_str = span["attributes"].get("gen_ai.tool.input", "{}")
+                    try:
+                        tool_input = json.loads(tool_input_str)
+                        tool_args = tool_input.get("parameters", {})
+                    except json.JSONDecodeError:
+                        tool_args = {}
+
+                    # Extract the tool call ID from gen_ai.tool.output
+                    tool_output_str = span["attributes"].get("gen_ai.tool.output", "{}")
+                    tool_call_id = None
+                    try:
+                        tool_output = json.loads(tool_output_str)
+                        tool_call_id = tool_output.get("id", None)
+                    except json.JSONDecodeError:
+                        tool_call_id = None
+
                     tool_uses.append(
                         {
-                            "id": span["attributes"].get("gen_ai.tool.call.id", None),
-                            "args": json.loads(
-                                span["attributes"].get(
-                                    "gcp.vertex.agent.tool_call_args", "{}"
-                                )
-                            ),
+                            "id": tool_call_id,
+                            "args": tool_args,
                             "name": span["attributes"].get("gen_ai.tool.name", None),
                         }
                     )
 
-            # Extract conversation data from spans with name starting with "invocation"
-            for span in spans:
-                if span["name"].startswith("invocation"):
-                    # Parse input.value and output.value as JSON
-                    input_value = json.loads(
-                        span["attributes"].get("input.value", "{}")
-                    )
-                    output_value = json.loads(
-                        span["attributes"].get("output.value", "{}")
-                    )
+            # Extract conversation data from call_llm spans
+            user_input = ""
+            final_output = ""
 
-                    user_content = json.loads(input_value.get("new_message", {}))
-                    final_response = json.loads(json.dumps(user_content))
-                    final_response["parts"][0]["text"] = (
-                        output_value.get("content", {})
-                        .get("parts", [{}])[0]
-                        .get("text", None)
-                    )
-                    final_response["role"] = None
-                    conversation.append(
-                        {
-                            "invocation_id": output_value.get(
-                                "invocation_id", str(uuid.uuid4())
-                            ),
-                            "user_content": user_content,
-                            "final_response": final_response,
-                            "intermediate_data": {
-                                "tool_uses": tool_uses,
-                                "intermediate_responses": [],
-                            },
-                            "creation_timestamp": span["start_time"] / 1e9,
-                        }
-                    )
-                    user_id = input_value.get("user_id", None)
-                    app_name = (
-                        span["name"].replace("invocation", "").strip().strip("[]")
-                    )
-                    creation_timestamp = span["start_time"] / 1e9
+            # Find the first call_llm span for user input and the last one for final output
+            call_llm_spans = [span for span in spans if span["name"] == "call_llm"]
+
+            if call_llm_spans:
+                # Get user input from the first call_llm span
+                first_span = call_llm_spans[0]
+                user_input = first_span["attributes"].get("gen_ai.prompt.0.content", "")
+
+                # Get final output from the last call_llm span
+                last_span = call_llm_spans[-1]
+                final_output = last_span["attributes"].get(
+                    "gen_ai.completion.0.content", ""
+                )
+
+                # Get metadata from the first call_llm span
+                app_name = first_span["attributes"].get("gen_ai.app.name", "")
+                user_id = first_span["attributes"].get("gen_ai.user.id", "")
+                creation_timestamp = first_span["start_time"] / 1e9
+
+            if user_input and final_output:
+                # Create user_content and final_response in the expected format
+                user_content = {"role": "user", "parts": [{"text": user_input}]}
+
+                final_response = {"role": "model", "parts": [{"text": final_output}]}
+
+                conversation.append(
+                    {
+                        "invocation_id": str(uuid.uuid4()),
+                        "user_content": user_content,
+                        "final_response": final_response,
+                        "intermediate_data": {
+                            "tool_uses": tool_uses,
+                            "intermediate_responses": [],
+                        },
+                        "creation_timestamp": creation_timestamp,
+                    }
+                )
 
         eval_cases.append(
             {