Skip to content

Commit 54e0b69

Browse files
committed
fix: build eval set from tracing json
1 parent 445b17f commit 54e0b69

File tree

2 files changed

+98
-81
lines changed

2 files changed

+98
-81
lines changed

tests/test_evaluator.py

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -62,60 +62,61 @@
6262
TRACE_SET_DATA = [
6363
{
6464
"name": "execute_tool get_city_weather",
65-
"span_id": 5421848634094108689,
66-
"trace_id": 115143748123782151752771111946932434777,
67-
"start_time": 1754884672226444000,
68-
"end_time": 1754884672226993000,
65+
"span_id": 4497348974122733469,
66+
"trace_id": 142655176138954930885272077198014871976,
67+
"start_time": 1758158957162250000,
68+
"end_time": 1758158957162426000,
6969
"attributes": {
7070
"gen_ai.tool.name": "get_city_weather",
71-
"gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
72-
"gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
73-
"gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
71+
"gen_ai.tool.input": '{"name": "get_city_weather", "description": "Retrieves the weather information of a given city. the args must in English", "parameters": {"city": "Beijing"}}',
72+
"gen_ai.tool.output": '{"id": "call_w4bj25flpvs74zgyyiquqh5s", "name": "get_city_weather", "response": {"result": "Sunny, 25°C"}}',
7473
},
75-
"parent_span_id": 7997784243558253239,
74+
"parent_span_id": 574819447039686650,
7675
},
7776
{
7877
"name": "call_llm",
79-
"span_id": 7997784243558253239,
80-
"trace_id": 115143748123782151752771111946932434777,
78+
"span_id": 574819447039686650,
79+
"trace_id": 142655176138954930885272077198014871976,
80+
"start_time": 1758158945807630000,
81+
"end_time": 1758158957171304000,
8182
"attributes": {
82-
"session.id": "veadk_example_session",
83-
"user.id": "veadk_default_user",
83+
"gen_ai.app.name": "veadk_default_app",
84+
"gen_ai.user.id": "veadk_default_user",
85+
"gen_ai.prompt.0.role": "user",
86+
"gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
8487
},
85-
"parent_span_id": 14844888006539887900,
88+
"parent_span_id": 13789664766018020416,
8689
},
8790
{
8891
"name": "call_llm",
89-
"span_id": 7789424022423491416,
90-
"trace_id": 115143748123782151752771111946932434777,
92+
"span_id": 9007934154052797946,
93+
"trace_id": 142655176138954930885272077198014871976,
94+
"start_time": 1758158957171713000,
95+
"end_time": 1758158964035230000,
9196
"attributes": {
92-
"session.id": "veadk_example_session",
93-
"user.id": "veadk_default_user",
97+
"gen_ai.app.name": "veadk_default_app",
98+
"gen_ai.user.id": "veadk_default_user",
99+
"gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
100+
"gen_ai.completion.0.content": "The weather in Beijing is sunny with a temperature of 25°C.",
94101
},
95-
"parent_span_id": 14844888006539887900,
102+
"parent_span_id": 13789664766018020416,
96103
},
97104
{
98105
"name": "agent_run [chat_robot]",
99-
"span_id": 14844888006539887900,
100-
"trace_id": 115143748123782151752771111946932434777,
101-
"attributes": {
102-
"session.id": "veadk_example_session",
103-
"user.id": "veadk_default_user",
104-
},
105-
"parent_span_id": 2943363177785645047,
106+
"span_id": 13789664766018020416,
107+
"trace_id": 142655176138954930885272077198014871976,
108+
"start_time": 1758158945807350000,
109+
"end_time": 1758158964035291000,
110+
"attributes": {},
111+
"parent_span_id": 5589459087402275636,
106112
},
107113
{
108-
"name": "invocation [veadk_default_app]",
109-
"span_id": 2943363177785645047,
110-
"trace_id": 115143748123782151752771111946932434777,
111-
"start_time": 1754884660687962000,
112-
"end_time": 1754884676664833000,
113-
"attributes": {
114-
"input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
115-
"user.id": "veadk_default_user",
116-
"session.id": "veadk_example_session",
117-
"output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
118-
},
114+
"name": "invocation",
115+
"span_id": 5589459087402275636,
116+
"trace_id": 142655176138954930885272077198014871976,
117+
"start_time": 1758158945807233000,
118+
"end_time": 1758158964035304000,
119+
"attributes": {},
119120
"parent_span_id": None,
120121
},
121122
]
@@ -154,8 +155,8 @@ def test_tracing_file_to_evalset():
154155
assert len(base_evaluator.invocation_list) == 1
155156
assert len(base_evaluator.invocation_list[0].invocations) == 1
156157
assert (
157-
base_evaluator.invocation_list[0].invocations[0].invocation_id
158-
== "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
158+
base_evaluator.invocation_list[0].invocations[0].expected_output
159+
== "The weather in Beijing is sunny with a temperature of 25°C."
159160
)
160161

161162
os.remove(tracing_file_path)

veadk/evaluation/base_evaluator.py

Lines changed: 59 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -120,56 +120,72 @@ def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
120120
# Extract tool_uses from spans with name starting with "execute_tool"
121121
for span in spans:
122122
if span["name"].startswith("execute_tool"):
123+
# Extract tool parameters from gen_ai.tool.input
124+
tool_input_str = span["attributes"].get("gen_ai.tool.input", "{}")
125+
try:
126+
tool_input = json.loads(tool_input_str)
127+
tool_args = tool_input.get("parameters", {})
128+
except json.JSONDecodeError:
129+
tool_args = {}
130+
131+
# Extract the tool call ID from gen_ai.tool.output
132+
tool_output_str = span["attributes"].get("gen_ai.tool.output", "{}")
133+
tool_call_id = None
134+
try:
135+
tool_output = json.loads(tool_output_str)
136+
tool_call_id = tool_output.get("id", None)
137+
except json.JSONDecodeError:
138+
tool_call_id = None
139+
123140
tool_uses.append(
124141
{
125-
"id": span["attributes"].get("gen_ai.tool.call.id", None),
126-
"args": json.loads(
127-
span["attributes"].get(
128-
"gcp.vertex.agent.tool_call_args", "{}"
129-
)
130-
),
142+
"id": tool_call_id,
143+
"args": tool_args,
131144
"name": span["attributes"].get("gen_ai.tool.name", None),
132145
}
133146
)
134147

135-
# Extract conversation data from spans with name starting with "invocation"
136-
for span in spans:
137-
if span["name"].startswith("invocation"):
138-
# Parse input.value and output.value as JSON
139-
input_value = json.loads(
140-
span["attributes"].get("input.value", "{}")
141-
)
142-
output_value = json.loads(
143-
span["attributes"].get("output.value", "{}")
144-
)
148+
# Extract conversation data from call_llm spans
149+
user_input = ""
150+
final_output = ""
145151

146-
user_content = json.loads(input_value.get("new_message", {}))
147-
final_response = json.loads(json.dumps(user_content))
148-
final_response["parts"][0]["text"] = (
149-
output_value.get("content", {})
150-
.get("parts", [{}])[0]
151-
.get("text", None)
152-
)
153-
final_response["role"] = None
154-
conversation.append(
155-
{
156-
"invocation_id": output_value.get(
157-
"invocation_id", str(uuid.uuid4())
158-
),
159-
"user_content": user_content,
160-
"final_response": final_response,
161-
"intermediate_data": {
162-
"tool_uses": tool_uses,
163-
"intermediate_responses": [],
164-
},
165-
"creation_timestamp": span["start_time"] / 1e9,
166-
}
167-
)
168-
user_id = input_value.get("user_id", None)
169-
app_name = (
170-
span["name"].replace("invocation", "").strip().strip("[]")
171-
)
172-
creation_timestamp = span["start_time"] / 1e9
152+
# Find the first call_llm span for user input and the last one for final output
153+
call_llm_spans = [span for span in spans if span["name"] == "call_llm"]
154+
155+
if call_llm_spans:
156+
# Get user input from the first call_llm span
157+
first_span = call_llm_spans[0]
158+
user_input = first_span["attributes"].get("gen_ai.prompt.0.content", "")
159+
160+
# Get final output from the last call_llm span
161+
last_span = call_llm_spans[-1]
162+
final_output = last_span["attributes"].get(
163+
"gen_ai.completion.0.content", ""
164+
)
165+
166+
# Get metadata from any span
167+
app_name = first_span["attributes"].get("gen_ai.app.name", "")
168+
user_id = first_span["attributes"].get("gen_ai.user.id", "")
169+
creation_timestamp = first_span["start_time"] / 1e9
170+
171+
if user_input and final_output:
172+
# Create user_content and final_response in the expected format
173+
user_content = {"role": "user", "parts": [{"text": user_input}]}
174+
175+
final_response = {"role": "model", "parts": [{"text": final_output}]}
176+
177+
conversation.append(
178+
{
179+
"invocation_id": str(uuid.uuid4()),
180+
"user_content": user_content,
181+
"final_response": final_response,
182+
"intermediate_data": {
183+
"tool_uses": tool_uses,
184+
"intermediate_responses": [],
185+
},
186+
"creation_timestamp": creation_timestamp,
187+
}
188+
)
173189

174190
eval_cases.append(
175191
{

0 commit comments

Comments
 (0)