Skip to content

Commit 21a564e

Browse files
authored
feat: parsing the tracing file into evaluation cases (#44)
* feat: parsing the tracing file into evaluation cases
* feat: parsing the tracing file into evaluation cases
* fix: shorten tracing samples in the test
* fix: manage evalset and tracing file uniformly in one function
* fix: modify the function name

---------

Co-authored-by: wuqingfu.528 <[email protected]>
1 parent c898dda commit 21a564e

File tree

2 files changed

+212
-4
lines changed

2 files changed

+212
-4
lines changed

tests/test_evaluator.py

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,67 @@
5959
],
6060
}
6161

62+
# Tracing fixture: five span dicts that all share one trace_id.
# The spans form a single parent chain via parent_span_id:
#   invocation [veadk_default_app] (root, parent_span_id is None)
#     -> agent_run [chat_robot]
#       -> call_llm (two sibling spans)
#         -> execute_tool get_city_weather (child of the first call_llm)
# Only the execute_tool and invocation spans carry start_time/end_time and
# the JSON-encoded payload attributes; the others hold session/user ids only.
TRACE_SET_DATA = [
63+
{
64+
"name": "execute_tool get_city_weather",
65+
"span_id": 5421848634094108689,
66+
"trace_id": 115143748123782151752771111946932434777,
67+
"start_time": 1754884672226444000,
68+
"end_time": 1754884672226993000,
69+
"attributes": {
70+
"gen_ai.tool.name": "get_city_weather",
71+
"gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
72+
"gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
73+
"gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
74+
},
75+
"parent_span_id": 7997784243558253239,
76+
},
77+
{
78+
"name": "call_llm",
79+
"span_id": 7997784243558253239,
80+
"trace_id": 115143748123782151752771111946932434777,
81+
"attributes": {
82+
"session.id": "veadk_example_session",
83+
"user.id": "veadk_default_user",
84+
},
85+
"parent_span_id": 14844888006539887900,
86+
},
87+
{
88+
"name": "call_llm",
89+
"span_id": 7789424022423491416,
90+
"trace_id": 115143748123782151752771111946932434777,
91+
"attributes": {
92+
"session.id": "veadk_example_session",
93+
"user.id": "veadk_default_user",
94+
},
95+
"parent_span_id": 14844888006539887900,
96+
},
97+
{
98+
"name": "agent_run [chat_robot]",
99+
"span_id": 14844888006539887900,
100+
"trace_id": 115143748123782151752771111946932434777,
101+
"attributes": {
102+
"session.id": "veadk_example_session",
103+
"user.id": "veadk_default_user",
104+
},
105+
"parent_span_id": 2943363177785645047,
106+
},
107+
{
108+
"name": "invocation [veadk_default_app]",
109+
"span_id": 2943363177785645047,
110+
"trace_id": 115143748123782151752771111946932434777,
111+
"start_time": 1754884660687962000,
112+
"end_time": 1754884676664833000,
113+
"attributes": {
114+
"input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
115+
"user.id": "veadk_default_user",
116+
"session.id": "veadk_example_session",
117+
"output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
118+
},
119+
"parent_span_id": None,
120+
},
121+
]
122+
62123

63124
def test_evaluator():
64125
base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
@@ -68,7 +129,7 @@ def test_evaluator():
68129
with open(eval_set_file_path, "w") as f:
69130
json.dump(EVAL_SET_DATA, f)
70131

71-
base_evaluator.generate_eval_data(eval_set_file_path=eval_set_file_path)
132+
base_evaluator.generate_eval_data(file_path=eval_set_file_path)
72133

73134
assert len(base_evaluator.invocation_list) == 1
74135
assert len(base_evaluator.invocation_list[0].invocations) == 1
@@ -78,3 +139,23 @@ def test_evaluator():
78139
)
79140

80141
os.remove(eval_set_file_path)
142+
143+
144+
def test_tracing_file_to_evalset():
    """A tracing dump is converted into one eval case holding one invocation.

    ``TRACE_SET_DATA`` is written to a JSON file, fed to
    ``BaseEvaluator.generate_eval_data`` and the resulting
    ``invocation_list`` is checked, including the invocation id taken from
    the trace's ``output.value`` payload.
    """
    base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")

    tracing_file_path = "./tracing_for_test_evaluator.json"
    try:
        # save data to file
        with open(tracing_file_path, "w") as f:
            json.dump(TRACE_SET_DATA, f)

        base_evaluator.generate_eval_data(file_path=tracing_file_path)

        assert len(base_evaluator.invocation_list) == 1
        assert len(base_evaluator.invocation_list[0].invocations) == 1
        assert (
            base_evaluator.invocation_list[0].invocations[0].invocation_id
            == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
        )
    finally:
        # Clean up even when parsing or an assertion fails, so a red run
        # does not leave the fixture file behind in the working directory.
        if os.path.exists(tracing_file_path):
            os.remove(tracing_file_path)

veadk/evaluation/base_evaluator.py

Lines changed: 130 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515

16+
import json
1617
import time
1718
import uuid
1819
from abc import abstractmethod
@@ -24,6 +25,8 @@
2425
from google.genai import types
2526
from pydantic import BaseModel
2627

28+
from veadk.utils.misc import formatted_timestamp
29+
2730

2831
class InvocationTestData(BaseModel):
2932
invocation_id: str = ""
@@ -79,15 +82,139 @@ def __init__(
7982
self.result_list: list[EvalResultData] = []
8083
self.agent_information_list: list[dict] = []
8184

82-
def load_eval_set(self, eval_set_file: str) -> list[EvalSet]:
85+
def _load_eval_set(self, eval_set_file: str) -> EvalSet:
    """Load the eval set stored at ``eval_set_file``.

    Delegates to ``load_eval_set_from_file`` from the sibling
    ``eval_set_file_loader`` module and returns its result unchanged.
    """
    # The loader is imported at call time, inside the method body.
    from .eval_set_file_loader import load_eval_set_from_file

    loaded_eval_set = load_eval_set_from_file(eval_set_file)
    return loaded_eval_set
8689

87-
def generate_eval_data(self, eval_set_file_path: str):
90+
def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
    """Convert a JSON file of tracing spans into an ``EvalSet``.

    The file must contain a JSON list of span dicts. Spans are grouped by
    ``trace_id`` and each trace becomes one eval case:

    * spans whose name starts with ``"execute_tool"`` supply the tool calls
      (id, name and JSON-decoded arguments);
    * spans whose name starts with ``"invocation"`` supply one conversation
      turn each (user message, final response text, invocation id).

    Args:
        tracing_file: Path of the tracing dump to read.

    Returns:
        An ``EvalSet`` whose id and name are ``"default"``, holding one eval
        case per trace. Its ``creation_timestamp`` is the start time (in
        seconds) of the last invocation span processed, or ``0`` if none.

    Raises:
        ValueError: If the file cannot be read or does not contain valid
            JSON. JSON errors in the span payload attributes surface as
            ``json.JSONDecodeError``, which is itself a ``ValueError``.
    """
    try:
        with open(tracing_file, "r", encoding="utf-8") as f:
            tracing_data = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}") from e
    except Exception as e:
        raise ValueError(f"Error reading file {tracing_file}: {e}") from e

    # Group spans by trace_id; dict insertion order keeps the file order.
    trace_groups: dict = {}
    for span in tracing_data:
        trace_groups.setdefault(span["trace_id"], []).append(span)

    # Convert to evalset format
    eval_cases = []
    creation_timestamp = 0
    for spans in trace_groups.values():
        # Per-trace state. These must be reset for every trace, otherwise
        # each eval case would share one conversation list and inherit the
        # app name / user id / timestamp of the previous trace.
        conversation = []
        app_name, user_id = "", ""
        case_timestamp = 0

        # Extract tool_uses from spans with name starting with "execute_tool"
        tool_uses = []
        for span in spans:
            if not span["name"].startswith("execute_tool"):
                continue
            attributes = span.get("attributes") or {}
            tool_uses.append(
                {
                    "id": attributes.get("gen_ai.tool.call.id", None),
                    "args": json.loads(
                        attributes.get("gcp.vertex.agent.tool_call_args", "{}")
                    ),
                    "name": attributes.get("gen_ai.tool.name", None),
                }
            )

        # Extract conversation data from spans with name starting with "invocation"
        for span in spans:
            span_name = span["name"]
            if not span_name.startswith("invocation"):
                continue
            attributes = span.get("attributes") or {}

            # input.value and output.value are JSON documents stored as strings.
            input_value = json.loads(attributes.get("input.value", "{}"))
            output_value = json.loads(attributes.get("output.value", "{}"))

            # new_message is itself a JSON string nested inside input.value;
            # the fallback must therefore be a string, not a dict.
            user_content = json.loads(input_value.get("new_message", "{}"))

            output_parts = (output_value.get("content") or {}).get("parts") or [{}]
            response_text = output_parts[0].get("text", None)

            # The final response reuses the shape of the user message (deep
            # copy via a JSON round trip) with the text of its first part
            # replaced by the model output and the role cleared.
            # NOTE(review): any additional user parts are carried over
            # unchanged, as before — confirm this is intended.
            final_response = json.loads(json.dumps(user_content))
            response_parts = final_response.setdefault("parts", [])
            if not response_parts:
                response_parts.append({})
            response_parts[0]["text"] = response_text
            final_response["role"] = None

            case_timestamp = span.get("start_time", 0) / 1e9
            conversation.append(
                {
                    "invocation_id": output_value.get(
                        "invocation_id", str(uuid.uuid4())
                    ),
                    "user_content": user_content,
                    "final_response": final_response,
                    "intermediate_data": {
                        "tool_uses": tool_uses,
                        "intermediate_responses": [],
                    },
                    "creation_timestamp": case_timestamp,
                }
            )
            user_id = input_value.get("user_id", None)
            # Drop only the leading "invocation" marker so an app name that
            # happens to contain that word is left intact.
            app_name = span_name[len("invocation"):].strip().strip("[]")
            creation_timestamp = case_timestamp

        eval_cases.append(
            {
                # NOTE(review): uniqueness across traces depends on the
                # resolution of formatted_timestamp() — confirm.
                "eval_id": f"veadk_eval_{formatted_timestamp()}",
                "conversation": conversation,
                "session_input": {
                    "app_name": app_name,
                    "user_id": user_id,
                    "state": {},
                },
                "creation_timestamp": case_timestamp,
            }
        )

    evalset = EvalSet(
        eval_set_id="default",
        name="default",
        description=None,
        eval_cases=eval_cases,
        creation_timestamp=creation_timestamp,
    )

    return evalset
191+
def generate_eval_data(self, file_path: str):
192+
"""Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
88193
eval_case_data_list: list[EvalCaseData] = []
89194

90-
eval_cases = self.load_eval_set(eval_set_file_path).eval_cases
195+
try:
196+
with open(file_path, "r") as f:
197+
file_content = json.load(f)
198+
except json.JSONDecodeError as e:
199+
raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
200+
except Exception as e:
201+
raise ValueError(f"Error reading file {file_path}: {e}")
202+
203+
if isinstance(file_content, dict) and "eval_cases" in file_content:
204+
eval_cases = self._load_eval_set(file_path).eval_cases
205+
elif (
206+
isinstance(file_content, list)
207+
and len(file_content) > 0
208+
and all(
209+
isinstance(span, dict) and "trace_id" in span for span in file_content
210+
)
211+
):
212+
eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
213+
else:
214+
raise ValueError(
215+
f"Unsupported file format in {file_path}. Please provide a valid file."
216+
)
217+
91218
for eval_case in eval_cases:
92219
eval_case_data = EvalCaseData(invocations=[])
93220
self.agent_information_list.append(

0 commit comments

Comments
 (0)