Commit 163ace2

Groundedness for agents (#42298)
* Groundedness for agents
* Tests for groundedness changes
* Fixing pylint issues
* Fixing service based evaluator
* Fixing converter tests
* Fixing import issue
* Fixing formatting issues
* Updating Changelog
* Update CHANGELOG.md
* Update _version.py
* Update _groundedness.py
1 parent 7ddc4bf commit 163ace2

File tree

11 files changed (+398 -82 lines)


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 6 additions & 1 deletion
```diff
@@ -2,10 +2,15 @@
 
 ## 1.11.0 (Unreleased)
 
-### Features Added
+### Breaking Changes
 
+### Features Added
 - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
+- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
 
+### Bugs Fixed
+
+### Other Changes
 
 ## 1.10.0 (2025-07-31)
 
```
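For illustration, here is a minimal sketch of how the new agent flow might be invoked. It assumes an Azure OpenAI `model_config` and a response in the `tool_call`/`tool_result` message shape that `_base_eval.py` parses below; the endpoint, deployment, file name, and text values are placeholders, not SDK output:

```python
from azure.ai.evaluation import GroundednessEvaluator

# Hypothetical configuration; endpoint and deployment are placeholders.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
}
groundedness = GroundednessEvaluator(model_config)

query = [{"role": "user", "content": "What is the refund window?"}]
response = [
    # The agent issues a file_search tool call...
    {
        "role": "assistant",
        "content": [{
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "file_search",
            "arguments": {"queries": ["refund window"]},
        }],
    },
    # ...the tool message carries the retrieved context...
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{
            "type": "tool_result",
            "tool_result": [{"file_name": "policy.md", "content": "Refunds are accepted within 30 days."}],
        }],
    },
    # ...and the agent answers from that context.
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "You can request a refund within 30 days."}],
    },
]

result = groundedness(query=query, response=response)
print(result)  # expected keys like "groundedness" and a reason, model-dependent
```

In this flow the evaluator pulls grounding context out of the `file_search` tool result rather than requiring a separate `context` argument.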

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py

Lines changed: 27 additions & 8 deletions
```diff
@@ -243,16 +243,30 @@ def _extract_typed_messages(ai_services_messages) -> List[Message]:
             if len(single_turn.content) < 1:
                 continue
 
-            # Build the content of the text message.
-            content = {
-                "type": "text",
-                "text": single_turn.content[0].text.value,
-            }
+            content_list = []
+            # If content is a list, process all content items.
+            for content_item in single_turn.content:
+                if content_item.type == "text":
+                    content_list.append(
+                        {
+                            "type": "text",
+                            "text": content_item.text.value,
+                        }
+                    )
+                elif content_item.type == "image":
+                    content_list.append(
+                        {
+                            "type": "image",
+                            "image": {
+                                "file_id": content_item.image_file.file_id,
+                            },
+                        }
+                    )
 
             # If we have a user message, then we save it as such and since it's a human message, there is no
             # run_id associated with it.
             if single_turn.role == _USER:
-                final_messages.append(UserMessage(content=[content], createdAt=single_turn.created_at))
+                final_messages.append(UserMessage(content=content_list, createdAt=single_turn.created_at))
                 continue
 
@@ -261,7 +275,7 @@ def _extract_typed_messages(ai_services_messages) -> List[Message]:
             if single_turn.role == _AGENT:
                 # We are required to put the run_id in the assistant message.
                 final_messages.append(
-                    AssistantMessage(content=[content], run_id=single_turn.run_id, createdAt=single_turn.created_at)
+                    AssistantMessage(content=content_list, run_id=single_turn.run_id, createdAt=single_turn.created_at)
                 )
                 continue
 
@@ -791,6 +805,7 @@ def _list_run_steps_chronological(self, thread_id: str, run_id: str):
                 limit=self._AI_SERVICES_API_MAX_LIMIT,
                 order="asc",
                 after=after,
+                include=["step_details.tool_calls[*].file_search.results[*].content"],
             )
             has_more = run_steps.has_more
             after = run_steps.last_id
@@ -838,7 +853,11 @@ def _list_messages_chronological(self, thread_id: str):
     def _list_run_steps_chronological(self, thread_id: str, run_id: str):
 
         return self.project_client.agents.run_steps.list(
-            thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
+            thread_id=thread_id,
+            run_id=run_id,
+            limit=self._AI_SERVICES_API_MAX_LIMIT,
+            order="asc",
+            include=["step_details.tool_calls[*].file_search.results[*].content"],
         )
 
     def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
```
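Two changes here: message content is now converted item by item, so multimodal turns keep both text and image parts, and run-step listing requests inline `file_search` result content through the `include` path, which is what later supplies grounding context. For a turn with one text item and one image item, the new loop would build a content list shaped like this (a hand-written illustration; the values are made up):

```python
# Illustrative output of the new content loop for a mixed turn;
# the file_id and text values are invented for this example.
content_list = [
    {"type": "text", "text": "Here is the chart you asked for."},
    {"type": "image", "image": {"file_id": "assistant-file-abc123"}},
]
```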

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py

Lines changed: 2 additions & 10 deletions
```diff
@@ -330,19 +330,11 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
     # Try to retrieve it, but if we don't find anything, skip adding the message
     # Just manually converting to dicts for easy serialization for now rather than custom serializers
     if tool_call.details.type == _CODE_INTERPRETER:
-        output = tool_call.details.code_interpreter.outputs
+        output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs]
     elif tool_call.details.type == _BING_GROUNDING:
         return messages  # not supported yet from bing grounding tool
     elif tool_call.details.type == _FILE_SEARCH:
-        output = [
-            {
-                "file_id": result.file_id,
-                "file_name": result.file_name,
-                "score": result.score,
-                "content": result.content,
-            }
-            for result in tool_call.details.file_search.results
-        ]
+        output = [result.as_dict() for result in tool_call.details.file_search.results]
     elif tool_call.details.type == _AZURE_AI_SEARCH:
         output = tool_call.details.azure_ai_search["output"]
     elif tool_call.details.type == _FABRIC_DATAAGENT:
```
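Switching to `as_dict()` keeps every field the service returns for a `file_search` result, including the retrieved `content` requested via `include` above. Based on the keys in the removed hand-rolled dict, one serialized result looks roughly like this (the values, and the nested shape of `content`, are assumptions):

```python
# Keys taken from the previously hand-rolled dict; values are illustrative,
# and the nested structure of "content" is an assumption.
file_search_result = {
    "file_id": "assistant-file-abc123",
    "file_name": "policy.md",
    "score": 0.92,
    "content": [{"type": "text", "text": "Refunds are accepted within 30 days."}],
}
```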

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 113 additions & 19 deletions
```diff
@@ -170,31 +170,82 @@ async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of strings representing the names of singleton inputs.
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
         if not overloads:
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-        call_signature = inspect.signature(self.__call__)
-        singletons = []
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-            singletons.extend([p for p in params if p != "self"])
-        return singletons
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
     def _derive_conversation_converter(
         self,
@@ -206,10 +257,11 @@ def _derive_conversation_converter(
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-        include_context = "context" in self._singleton_inputs
-        include_query = "query" in self._singleton_inputs
-        include_response = "response" in self._singleton_inputs
-        include_ground_truth = "ground_truth" in self._singleton_inputs
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
-        is a valid input.
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -339,7 +391,10 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-        # Handle Singletons
-        required_singletons = remove_optional_singletons(self, singletons)
-        if all(value is not None for value in required_singletons.values()):
-            return [singletons]
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
```
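`_derive_singleton_inputs` now returns one parameter list per `__call__` overload, and `_get_matching_overload_inputs` picks the best fit for the kwargs actually provided: first the largest overload whose parameters are all present, otherwise the one with the most overlap. A self-contained sketch of that scoring, with invented overload signatures (the real method reads them from `self._singleton_inputs`):

```python
from typing import Dict, List, Optional


def match_overload(overload_inputs: List[List[str]], provided: Dict[str, object]) -> List[str]:
    """Best-match scoring mirroring _get_matching_overload_inputs (illustrative)."""
    provided_keys = {k for k, v in provided.items() if v is not None}
    best_match: Optional[List[str]] = None
    best_score = -1

    # Pass 1: prefer overloads whose parameters are all provided; largest wins.
    for inputs in overload_inputs:
        if set(inputs).issubset(provided_keys) and len(set(inputs)) > best_score:
            best_score, best_match = len(set(inputs)), inputs
    if best_match is not None:
        return best_match

    # Pass 2: otherwise take the overload with the most overlap.
    for inputs in overload_inputs:
        overlap = len(set(inputs) & provided_keys)
        if overlap > best_score:
            best_score, best_match = overlap, inputs

    # Fall back to the first overload if nothing matched at all.
    return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])


# Two hypothetical overload signatures for an evaluator:
overloads = [["query", "response", "context"], ["query", "response"]]
print(match_overload(overloads, {"query": "q", "response": "r"}))
# -> ['query', 'response']
print(match_overload(overloads, {"query": "q", "response": "r", "context": "c"}))
# -> ['query', 'response', 'context']
```

`_parse_tools_from_response` then works over the `tool_call`/`tool_result` message shape shown in the CHANGELOG example above, pairing each tool call with its result by `tool_call_id`.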

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -153,7 +153,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)
 
-        if "context" in self._singleton_inputs:
+        if "context" in self._get_all_singleton_inputs():
            context = eval_input.get("context", None)
            if context is None:
                raise EvaluationException(
```
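Because `_singleton_inputs` is now a list of per-overload parameter lists, membership checks like the one above must go through the flattened view. A tiny sketch of the equivalence (hypothetical values):

```python
# _singleton_inputs now holds one parameter list per overload, e.g.:
singleton_inputs = [["query", "response", "context"], ["query", "response"]]

# _get_all_singleton_inputs() flattens them before membership checks:
all_inputs = {name for overload in singleton_inputs for name in overload}
assert "context" in all_inputs
```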
