Skip to content

Commit 7b49744

Browse files
ahibrahimm and kdestin authored
Groundedness Eval Update (#42674)
* updates * updates * updates * remove commented code * remove commented code * formats * fixes * black * skip logic for non-query calls * black * remove deprecated test * prompt update * guidelines with better variance * remove groundedness from custom cred * remove unused import * Revert "remove unused import" This reverts commit d4ea5c3. * chore: Update assets.json * chore: Update assets.json --------- Co-authored-by: kdestin <[email protected]>
1 parent 41c9c46 commit 7b49744

File tree

8 files changed

+201
-91
lines changed

8 files changed

+201
-91
lines changed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_e9fbe5cd65"
5+
"Tag": "python/evaluation/azure-ai-evaluation_d7b00f22b8"
66
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,74 @@ def reformat_tool_definitions(tool_definitions, logger=None):
659659
return tool_definitions
660660

661661

662+
def simplify_messages(messages, drop_system=True, drop_tool_calls=False, logger=None):
    """
    Simplify a list of conversation messages by keeping only role and content.
    Optionally filter out system messages and/or tool calls.

    User messages are always rewritten to ``{"role", "content"}`` where the content
    is the result of ``_extract_text_from_content``. Assistant messages are rewritten
    the same way when that extraction yields something truthy. Every other message
    that is not dropped is kept as-is.

    :param messages: List of message dicts (e.g., from query or response)
    :param drop_system: If True, remove system role messages
    :param drop_tool_calls: If True, remove tool role messages and any remaining
        message whose content list contains a ``tool_call`` item
    :param logger: Optional logger; receives a debug record if simplification fails
    :return: New simplified list of messages. Input that is not a list (including a
        plain string) is returned unchanged, as is the original input when an
        unexpected error occurs.
    """
    # Strings and any other non-list payloads are passed through untouched.
    if not isinstance(messages, list):
        return messages

    def _has_tool_call(content):
        # Only list-like content can carry tool_call items. Guarding the type keeps
        # a message with content=None from raising and aborting the whole pass.
        if not isinstance(content, (list, tuple)):
            return False
        return any(isinstance(item, dict) and item.get("type") == "tool_call" for item in content)

    try:
        simplified_msgs = []
        for msg in messages:
            # Non-dict entries cannot be inspected; keep them unchanged.
            if not isinstance(msg, dict):
                simplified_msgs.append(msg)
                continue

            role = msg.get("role")
            content = msg.get("content", [])

            # Drop system message (if should)
            if drop_system and role == "system":
                continue

            # Simplify user messages
            if role == "user":
                simplified_msgs.append({"role": role, "content": _extract_text_from_content(content)})
                continue

            # Drop tool results (if should)
            if drop_tool_calls and role == "tool":
                continue

            # Simplify assistant messages that carry text; assistant messages without
            # text fall through to the tool_call check below.
            if role == "assistant":
                simplified_content = _extract_text_from_content(content)
                if simplified_content:
                    simplified_msgs.append({"role": role, "content": simplified_content})
                    continue

            # Drop tool calls (if should)
            if drop_tool_calls and _has_tool_call(content):
                continue

            # If we reach here, it means we want to keep the message
            simplified_msgs.append(msg)

        return simplified_msgs

    except Exception as ex:
        # Best effort by design: never let simplification break an evaluation run.
        if logger:
            logger.debug(f"Error simplifying messages: {str(ex)}. Returning original messages.")
        return messages
728+
729+
662730
def upload(path: str, container_client: ContainerClient, logger=None):
663731
"""Upload files or directories to Azure Blob Storage using a container client.
664732

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939

4040
from ._conversation_aggregators import GetAggregator, GetAggregatorType
4141

42+
import copy
43+
4244
P = ParamSpec("P")
4345
T = TypeVar("T")
4446
T_EvalValue = TypeVar("T_EvalValue")
@@ -488,8 +490,12 @@ def _parse_tools_from_response(self, response):
488490
"""
489491
tool_calls = []
490492
tool_results_map = {}
491-
if isinstance(response, list):
492-
for message in response:
493+
494+
# Work on a deep copy to avoid modifying the original object
495+
response_copy = copy.deepcopy(response)
496+
497+
if isinstance(response_copy, list):
498+
for message in response_copy:
493499
# Extract tool calls from assistant messages
494500
if message.get("role") == "assistant" and isinstance(message.get("content"), list):
495501
for content_item in message.get("content"):
@@ -582,7 +588,11 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
582588
:rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
583589
"""
584590
# Convert inputs into list of evaluable inputs.
585-
eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
591+
try:
592+
eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
593+
except Exception as e:
594+
print(f"Error converting kwargs to eval_input_list: {e}")
595+
raise e
586596
per_turn_results = []
587597
# Evaluate all inputs.
588598
for eval_input in eval_input_list:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

Lines changed: 86 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# ---------------------------------------------------------
44
import os, logging
5-
from typing import Dict, List, Optional, Union
5+
from typing import Dict, List, Optional, Union, Any, Tuple
66

77
from typing_extensions import overload, override
88
from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
@@ -16,6 +16,7 @@
1616
ErrorCategory,
1717
construct_prompty_model_config,
1818
validate_model_config,
19+
simplify_messages,
1920
)
2021

2122
try:
@@ -213,6 +214,42 @@ def __call__( # pylint: disable=docstring-missing-param
213214

214215
return super().__call__(*args, **kwargs)
215216

217+
def has_context(self, eval_input: dict) -> bool:
    """
    Return True if eval_input contains a non-empty 'context' field.

    Treats None, empty strings, whitespace-only strings, empty lists, lists whose
    items are all None or blank, and the "<>" marker as no context. Any other
    truthy value counts as context.

    :param eval_input: Evaluation input mapping; only its "context" key is read.
    :return: True when usable context is present, False otherwise.
    """
    context = eval_input.get("context", None)
    if not context:
        return False
    if context == "<>":  # Special marker for no context
        return False
    if isinstance(context, list):
        # Skip None explicitly: str(None) is the non-empty string "None" and would
        # otherwise be mistaken for real context.
        return any(item is not None and str(item).strip() for item in context)
    if isinstance(context, str):
        return bool(context.strip())
    return True
232+
233+
@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
    """Run the parent evaluation, simplifying conversation messages first.

    Inputs without a "query" key are forwarded to the parent unchanged. Otherwise
    the query and response are passed through ``simplify_messages`` and a new
    input holding only "query", "response" and "context" is evaluated.

    :param eval_input: Evaluation input; "response" and "context" are required
        whenever "query" is present.
    :return: Whatever the parent ``_do_eval`` returns for the prepared input.
    """
    if "query" in eval_input:
        # Tool calls are stripped from the query only when context is available.
        drop_query_tool_calls = self.has_context(eval_input)
        eval_input = {
            "query": simplify_messages(eval_input["query"], drop_tool_calls=drop_query_tool_calls),
            "response": simplify_messages(eval_input["response"], drop_tool_calls=False),
            "context": eval_input["context"],
        }

    return await super()._do_eval(eval_input)
252+
216253
async def _real_call(self, **kwargs):
217254
"""The asynchronous call where real end-to-end evaluation logic is performed.
218255
@@ -236,57 +273,73 @@ async def _real_call(self, **kwargs):
236273
raise ex
237274

238275
def _convert_kwargs_to_eval_input(self, **kwargs):
239-
if "context" in kwargs or "conversation" in kwargs:
276+
if kwargs.get("context") or kwargs.get("conversation"):
240277
return super()._convert_kwargs_to_eval_input(**kwargs)
241-
242278
query = kwargs.get("query")
243279
response = kwargs.get("response")
244280
tool_definitions = kwargs.get("tool_definitions")
245281

246-
if not query or not response or not tool_definitions:
247-
msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
282+
if (not query) or (not response): # or not tool_definitions:
283+
msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
248284
raise EvaluationException(
249285
message=msg,
250286
blame=ErrorBlame.USER_ERROR,
251287
category=ErrorCategory.INVALID_VALUE,
252288
target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
253289
)
254-
255290
context = self._get_context_from_agent_response(response, tool_definitions)
256-
if not context:
257-
raise EvaluationException(
258-
message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
259-
blame=ErrorBlame.USER_ERROR,
260-
category=ErrorCategory.NOT_APPLICABLE,
261-
target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
262-
)
263291

264-
return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
292+
filtered_response = self._filter_file_search_results(response)
293+
return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
294+
295+
def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Filter out file_search tool results from the messages.

    A message is removed when its role is "tool" and its ``tool_call_id`` matches
    one of the ids returned by ``_get_file_search_tool_call_ids``.

    :param messages: Conversation messages. Input that is not a list (for example
        a plain string response) is returned unchanged.
    :return: A new list without the file_search tool results.
    """
    # A string response would otherwise be iterated character by character and
    # fail on ``str.get``.
    if not isinstance(messages, list):
        return messages

    # Ignore missing ids: a None entry would match every tool message that has no
    # "tool_call_id" key and remove unrelated tool results.
    file_search_ids = [
        call_id for call_id in self._get_file_search_tool_call_ids(messages) if call_id is not None
    ]

    def _is_file_search_result(msg) -> bool:
        return (
            isinstance(msg, dict)
            and msg.get("role") == "tool"
            and msg.get("tool_call_id") in file_search_ids
        )

    return [msg for msg in messages if not _is_file_search_result(msg)]
265301

266302
def _get_context_from_agent_response(self, response, tool_definitions):
303+
"""Extract context text from file_search tool results in the agent response."""
304+
NO_CONTEXT = "<>"
267305
context = ""
268306
try:
269307
logger.debug("Extracting context from response")
270308
tool_calls = self._parse_tools_from_response(response=response)
271-
logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
272-
if tool_calls:
273-
for tool_call in tool_calls:
274-
if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
275-
tool_name = tool_call.get("name")
276-
for tool in tool_definitions:
277-
if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
278-
if tool_name == "file_search":
279-
tool_result = tool_call.get("tool_result")
280-
if tool_result:
281-
for result in tool_result:
282-
content_list = result.get("content")
283-
if content_list:
284-
for content in content_list:
285-
text = content.get("text")
286-
if text:
287-
context = context + "\n" + str(text)
309+
logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
310+
311+
if not tool_calls:
312+
return NO_CONTEXT
313+
314+
context_lines = []
315+
for tool_call in tool_calls:
316+
if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
317+
continue
318+
319+
tool_name = tool_call.get("name")
320+
if tool_name != "file_search":
321+
continue
322+
323+
# Extract tool results
324+
for result in tool_call.get("tool_result", []):
325+
results = result if isinstance(result, list) else [result]
326+
for r in results:
327+
file_name = r.get("file_name", "Unknown file name")
328+
for content in r.get("content", []):
329+
text = content.get("text")
330+
if text:
331+
context_lines.append(f"{file_name}:\n- {text}---\n\n")
332+
333+
context = "\n".join(context_lines) if len(context_lines) > 0 else None
334+
288335
except Exception as ex:
289336
logger.debug(f"Error extracting context from agent response : {str(ex)}")
290-
context = ""
337+
context = None
338+
339+
context = context if context else NO_CONTEXT
340+
return context
291341

292-
return context if context else None
342+
def _get_file_search_tool_call_ids(self, query_or_response):
    """Return a list of tool_call_ids for file search tool calls.

    :param query_or_response: Messages handed to ``_parse_tools_from_response``.
    :return: The ``tool_call_id`` of every parsed tool call named "file_search",
        in parse order. An entry is None when the tool call carries no id.
    """
    tool_calls = self._parse_tools_from_response(query_or_response)
    # Skip non-dict entries, matching the guard used when extracting context from
    # the same parsed tool calls.
    return [
        tc.get("tool_call_id")
        for tc in tool_calls
        if isinstance(tc, dict) and tc.get("name") == "file_search"
    ]

0 commit comments

Comments
 (0)