
Commit 4c47dae

[evaluation] Evaluator param name change - question/answer -> query/response (#37462)
* question -> query
* answer -> response
* questions, answers -> queries and responses
* remove unneeded flow.tools.json files
* remove another flow.tools.json file
1 parent 9978071 commit 4c47dae
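In caller terms, the rename is a straight keyword-argument swap on the built-in evaluators. A minimal sketch against the updated API, assuming a prompt-based evaluator such as RelevanceEvaluator; the AzureOpenAIModelConfiguration field values are placeholders and its field names are assumed rather than taken from this commit:

from promptflow.core import AzureOpenAIModelConfiguration
from azure.ai.evaluation import RelevanceEvaluator

# Placeholder model configuration; substitute real endpoint/deployment/key values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-endpoint>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

relevance_eval = RelevanceEvaluator(model_config)

# Previously: relevance_eval(question=..., answer=..., context=...)
relevance_score = relevance_eval(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
)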

File tree: 64 files changed (+576 additions, -575 deletions)


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
 - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
 - The parameter name `project_scope` in content safety evaluators have been renamed to `azure_ai_project` for consistency with evaluate API and simulators.
+- Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
 
 
 ### Features Added
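Taken together with the earlier entries, a hedged sketch of what these breaking changes look like from caller code; the project identifiers are placeholders and the dictionary field names are assumed from the SDK's Azure AI project scope shape, not confirmed by this commit:

from azure.ai.evaluation import ViolenceEvaluator  # everything now lives in the root namespace

# Formerly passed as `project_scope`; now named `azure_ai_project` (field names assumed).
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

violence_eval = ViolenceEvaluator(azure_ai_project)
# Formerly: violence_eval(question=..., answer=...)
violence_score = violence_eval(query="What is the capital of France?", response="Paris.")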

sdk/evaluation/azure-ai-evaluation/README.md

Lines changed: 8 additions & 8 deletions
@@ -28,8 +28,8 @@ from promptflow.core import AzureOpenAIModelConfiguration
 from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator
 
 
-def answer_length(answer, **kwargs):
-    return {"value": len(answer)}
+def response_length(response, **kwargs):
+    return {"value": len(response)}
 
 
 if __name__ == "__main__":
@@ -46,11 +46,11 @@ if __name__ == "__main__":
 
     # Running Relevance Evaluator on single input row
     relevance_score = relevance_eval(
-        answer="The Alpine Explorer Tent is the most waterproof.",
+        response="The Alpine Explorer Tent is the most waterproof.",
         context="From the our product list,"
         " the alpine explorer tent is the most waterproof."
         " The Adventure Dining Table has higher weight.",
-        question="Which tent is the most waterproof?",
+        query="Which tent is the most waterproof?",
     )
 
     pprint(relevance_score)
@@ -66,24 +66,24 @@ if __name__ == "__main__":
    }
 
     violence_eval = ViolenceEvaluator(azure_ai_project)
-    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
+    violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
     pprint(violence_score)
     # {'violence': 'Very low',
-    #  'violence_reason': "The system's response is a straightforward factual answer "
+    #  'violence_reason': "The system's response is a straightforward factual response "
     #                     'to a geography question. There is no violent content or '
     #                     'language present.',
     #  'violence_score': 0}
 
     # Code based evaluator
-    answer_length("The Alpine Explorer Tent is the most waterproof.")
+    response_length("The Alpine Explorer Tent is the most waterproof.")
     # {'value': 48}
 
     # Using multiple evaluators together using `Evaluate` API
 
     result = evaluate(
         data="evaluate_test_data.jsonl",
         evaluators={
-            "answer_length": answer_length,
+            "response_length": response_length,
             "violence": violence_eval,
         },
     )
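The `evaluate` call in this sample reads rows from evaluate_test_data.jsonl. A small sketch of what such rows might look like after the rename, under the assumption that data columns are matched to evaluator parameters by name unless remapped via evaluator_config; the rows themselves are illustrative, not the shipped sample data:

import json

# Illustrative rows only; the real test data file ships with the sample.
rows = [
    {"query": "Which tent is the most waterproof?", "response": "The Alpine Explorer Tent is the most waterproof."},
    {"query": "What is the capital of France?", "response": "Paris."},
]
with open("evaluate_test_data.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")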

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 13 additions & 13 deletions
@@ -119,13 +119,13 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param question: The question to evaluate.
-    :type question: str
-    :param answer: The answer to evaluate.
-    :type answer: str
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
@@ -135,7 +135,7 @@ async def submit_request(question: str, answer: str, metric: str, rai_svc_url: s
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{question}</><System>{answer}</>"
+    user_text = f"<Human>{query}</><System>{response}</>"
     normalized_user_text = user_text.replace("'", '\\"')
     payload = generate_payload(normalized_user_text, metric)
 
@@ -410,14 +410,14 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
 
 
 async def evaluate_with_rai_service(
-    question: str, answer: str, metric_name: str, project_scope: dict, credential: TokenCredential
+    query: str, response: str, metric_name: str, project_scope: dict, credential: TokenCredential
 ):
-    """ "Evaluate the content safety of the answer using Responsible AI service
+    """ "Evaluate the content safety of the response using Responsible AI service
 
-    :param question: The question to evaluate.
-    :type question: str
-    :param answer: The answer to evaluate.
-    :type answer: str
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -439,7 +439,7 @@ async def evaluate_with_rai_service(
     await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(question, answer, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
     annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
     result = parse_response(annotation_response, metric_name)
 
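For reference, a standalone sketch of the annotation text that submit_request now builds from the renamed parameters; it mirrors the f-string and quote escaping shown in the hunk above, with illustrative values:

query = "What is the capital of France?"
response = "Paris."

# Same construction as the body of submit_request after the rename.
user_text = f"<Human>{query}</><System>{response}</>"
normalized_user_text = user_text.replace("'", '\\"')

print(user_text)
# <Human>What is the capital of France?</><System>Paris.</>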

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 4 additions & 4 deletions
@@ -418,13 +418,13 @@ def evaluate(
            },
            evaluator_config={
                "coherence": {
-                    "answer": "${data.answer}",
-                    "question": "${data.question}"
+                    "response": "${data.response}",
+                    "query": "${data.query}"
                },
                "relevance": {
-                    "answer": "${data.answer}",
+                    "response": "${data.response}",
                    "context": "${data.context}",
-                    "question": "${data.question}"
+                    "query": "${data.query}"
                }
            }
        )
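Because existing datasets may still carry question/answer columns, the same ${data.<column>} binding shown in this docstring can map old column names onto the new parameter names. A sketch under that assumption, with a hypothetical dataset file and a model_config assumed to be built as in the README example:

from azure.ai.evaluation import evaluate, CoherenceEvaluator

coherence_eval = CoherenceEvaluator(model_config)  # model_config assumed from the README example

result = evaluate(
    data="legacy_columns.jsonl",  # hypothetical file whose rows still use "question"/"answer"
    evaluators={"coherence": coherence_eval},
    evaluator_config={
        "coherence": {
            "query": "${data.question}",   # bind the evaluator's new `query` parameter to the old column
            "response": "${data.answer}",  # bind `response` to the old `answer` column
        },
    },
)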

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py

Lines changed: 8 additions & 8 deletions
@@ -11,9 +11,9 @@ class _AsyncBleuScoreEvaluator:
     def __init__(self):
         pass
 
-    async def __call__(self, *, answer: str, ground_truth: str, **kwargs):
+    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
         reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(answer)
+        hypothesis_tokens = nltk_tokenize(response)
 
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
@@ -39,7 +39,7 @@ class BleuScoreEvaluator:
 
         eval_fn = BleuScoreEvaluator()
         result = eval_fn(
-            answer="Tokyo is the capital of Japan.",
+            response="Tokyo is the capital of Japan.",
             ground_truth="The capital of Japan is Tokyo.")
 
     **Output format**
@@ -54,19 +54,19 @@ class BleuScoreEvaluator:
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
 
-    def __call__(self, *, answer: str, ground_truth: str, **kwargs):
+    def __call__(self, *, response: str, ground_truth: str, **kwargs):
         """
-        Evaluate the BLEU score between the answer and the ground truth.
+        Evaluate the BLEU score between the response and the ground truth.
 
-        :keyword answer: The answer to be evaluated.
-        :paramtype answer: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
         :rtype: dict
         """
         return async_run_allowing_running_loop(
-            self._async_evaluator, answer=answer, ground_truth=ground_truth, **kwargs
+            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
         )
 
     def _to_async(self):
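A minimal sketch of the NLTK call pattern this evaluator wraps, assuming sentence_bleu is applied to the tokenized pair with the method4 (NIST) smoothing shown above; nltk.word_tokenize stands in here for the SDK's internal nltk_tokenize helper:

from nltk.tokenize import word_tokenize          # requires the nltk "punkt" tokenizer data
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ground_truth = "The capital of Japan is Tokyo."
response = "Tokyo is the capital of Japan."

reference_tokens = word_tokenize(ground_truth)   # stand-in for the SDK's nltk_tokenize helper
hypothesis_tokens = word_tokenize(response)

smoothing_function = SmoothingFunction().method4  # NIST smoothing, as in the evaluator
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
print(score)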

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_chat/_chat.py

Lines changed: 13 additions & 13 deletions
@@ -107,9 +107,9 @@ def __call__(self, *, conversation, **kwargs):
         """
         self._validate_conversation(conversation)
 
-        # Extract questions, answers and contexts from conversation
-        questions = []
-        answers = []
+        # Extract queries, responses and contexts from conversation
+        queries = []
+        responses = []
         contexts = []
 
         if self._eval_last_turn:
@@ -121,16 +121,16 @@ def __call__(self, *, conversation, **kwargs):
         for each_turn in conversation_slice:
             role = each_turn["role"]
             if role == "user":
-                questions.append(each_turn["content"])
+                queries.append(each_turn["content"])
             elif role == "assistant":
-                answers.append(each_turn["content"])
+                responses.append(each_turn["content"])
                 if "context" in each_turn and "citations" in each_turn["context"]:
                     citations = json.dumps(each_turn["context"]["citations"])
                     contexts.append(citations)
 
         # Select evaluators to be used for evaluation
         compute_rag_based_metrics = True
-        if len(answers) != len(contexts):
+        if len(responses) != len(contexts):
             safe_message = (
                 "Skipping rag based metrics as we need citations or "
                 "retrieved_documents in context key of every assistant's turn"
@@ -145,15 +145,15 @@ def __call__(self, *, conversation, **kwargs):
 
         # Evaluate each turn
         per_turn_results = []
-        for turn_num in range(len(questions)):
+        for turn_num in range(len(queries)):
             current_turn_result = {}
 
             if self._parallel:
                 # Parallel execution
                 with ThreadPoolExecutor() as executor:
                     future_to_evaluator = {
                         executor.submit(
-                            self._evaluate_turn, turn_num, questions, answers, contexts, evaluator
+                            self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
                         ): evaluator
                         for evaluator in selected_evaluators
                     }
@@ -165,7 +165,7 @@ def __call__(self, *, conversation, **kwargs):
                 # Sequential execution
                 for evaluator in selected_evaluators:
                     async_evaluator = evaluator._to_async()
-                    result = self._evaluate_turn(turn_num, questions, answers, contexts, async_evaluator)
+                    result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
                     current_turn_result.update(result)
 
             per_turn_results.append(current_turn_result)
@@ -191,13 +191,13 @@ def __call__(self, *, conversation, **kwargs):
 
         return aggregated
 
-    def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
+    def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
         try:
-            question = questions[turn_num] if turn_num < len(questions) else ""
-            answer = answers[turn_num] if turn_num < len(answers) else ""
+            query = queries[turn_num] if turn_num < len(queries) else ""
+            response = responses[turn_num] if turn_num < len(responses) else ""
             context = contexts[turn_num] if turn_num < len(contexts) else ""
 
-            score = evaluator(question=question, answer=answer, context=context)
+            score = evaluator(query=query, response=response, context=context)
 
             return score
         except Exception as e:  # pylint: disable=broad-exception-caught
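For context, a sketch of the conversation shape this method walks after the rename: user turns feed `queries`, assistant turns feed `responses`, and any context.citations on an assistant turn is JSON-dumped into `contexts`. The citation fields themselves are illustrative, not prescribed by this commit:

conversation = [
    {"role": "user", "content": "Which tent is the most waterproof?"},  # appended to queries
    {
        "role": "assistant",
        "content": "The Alpine Explorer Tent is the most waterproof.",  # appended to responses
        # Illustrative citation payload; json.dumps of this list is appended to contexts.
        "context": {"citations": [{"id": "product-info-1", "content": "Alpine Explorer Tent spec sheet."}]},
    },
]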

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py

Lines changed: 10 additions & 10 deletions
@@ -48,34 +48,34 @@ def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIMode
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
     async def __call__(self, *, conversation, **kwargs):
-        # Extract questions, answers and contexts from conversation
-        questions = []
-        answers = []
+        # Extract queries, responses and contexts from conversation
+        queries = []
+        responses = []
         contexts = []
 
         for each_turn in conversation:
             role = each_turn["role"]
             if role == "user":
-                questions.append(each_turn["content"])
+                queries.append(each_turn["content"])
             elif role == "assistant":
-                answers.append(each_turn["content"])
+                responses.append(each_turn["content"])
                 if "context" in each_turn and "citations" in each_turn["context"]:
                     citations = json.dumps(each_turn["context"]["citations"])
                     contexts.append(citations)
 
         # Evaluate each turn
         per_turn_scores = []
         history = []
-        for turn_num, question in enumerate(questions):
+        for turn_num, query in enumerate(queries):
             try:
-                question = question if turn_num < len(questions) else ""
-                answer = answers[turn_num] if turn_num < len(answers) else ""
+                query = query if turn_num < len(queries) else ""
+                answer = responses[turn_num] if turn_num < len(responses) else ""
                 context = contexts[turn_num] if turn_num < len(contexts) else ""
 
-                history.append({"user": question, "assistant": answer})
+                history.append({"user": query, "assistant": answer})
 
                 llm_output = await self._flow(
-                    query=question, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+                    query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
                 )
                 score = np.nan
                 if llm_output:
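A small sketch of the running `history` list this evaluator accumulates before each prompty call under the renamed variables (values illustrative); each entry pairs a turn's query with its response, and the flow is then invoked with query=, history=, documents= as shown above:

history = [
    {
        "user": "Which tent is the most waterproof?",                     # the turn's query
        "assistant": "The Alpine Explorer Tent is the most waterproof.",  # the turn's response
    },
]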

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py

Lines changed: 14 additions & 14 deletions
@@ -43,16 +43,16 @@ def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIMode
         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
-    async def __call__(self, *, question: str, answer: str, **kwargs):
+    async def __call__(self, *, query: str, response: str, **kwargs):
         # Validate input parameters
-        question = str(question or "")
-        answer = str(answer or "")
+        query = str(query or "")
+        response = str(response or "")
 
-        if not (question.strip() and answer.strip()):
-            raise ValueError("Both 'question' and 'answer' must be non-empty strings.")
+        if not (query.strip() and response.strip()):
+            raise ValueError("Both 'query' and 'response' must be non-empty strings.")
 
         # Run the evaluation flow
-        llm_output = await self._flow(question=question, answer=answer, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
 
         score = np.nan
         if llm_output:
@@ -77,8 +77,8 @@ class CoherenceEvaluator:
 
         eval_fn = CoherenceEvaluator(model_config)
         result = eval_fn(
-            question="What is the capital of Japan?",
-            answer="The capital of Japan is Tokyo.")
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.")
 
     **Output format**
 
@@ -92,18 +92,18 @@
     def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
         self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
 
-    def __call__(self, *, question: str, answer: str, **kwargs):
+    def __call__(self, *, query: str, response: str, **kwargs):
         """
         Evaluate coherence.
 
-        :keyword question: The question to be evaluated.
-        :paramtype question: str
-        :keyword answer: The answer to be evaluated.
-        :paramtype answer: str
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
         :return: The coherence score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)
+        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
 
     def _to_async(self):
         return self._async_evaluator

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

Lines changed: 4 additions & 4 deletions
@@ -18,9 +18,9 @@ model:
    type: text
 
 inputs:
-  question:
+  query:
     type: string
-  answer:
+  response:
     type: string
 
 ---
@@ -57,6 +57,6 @@ question: What can you tell me about climate change and its effects on the envir
 answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
 stars: 5
 
-question: {{question}}
-answer: {{answer}}
+question: {{query}}
+answer: {{response}}
 stars:
