
Commit 2b5c8b2

modify qa evaluation flow (#34364)
* fix KeyError bug in chat evaluation flow
* modify supported metrics in both qa and chat evaluation
* fix KeyError bug in chat evaluation flow
* fix evaluation output column filtering
* modify qa evaluation flow:
  * remove ada_similarity
  * change default value of context and ground_truth to ""
* fix aggregation and concat_result of qa eval flow
* modify qa eval flow:
  * modify rai service request_body format
  * add data validation for safety metrics
  * fix bugs in aggregation step
1 parent fc77045 commit 2b5c8b2

File tree

13 files changed: +92 / -114 lines

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 2 additions & 7 deletions
@@ -26,9 +26,6 @@ class EvaluationMetrics:
     GPT_FLUENCY = "gpt_fluency"
     GPT_SIMILARITY = "gpt_similarity"
     F1_SCORE = "f1_score"
-    EXACT_MATCH = "exact_match"
-    ADA_SIMILARITY = "ada_similarity"
-    BERTSCORE = "bertscore"
     GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
     # These are temporary until names are fixed in metrics package
     RETRIEVAL_SCORE = "retrieval_score"
@@ -53,10 +50,6 @@ class QaMetrics:
         EvaluationMetrics.GPT_FLUENCY,
         EvaluationMetrics.GPT_SIMILARITY,
         EvaluationMetrics.F1_SCORE,
-        EvaluationMetrics.EXACT_MATCH,
-        EvaluationMetrics.ADA_SIMILARITY,
-        EvaluationMetrics.BERTSCORE,
-        EvaluationMetrics.BERTSCORE,
         EvaluationMetrics.HATE_FAIRNESS,
         EvaluationMetrics.VIOLENCE,
         EvaluationMetrics.SELF_HARM,
@@ -71,6 +64,8 @@ class ChatMetrics:
         EvaluationMetrics.GPT_RETRIEVAL_SCORE
     ]
     SUPPORTED_LIST = [
+        EvaluationMetrics.GPT_COHERENCE,
+        EvaluationMetrics.GPT_FLUENCY,
         EvaluationMetrics.GPT_GROUNDEDNESS,
         EvaluationMetrics.GPT_RELEVANCE,
         EvaluationMetrics.GPT_RETRIEVAL_SCORE,
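With this change QaMetrics.SUPPORTED_LIST no longer advertises exact_match, ada_similarity, or bertscore (the duplicated BERTSCORE entry disappears with them), while ChatMetrics.SUPPORTED_LIST gains gpt_coherence and gpt_fluency. A hypothetical guard against such a list, for illustration only (the helper below is not part of the SDK):

# Hypothetical validation helper, not part of the SDK: reject metric names that
# are absent from the relevant SUPPORTED_LIST.
def check_supported(requested_metrics, supported_list):
    unsupported = [m for m in requested_metrics if m not in supported_list]
    if unsupported:
        raise ValueError(f"Unsupported metrics: {unsupported}")

# After this commit "ada_similarity" would be rejected for QA evaluation,
# while "gpt_coherence" and "gpt_fluency" are newly accepted for chat evaluation.
check_supported(["gpt_fluency", "f1_score"], ["gpt_fluency", "f1_score", "gpt_similarity"])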

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py

Lines changed: 11 additions & 1 deletion
@@ -112,7 +112,17 @@ def calculate_metrics(self) -> Dict:
         result_metrics = pf_client.get_metrics(pf_run.name)
 
         # Drop unselected output columns
-        columns_to_drop = [col for col in result_df.columns if col.replace("outputs.", "") not in metrics]
+        #columns_to_drop = [col for col in result_df.columns if col.replace("outputs.", "").replace("_reasoning", "").replace("_score", "") not in metrics]
+        columns_to_drop = []
+        for col in result_df.columns:
+            is_col_to_delete = True
+            if col.startswith("outputs"):
+                for metric in metrics:
+                    if col.replace("outputs.", "").startswith(metric):
+                        is_col_to_delete = False
+                        break
+            if is_col_to_delete:
+                columns_to_drop.append(col)
         result_df.drop(columns_to_drop, axis=1, inplace=True)
 
         # Rename inputs/outputs columns. E.g. inputs.question -> question, outputs.gpt_fluency -> gpt_fluency
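The replaced list comprehension kept only output columns whose exact name was a selected metric, which also dropped companion columns such as per-metric reasoning. A minimal stand-alone sketch of the new prefix-based filter; the DataFrame columns and selected metric below are illustrative, not taken from a real run:

import pandas as pd

# Illustrative run output; a promptflow run yields similar "inputs.*" / "outputs.*" columns.
result_df = pd.DataFrame(columns=[
    "inputs.question",
    "outputs.gpt_fluency",
    "outputs.gpt_fluency_reasoning",
    "outputs.violence_score",
])
metrics = ["gpt_fluency"]  # metrics selected by the caller

columns_to_drop = []
for col in result_df.columns:
    is_col_to_delete = True
    if col.startswith("outputs"):
        for metric in metrics:
            # Prefix match keeps companion columns (e.g. *_reasoning) of a selected
            # metric, which the old exact-name comparison discarded.
            if col.replace("outputs.", "").startswith(metric):
                is_col_to_delete = False
                break
    if is_col_to_delete:
        columns_to_drop.append(col)

# columns_to_drop == ["inputs.question", "outputs.violence_score"]
result_df = result_df.drop(columns_to_drop, axis=1)
# Remaining columns: outputs.gpt_fluency, outputs.gpt_fluency_reasoning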

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/parse_chat.py

Lines changed: 8 additions & 2 deletions
@@ -17,7 +17,10 @@ def parse_chat(chat: list) -> dict:
         if "user" in each_turn and "assistant" in each_turn: # legacy rag-evaluation format
             question = each_turn["user"]["content"]
             answer = each_turn["assistant"]["content"]
-            retrieved_documents = each_turn["retrieved_documents"]
+            try:
+                retrieved_documents = each_turn["retrieved_documents"]
+            except KeyError:
+                retrieved_documents = None
 
             questions.append(question)
             answers.append(answer)
@@ -29,7 +32,10 @@ def parse_chat(chat: list) -> dict:
                 questions.append(content)
             elif persona == "assistant":
                 answers.append(content)
-                retrieved_documents = json.dumps(each_turn["context"]["citations"])
+                try:
+                    retrieved_documents = json.dumps(each_turn["context"]["citations"])
+                except KeyError:
+                    retrieved_documents = None
                 retrieved_documents_per_chat.append(retrieved_documents)
 
     parsed_chat["questions"] = questions

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/ada_cosine_similarity_score.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/aggregate_variants_results.py

Lines changed: 23 additions & 16 deletions
@@ -2,17 +2,22 @@
 from promptflow import tool, log_metric
 import numpy as np
 from utils import filter_metrics
+from constants import RAIService
+
 
 @tool
 def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
-    selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
-    selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
+    if selected_metrics:
+        selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
+        selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
+    else:
+        selected_safety_metrics = []
+        selected_quality_metrics = []
 
     if thresholds != [] and thresholds is not None:
         threshold = np.float16(thresholds[0])
     else:
-        threshold = RAIService.HARM_SEVERITY_THRESHOLD
-        print(threshold)
+        threshold = np.float16(RAIService.HARM_SEVERITY_THRESHOLD)
 
     aggregate_results = {}
     for result in results:
@@ -29,19 +34,21 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
                 float_val = float(metric_value)
             except Exception:
                 float_val = np.nan
-            aggregate_results[name].append(float_val)
-
+            if float_val >= 0:
+                aggregate_results[name].append(float_val)
     aggregate_output = {}
-    for name, value in aggregate_results.items():
+    for name, values in aggregate_results.items():
         metric_name = name
-        if name in selected_quality_metrics:
-            aggregate_output[metric_name] = np.nanmean(value)
-        elif name in selected_safety_metrics:
-            values = aggregate_results[name]
+        if name in selected_safety_metrics:
             metric_name = name+'_defect_rate'
-            aggregate_output[metric_name] = np.sum(values >= threshold) / len(values)
-
-        aggregate_output[metric_name] = round(aggregate_output[metric_name], 2)
+        if len(values) == 0:
+            aggregate_output[metric_name] = np.nan
+        else:
+            if name in selected_quality_metrics:
+                aggregate_output[metric_name] = round(np.nanmean(values), 2)
+            elif name in selected_safety_metrics:
+                aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
+            else:
+                aggregate_output[metric_name] = np.nan
         log_metric(metric_name, aggregate_output[metric_name])
-    return aggregate_output
-
+    return aggregate_output
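For safety metrics the aggregate is reported as a defect rate: the fraction of scored rows whose severity meets or exceeds the threshold, rounded to two decimals. Values that fail to parse or come back negative are now excluded before aggregation (reading negative values as a not-scored sentinel is an inference, not stated in the diff). A worked example with made-up scores:

import numpy as np

# Illustrative severity scores for one safety metric across rows; -1.0 stands in
# for a row the service could not score and is filtered out by the >= 0 guard.
raw_scores = [0.0, 4.0, 6.0, -1.0, 7.0]
threshold = np.float16(4)  # e.g. RAIService.HARM_SEVERITY_THRESHOLD; its exact value is not shown in this diff

values = np.array([v for v in raw_scores if v >= 0])
defect_rate = round(np.sum(values >= threshold) / len(values), 2)
print(defect_rate)  # 0.75 -> three of the four scored rows meet or exceed the threshold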

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/concat_quality_scores.py

Lines changed: 4 additions & 5 deletions
@@ -9,18 +9,17 @@ def concat_results(gpt_coherence_score: str = None,
                    gpt_fluency_score: str = None,
                    gpt_relevance_score: str = None,
                    gpt_groundedness_score: str = None,
-                   f1_score: float = None,
-                   ada_cosine_similarity: float = None) -> dict:
+                   f1_score: float = None) -> dict:
 
     load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
                  {'name': 'gpt_similarity', 'score': gpt_similarity_score},
                  {'name': 'gpt_fluency', 'score': gpt_fluency_score},
                  {'name': 'gpt_relevance', 'score': gpt_relevance_score},
                  {'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
-                 {'name': 'f1_score', 'score': f1_score},
-                 {'name': 'ada_similarity', 'score': ada_cosine_similarity}]
+                 {'name': 'f1_score', 'score': f1_score}
+                 ]
 
-    scalar_metrics = ["f1_score", "ada_similarity"]
+    scalar_metrics = ["f1_score"]
     score_list = []
     errors = []
     for item in load_list:

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/concat_results.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def default_quality_results():
 # Adding type to arguments and return value will help the system show the types properly
 # Please update the function name/signature per need
 @tool
-def concat_results(quality_results: dict = None, safety_results: dict = None) -> dict:
+def concat_results(selected_metrics: dict, quality_results: dict = None, safety_results: dict = None) -> dict:
     if quality_results:
         concated_results = quality_results.copy()
     else:

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/constants.py

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ class Metric:
         "gpt_fluency",
         "gpt_coherence",
         "gpt_relevance",
-        "f1_score",
-        "ada_similarity"
+        "f1_score"
     }
 
     # Content harm metric set

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_service_request.py

Lines changed: 3 additions & 7 deletions
@@ -1,20 +1,16 @@
 from promptflow import tool
-import json
-### helping fuction to check if the user_text is chat
-
+
 def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
 # The inputs section will change based on the arguments of the tool function, after you save the code
 # Adding type to arguments and return value will help the system show the types properly
 # Please update the function name/signature per need
 @tool
-def construct_request(question: str, answer:str, context: str, ground_truth: str, selected_metrics: dict) -> dict:
+def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
     selected_safety_metrics = selected_metrics["safety_metrics"]
     metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
-    user_text_json = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth}
-    user_text = json.dumps(user_text_json)
+    user_text = f"<Human>{question}</><System>{answer}</>"
     parsed_user_text = normalize_user_text(user_text)
     request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "content harm", "MetricList":metrics}
-    #request_body = {"UserTextList": [user_text], "AnnotationTask": annotation_task}
     return request_body
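The RAI service request now wraps the turn as <Human>...</><System>...</> text instead of a JSON blob of question, answer, context, and ground_truth, and only metrics flagged True in the selection are requested. A stand-alone sketch of the resulting payload; the selection dict and texts below are invented for illustration:

# Illustrative selection: only safety metrics flagged True are sent to the service.
selected_metrics = {"safety_metrics": {"violence": True, "self_harm": True, "hate_fairness": False}}
question = "How do I pitch this tent?"
answer = "Stake the corners first, then raise the center pole."

metrics = [m for m, selected in selected_metrics["safety_metrics"].items() if selected]
user_text = f"<Human>{question}</><System>{answer}</>".replace("'", "\\\"")
request_body = {"UserTextList": [user_text], "AnnotationTask": "content harm", "MetricList": metrics}
# request_body == {
#     "UserTextList": ["<Human>How do I pitch this tent?</><System>Stake the corners first, then raise the center pole.</>"],
#     "AnnotationTask": "content harm",
#     "MetricList": ["violence", "self_harm"],
# }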

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/flow.dag.yaml

Lines changed: 12 additions & 54 deletions
@@ -5,16 +5,15 @@ inputs:
     is_chat_input: false
   answer:
     type: string
-    default: The Alpine Explorer Tent is the most waterproof.
+    default: ""
     is_chat_input: false
   context:
     type: string
-    default: From the our product list, the alpine explorer tent is the most
-      waterproof. The Adventure Dining Tabbe has higher weight.
+    default: ""
     is_chat_input: false
   ground_truth:
     type: string
-    default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m
+    default: ""
     is_chat_input: false
   metrics:
     type: string
@@ -31,9 +30,6 @@ outputs:
   hate_fairness_score:
     type: string
     reference: ${concat_scores.output.hate_fairness_score}
-  ada_similarity:
-    type: string
-    reference: ${concat_scores.output.ada_similarity}
   gpt_relevance:
     type: string
     reference: ${concat_scores.output.gpt_relevance}
@@ -111,13 +107,12 @@
     type: code
     path: concat_quality_scores.py
   inputs:
-    ada_cosine_similarity: ${ada_similarity.output}
-    f1_score: ${f1_score.output}
     gpt_coherence_score: ${gpt_coherence.output}
+    gpt_similarity_score: ${gpt_similarity.output}
     gpt_fluency_score: ${gpt_fluency.output}
-    gpt_groundedness_score: ${gpt_groundedness.output}
     gpt_relevance_score: ${gpt_relevance.output}
-    gpt_similarity_score: ${gpt_similarity.output}
+    gpt_groundedness_score: ${gpt_groundedness.output}
+    f1_score: ${f1_score.output}
   use_variants: false
 - name: gpt_similarity
   type: llm
@@ -132,8 +127,8 @@
     presence_penalty: 0
     frequency_penalty: 0
     answer: ${inputs.answer}
-    ground_truth: ${inputs.ground_truth}
     question: ${inputs.question}
+    ground_truth: ${inputs.ground_truth}
   provider: AzureOpenAI
   connection: Default_AzureOpenAI
   api: chat
@@ -155,8 +150,8 @@
     presence_penalty: 0
     frequency_penalty: 0
     answer: ${inputs.answer}
-    context: ${inputs.context}
     question: ${inputs.question}
+    context: ${inputs.context}
   provider: AzureOpenAI
   connection: Default_AzureOpenAI
   api: chat
@@ -193,8 +188,8 @@
     type: code
     path: f1_score.py
   inputs:
-    answer: ${inputs.answer}
     ground_truth: ${inputs.ground_truth}
+    answer: ${inputs.answer}
   activate:
     when: ${validate_input.output.f1_score}
     is: true
@@ -240,44 +235,6 @@
   inputs:
     metrics: ${inputs.metrics}
   use_variants: false
-- name: embeded_ground_truth
-  type: python
-  source:
-    type: package
-    tool: promptflow.tools.embedding.embedding
-  inputs:
-    connection: Default_AzureOpenAI
-    deployment_name: text-embedding-ada-002
-    input: ${inputs.ground_truth}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
-- name: embeded_answer
-  type: python
-  source:
-    type: package
-    tool: promptflow.tools.embedding.embedding
-  inputs:
-    connection: Default_AzureOpenAI
-    deployment_name: text-embedding-ada-002
-    input: ${inputs.answer}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
-- name: ada_similarity
-  type: python
-  source:
-    type: code
-    path: ada_cosine_similarity_score.py
-  inputs:
-    a: ${embeded_ground_truth.output}
-    b: ${embeded_answer.output}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
 - name: validate_input
   type: python
   source:
@@ -296,6 +253,8 @@
     type: code
     path: validate_safety_metric_input.py
   inputs:
+    answer: ${inputs.answer}
+    question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
   use_variants: false
 - name: construct_service_request
@@ -305,8 +264,6 @@
     path: construct_service_request.py
   inputs:
     answer: ${inputs.answer}
-    context: ${inputs.context}
-    ground_truth: ${inputs.ground_truth}
     question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
   activate:
@@ -355,6 +312,7 @@
   inputs:
     quality_results: ${concat_quality_scores.output}
     safety_results: ${format_service_output.output}
+    selected_metrics: ${select_metrics.output}
   use_variants: false
 node_variants: {}
 $schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
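validate_safety_metric_input now receives question and answer alongside the selected metrics, and the flow's answer, context, and ground_truth inputs default to empty strings; together these support the data validation for safety metrics mentioned in the commit message. The validator's source is not part of this diff, so the following is only a hypothetical sketch of the kind of check it could perform:

# Hypothetical sketch, not the actual validate_safety_metric_input.py:
# only request safety annotation when a safety metric is selected and there is
# real text to score (with "" as the new default, an unset input is just empty).
def validate_safety_metric_input(selected_metrics: dict, question: str = "", answer: str = "") -> bool:
    wants_safety = any(selected_metrics.get("safety_metrics", {}).values())
    return wants_safety and bool(question.strip()) and bool(answer.strip())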
