Skip to content

Commit 555560a

Browse files
authored
update built in flows pupr 0325 (#34951)
* update built-in chat flow: * add more logging in validate_service * fix parsing error in parse_groundedness_responses.py * update built-in qa flow * update qa groundedness input validation logic * add logging in validate_groundedness_service
1 parent c105d6f commit 555560a

File tree

8 files changed

+49
-39
lines changed

8 files changed

+49
-39
lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/flow.meta.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
$schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
2-
name: template_eval_flow
3-
display_name: Template Evaluation Flow
2+
name: chat_quality_safety_eval
3+
display_name: Chat Quality Safety Evaluation
44
type: evaluate
55
path: ./flow.dag.yaml
66
description: Template Evaluation Flow

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/parse_groundedness_responses.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ def parse_single_response(response: dict) -> list:
4848
else:
4949
metric_value = np.nan
5050
reasoning = ""
51-
parsed_harm_response[harm_type] = float(metric_value)
51+
try:
52+
parsed_harm_response[harm_type] = float(metric_value)
53+
except Exception:
54+
parsed_harm_response[harm_type] = np.nan
5255
parsed_harm_response[harm_type + "_reason"] = reasoning
5356
parsed_response.append(parsed_harm_response)
5457
return parsed_response

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/validate_service.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def is_service_available(flight: bool):
1818

1919
if response.status_code != 200:
2020
print("Fail to get RAI service availability in this region.")
21-
print(response.status_code)
21+
print("Response_code: %d" % response.status_code)
2222
else:
2323
available_service = response.json()
2424
if "content harm" in available_service:
@@ -27,10 +27,12 @@ def is_service_available(flight: bool):
2727
print("RAI service is not available in this region.")
2828
if "groundedness" in available_service and flight:
2929
groundedness_service = True
30-
else:
30+
if not flight:
31+
print("GroundednessServiceFlight is off.")
32+
if "groundedness" not in available_service:
3133
print("AACS service is not available in this region.")
3234
except Exception:
33-
print("Fail to get RAI service availability in this region.")
35+
print("Failed to call checkannotation endpoint.")
3436
return {"content_harm_service": content_harm_service,
3537
"groundedness_service": groundedness_service
3638
}
@@ -53,6 +55,8 @@ def is_safety_metrics_selected(selected_metrics):
5355

5456

5557
def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
58+
if not selected_metrics["rag_metrics"]["gpt_groundedness"]:
59+
print("gpt_groundedness is not selected.")
5660
return selected_metrics["rag_metrics"]["gpt_groundedness"]
5761

5862

@@ -75,8 +79,8 @@ def validate_safety_metric_input(
7579
chat: [dict],
7680
validate_chat_result: dict,
7781
flight: bool = True) -> dict:
78-
service_available = is_service_available(flight)
7982
tracking_uri_set = is_tracking_uri_set()
83+
service_available = is_service_available(flight)
8084
valid_chat = is_chat_valid(chat)
8185
groundedness_selected = is_groundedness_metric_selected(selected_metrics)
8286
content_harm_service = is_safety_metrics_selected(selected_metrics) \

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_groundedness_request.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ def normalize_user_text(user_text):
77

88

99
@tool
10-
def construct_request(question: str,
11-
answer: str,
12-
context: str) -> dict:
10+
def construct_request(answer: str,
11+
context: str,
12+
question: str = "") -> dict:
1313
metrics = ["generic_groundedness"]
1414
user_text = json.dumps({"question": question,
1515
"answer": answer,

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/flow.dag.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -295,10 +295,7 @@ nodes:
295295
type: code
296296
path: validate_groundedness_service.py
297297
inputs:
298-
answer: ${inputs.answer}
299-
context: ${inputs.context}
300298
flight: ${inputs.groundedness_service_flight}
301-
question: ${inputs.question}
302299
selected_metrics: ${select_metrics.output}
303300
validate_input_result: ${validate_input.output}
304301
use_variants: false

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/validate_groundedness_service.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from promptflow import tool
22
import mlflow
33
from mlflow.utils.rest_utils import http_request
4-
from utils import get_cred, is_valid_string
4+
from utils import get_cred
55

66

77
def is_service_available(flight: bool):
@@ -18,19 +18,23 @@ def is_service_available(flight: bool):
1818

1919
if response.status_code != 200:
2020
print("Fail to get RAI service availability in this region.")
21-
print(response.status_code)
21+
print("Response_code: %d" % response.status_code)
2222
else:
2323
available_service = response.json()
24+
# check if content harm service is available
2425
if "content harm" in available_service:
2526
content_harm_service = True
2627
else:
2728
print("Content harm service is not available in this region.")
29+
# check if groundedness service is available
2830
if "groundedness" in available_service and flight:
2931
groundedness_service = True
30-
else:
32+
if not flight:
33+
print("GroundednessServiceFlight is off.")
34+
if "groundedness" not in available_service:
3135
print("AACS service is not available in this region.")
3236
except Exception:
33-
print("Fail to get RAI service availability in this region.")
37+
print("Failed to call checkannotation endpoint.")
3438
return {"content_harm_service": content_harm_service,
3539
"groundedness_service": groundedness_service
3640
}
@@ -54,44 +58,46 @@ def is_safety_metric_selected(selected_metrics: dict) -> bool:
5458

5559

5660
def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
61+
if not selected_metrics["quality_metrics"]["gpt_groundedness"]:
62+
print("gpt_groundedness is not selected.")
5763
return selected_metrics["quality_metrics"]["gpt_groundedness"]
5864

5965

60-
def is_input_valid_for_safety_metrics(question: str, answer: str):
61-
if is_valid_string(question) and is_valid_string(answer):
62-
return True
63-
else:
64-
print("Input is not valid for safety metrics evaluation")
65-
return False
66-
67-
68-
# check if RAI service is available in this region. If not, return False.
66+
# check if RAI service is available in this region. If not, return False.
6967
# check if tracking_uri is set. If not, return False
7068
# if tracking_uri is set, check if any safety metric is selected.
7169
# if no safety metric is selected, return False
7270
@tool
7371
def validate_safety_metric_input(
7472
selected_metrics: dict,
7573
validate_input_result: dict,
76-
question: str,
77-
answer: str,
7874
flight: bool = True,
79-
context: str = None) -> dict:
80-
service_available = is_service_available(flight)
75+
) -> dict:
8176
tracking_uri_set = is_tracking_uri_set()
77+
service_available = is_service_available(flight)
78+
safety_metrics_selected = is_safety_metric_selected(selected_metrics)
79+
gpt_groundedness_selected = is_groundedness_metric_selected(
80+
selected_metrics)
8281

83-
content_harm_service = is_safety_metric_selected(selected_metrics) \
82+
content_harm_service = safety_metrics_selected \
8483
and service_available["content_harm_service"] and tracking_uri_set \
8584
and validate_input_result["safety_metrics"]
8685

87-
groundedness_service = is_groundedness_metric_selected(selected_metrics)\
86+
groundedness_service = gpt_groundedness_selected\
8887
and validate_input_result["gpt_groundedness"] and tracking_uri_set \
8988
and service_available["groundedness_service"]
9089

91-
groundedness_prompt = is_groundedness_metric_selected(selected_metrics) \
92-
and validate_input_result["gpt_groundedness"] \
90+
groundedness_prompt = gpt_groundedness_selected \
91+
and validate_input_result["gpt_groundedness"] \
9392
and (not service_available["groundedness_service"])
9493

94+
if not validate_input_result["gpt_groundedness"] \
95+
and gpt_groundedness_selected:
96+
print("Input for gpt_groundedness is not valid")
97+
98+
if not validate_input_result["safety_metrics"] and safety_metrics_selected:
99+
print("Input for safety metrics evaluation is not valid")
100+
95101
return {"content_harm_service": content_harm_service,
96102
"groundedness_service": groundedness_service,
97103
"groundedness_prompt": groundedness_prompt

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/validate_input.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def is_input_valid_for_safety_metrics(
77
if is_valid_string(question) and is_valid_string(answer):
88
return True
99
else:
10-
print("Input is not valid for safety metrics evaluation")
10+
print("Input for safety metrics evaluation is not valid")
1111
return False
1212

1313

@@ -23,8 +23,7 @@ def validate_input(question: str,
2323
"ground_truth": ground_truth}
2424
expected_input_cols = set(input_data.keys())
2525
dict_metric_required_fields = {
26-
"gpt_groundedness": set(["question",
27-
"answer",
26+
"gpt_groundedness": set(["answer",
2827
"context"]),
2928
"gpt_relevance": set(["question",
3029
"answer",
@@ -49,7 +48,7 @@ def validate_input(question: str,
4948
if metric_required_fields <= actual_input_cols:
5049
data_validation[metric] = True
5150
else:
52-
print("input for %s is not valid" % metric)
51+
print("Input for %s is not valid." % metric)
5352

5453
safety_metrics = is_input_valid_for_safety_metrics(question, answer)
5554
data_validation["safety_metrics"] = safety_metrics

sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def test_evaluate_built_in_qa_fallback_groundedness(self, e2e_openai_api_base, e
7474
e2e_openai_completion_deployment_name, tmpdir):
7575
test_data = [
7676
{"context": "Some are reported as not having been wanted at all.",
77-
"question": "are all reported as being wanted?",
77+
"question": "",
7878
"answer": "All are reported as being completely and fully wanted."
7979
},
8080
{"question": "How do you log a model?",
@@ -315,6 +315,7 @@ def test_task_type_chat(self, ai_client, e2e_openai_api_base, e2e_openai_api_key
315315
assert "gpt_groundedness" in columns_in_tabular_data
316316
assert "gpt_retrieval_score" in columns_in_tabular_data
317317
assert "evaluation_per_turn" in columns_in_tabular_data
318+
assert "messages" in columns_in_tabular_data
318319

319320
def test_task_type_chat_fallback_groundedness(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
320321
data_path = os.path.join(pathlib.Path(__file__).parent.parent.resolve(), "data")

0 commit comments

Comments
 (0)