Skip to content

Commit 555560a

Browse files
authored
update built in flows pupr 0325 (#34951)
* update built-in chat flow: * add more logging in validate_service * fix parsing error in parse_groundedness_responses.py * update built-in qa flow * update qa groundedness input validation logic * add logging in validate_groundedness_service
1 parent c105d6f commit 555560a

File tree

8 files changed

+49
-39
lines changed

8 files changed

+49
-39
lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/flow.meta.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
$schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
2-
name: template_eval_flow
3-
display_name: Template Evaluation Flow
2+
name: chat_quality_safety_eval
3+
display_name: Chat Quality Safety Evaluation
44
type: evaluate
55
path: ./flow.dag.yaml
66
description: Template Evaluation Flow

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/parse_groundedness_responses.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ def parse_single_response(response: dict) -> list:
4848
else:
4949
metric_value = np.nan
5050
reasoning = ""
51-
parsed_harm_response[harm_type] = float(metric_value)
51+
try:
52+
parsed_harm_response[harm_type] = float(metric_value)
53+
except Exception:
54+
parsed_harm_response[harm_type] = np.nan
5255
parsed_harm_response[harm_type + "_reason"] = reasoning
5356
parsed_response.append(parsed_harm_response)
5457
return parsed_response

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/validate_service.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def is_service_available(flight: bool):
1818

1919
if response.status_code != 200:
2020
print("Fail to get RAI service availability in this region.")
21-
print(response.status_code)
21+
print("Response_code: %d" % response.status_code)
2222
else:
2323
available_service = response.json()
2424
if "content harm" in available_service:
@@ -27,10 +27,12 @@ def is_service_available(flight: bool):
2727
print("RAI service is not available in this region.")
2828
if "groundedness" in available_service and flight:
2929
groundedness_service = True
30-
else:
30+
if not flight:
31+
print("GroundednessServiceFlight is off.")
32+
if "groundedness" not in available_service:
3133
print("AACS service is not available in this region.")
3234
except Exception:
33-
print("Fail to get RAI service availability in this region.")
35+
print("Failed to call checkannotation endpoint.")
3436
return {"content_harm_service": content_harm_service,
3537
"groundedness_service": groundedness_service
3638
}
@@ -53,6 +55,8 @@ def is_safety_metrics_selected(selected_metrics):
5355

5456

5557
def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
58+
if not selected_metrics["rag_metrics"]["gpt_groundedness"]:
59+
print("gpt_groundedness is not selected.")
5660
return selected_metrics["rag_metrics"]["gpt_groundedness"]
5761

5862

@@ -75,8 +79,8 @@ def validate_safety_metric_input(
7579
chat: [dict],
7680
validate_chat_result: dict,
7781
flight: bool = True) -> dict:
78-
service_available = is_service_available(flight)
7982
tracking_uri_set = is_tracking_uri_set()
83+
service_available = is_service_available(flight)
8084
valid_chat = is_chat_valid(chat)
8185
groundedness_selected = is_groundedness_metric_selected(selected_metrics)
8286
content_harm_service = is_safety_metrics_selected(selected_metrics) \

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_groundedness_request.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ def normalize_user_text(user_text):
77

88

99
@tool
10-
def construct_request(question: str,
11-
answer: str,
12-
context: str) -> dict:
10+
def construct_request(answer: str,
11+
context: str,
12+
question: str = "") -> dict:
1313
metrics = ["generic_groundedness"]
1414
user_text = json.dumps({"question": question,
1515
"answer": answer,

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/flow.dag.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -295,10 +295,7 @@ nodes:
295295
type: code
296296
path: validate_groundedness_service.py
297297
inputs:
298-
answer: ${inputs.answer}
299-
context: ${inputs.context}
300298
flight: ${inputs.groundedness_service_flight}
301-
question: ${inputs.question}
302299
selected_metrics: ${select_metrics.output}
303300
validate_input_result: ${validate_input.output}
304301
use_variants: false

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/validate_groundedness_service.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from promptflow import tool
22
import mlflow
33
from mlflow.utils.rest_utils import http_request
4-
from utils import get_cred, is_valid_string
4+
from utils import get_cred
55

66

77
def is_service_available(flight: bool):
@@ -18,19 +18,23 @@ def is_service_available(flight: bool):
1818

1919
if response.status_code != 200:
2020
print("Fail to get RAI service availability in this region.")
21-
print(response.status_code)
21+
print("Response_code: %d" % response.status_code)
2222
else:
2323
available_service = response.json()
24+
# check if content harm service is available
2425
if "content harm" in available_service:
2526
content_harm_service = True
2627
else:
2728
print("Content harm service is not available in this region.")
29+
# check if groundedness service is available
2830
if "groundedness" in available_service and flight:
2931
groundedness_service = True
30-
else:
32+
if not flight:
33+
print("GroundednessServiceFlight is off.")
34+
if "groundedness" not in available_service:
3135
print("AACS service is not available in this region.")
3236
except Exception:
33-
print("Fail to get RAI service availability in this region.")
37+
print("Failed to call checkannotation endpoint.")
3438
return {"content_harm_service": content_harm_service,
3539
"groundedness_service": groundedness_service
3640
}
@@ -54,44 +58,46 @@ def is_safety_metric_selected(selected_metrics: dict) -> bool:
5458

5559

5660
def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
61+
if not selected_metrics["quality_metrics"]["gpt_groundedness"]:
62+
print("gpt_groundedness is not selected.")
5763
return selected_metrics["quality_metrics"]["gpt_groundedness"]
5864

5965

60-
def is_input_valid_for_safety_metrics(question: str, answer: str):
61-
if is_valid_string(question) and is_valid_string(answer):
62-
return True
63-
else:
64-
print("Input is not valid for safety metrics evaluation")
65-
return False
66-
67-
68-
# check if RAI service is available in this region. If not, return False.
66+
# check if RAI service is available in this region. If not, return False.
6967
# check if tracking_uri is set. If not, return False
7068
# if tracking_uri is set, check if any safety metric is selected.
7169
# if no safety metric is selected, return False
7270
@tool
7371
def validate_safety_metric_input(
7472
selected_metrics: dict,
7573
validate_input_result: dict,
76-
question: str,
77-
answer: str,
7874
flight: bool = True,
79-
context: str = None) -> dict:
80-
service_available = is_service_available(flight)
75+
) -> dict:
8176
tracking_uri_set = is_tracking_uri_set()
77+
service_available = is_service_available(flight)
78+
safety_metrics_selected = is_safety_metric_selected(selected_metrics)
79+
gpt_groundedness_selected = is_groundedness_metric_selected(
80+
selected_metrics)
8281

83-
content_harm_service = is_safety_metric_selected(selected_metrics) \
82+
content_harm_service = safety_metrics_selected \
8483
and service_available["content_harm_service"] and tracking_uri_set \
8584
and validate_input_result["safety_metrics"]
8685

87-
groundedness_service = is_groundedness_metric_selected(selected_metrics)\
86+
groundedness_service = gpt_groundedness_selected\
8887
and validate_input_result["gpt_groundedness"] and tracking_uri_set \
8988
and service_available["groundedness_service"]
9089

91-
groundedness_prompt = is_groundedness_metric_selected(selected_metrics) \
92-
and validate_input_result["gpt_groundedness"] \
90+
groundedness_prompt = gpt_groundedness_selected \
91+
and validate_input_result["gpt_groundedness"] \
9392
and (not service_available["groundedness_service"])
9493

94+
if not validate_input_result["gpt_groundedness"] \
95+
and gpt_groundedness_selected:
96+
print("Input for gpt_groundedness is not valid")
97+
98+
if not validate_input_result["safety_metrics"] and safety_metrics_selected:
99+
print("Input for safety metrics evaluation is not valid")
100+
95101
return {"content_harm_service": content_harm_service,
96102
"groundedness_service": groundedness_service,
97103
"groundedness_prompt": groundedness_prompt

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/validate_input.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def is_input_valid_for_safety_metrics(
77
if is_valid_string(question) and is_valid_string(answer):
88
return True
99
else:
10-
print("Input is not valid for safety metrics evaluation")
10+
print("Input for safety metrics evaluation is not valid")
1111
return False
1212

1313

@@ -23,8 +23,7 @@ def validate_input(question: str,
2323
"ground_truth": ground_truth}
2424
expected_input_cols = set(input_data.keys())
2525
dict_metric_required_fields = {
26-
"gpt_groundedness": set(["question",
27-
"answer",
26+
"gpt_groundedness": set(["answer",
2827
"context"]),
2928
"gpt_relevance": set(["question",
3029
"answer",
@@ -49,7 +48,7 @@ def validate_input(question: str,
4948
if metric_required_fields <= actual_input_cols:
5049
data_validation[metric] = True
5150
else:
52-
print("input for %s is not valid" % metric)
51+
print("Input for %s is not valid." % metric)
5352

5453
safety_metrics = is_input_valid_for_safety_metrics(question, answer)
5554
data_validation["safety_metrics"] = safety_metrics

sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def test_evaluate_built_in_qa_fallback_groundedness(self, e2e_openai_api_base, e
7474
e2e_openai_completion_deployment_name, tmpdir):
7575
test_data = [
7676
{"context": "Some are reported as not having been wanted at all.",
77-
"question": "are all reported as being wanted?",
77+
"question": "",
7878
"answer": "All are reported as being completely and fully wanted."
7979
},
8080
{"question": "How do you log a model?",
@@ -315,6 +315,7 @@ def test_task_type_chat(self, ai_client, e2e_openai_api_base, e2e_openai_api_key
315315
assert "gpt_groundedness" in columns_in_tabular_data
316316
assert "gpt_retrieval_score" in columns_in_tabular_data
317317
assert "evaluation_per_turn" in columns_in_tabular_data
318+
assert "messages" in columns_in_tabular_data
318319

319320
def test_task_type_chat_fallback_groundedness(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
320321
data_path = os.path.join(pathlib.Path(__file__).parent.parent.resolve(), "data")

0 commit comments

Comments
 (0)