
Commit 2b5c8b2

modify qa evaluation flow (#34364)
* fix KeyError bug in chat evaluation flow
* modify supported metrics in both qa and chat evaluation
* fix KeyError bug in chat evaluation flow
* fix evaluation output column filtering
* modify qa evaluation flow:
  * remove ada_similarity
  * change default value of context and ground_truth to ""
* fix aggregation and concat_result of qa eval flow
* modify qa eval flow:
  * modify rai service request_body format
  * add data validation for safety metrics
  * fix bugs in aggregation step
1 parent fc77045 commit 2b5c8b2

File tree

13 files changed: +92 / -114 lines

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 2 additions & 7 deletions
@@ -26,9 +26,6 @@ class EvaluationMetrics:
     GPT_FLUENCY = "gpt_fluency"
     GPT_SIMILARITY = "gpt_similarity"
     F1_SCORE = "f1_score"
-    EXACT_MATCH = "exact_match"
-    ADA_SIMILARITY = "ada_similarity"
-    BERTSCORE = "bertscore"
     GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
     # These are temporary until names are fixed in metrics package
     RETRIEVAL_SCORE = "retrieval_score"
@@ -53,10 +50,6 @@ class QaMetrics:
         EvaluationMetrics.GPT_FLUENCY,
         EvaluationMetrics.GPT_SIMILARITY,
         EvaluationMetrics.F1_SCORE,
-        EvaluationMetrics.EXACT_MATCH,
-        EvaluationMetrics.ADA_SIMILARITY,
-        EvaluationMetrics.BERTSCORE,
-        EvaluationMetrics.BERTSCORE,
         EvaluationMetrics.HATE_FAIRNESS,
         EvaluationMetrics.VIOLENCE,
         EvaluationMetrics.SELF_HARM,
@@ -71,6 +64,8 @@ class ChatMetrics:
         EvaluationMetrics.GPT_RETRIEVAL_SCORE
     ]
     SUPPORTED_LIST = [
+        EvaluationMetrics.GPT_COHERENCE,
+        EvaluationMetrics.GPT_FLUENCY,
         EvaluationMetrics.GPT_GROUNDEDNESS,
         EvaluationMetrics.GPT_RELEVANCE,
         EvaluationMetrics.GPT_RETRIEVAL_SCORE,
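With this change QaMetrics.SUPPORTED_LIST no longer advertises exact_match, ada_similarity, or bertscore (the duplicated BERTSCORE entry disappears with them), while ChatMetrics.SUPPORTED_LIST gains gpt_coherence and gpt_fluency. A hypothetical guard against such a list, for illustration only (the helper below is not part of the SDK):

# Hypothetical validation helper, not part of the SDK: reject metric names that
# are absent from the relevant SUPPORTED_LIST.
def check_supported(requested_metrics, supported_list):
    unsupported = [m for m in requested_metrics if m not in supported_list]
    if unsupported:
        raise ValueError(f"Unsupported metrics: {unsupported}")

# After this commit "ada_similarity" would be rejected for QA evaluation,
# while "gpt_coherence" and "gpt_fluency" are newly accepted for chat evaluation.
check_supported(["gpt_fluency", "f1_score"], ["gpt_fluency", "f1_score", "gpt_similarity"])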

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py

Lines changed: 11 additions & 1 deletion
@@ -112,7 +112,17 @@ def calculate_metrics(self) -> Dict:
         result_metrics = pf_client.get_metrics(pf_run.name)
 
         # Drop unselected output columns
-        columns_to_drop = [col for col in result_df.columns if col.replace("outputs.", "") not in metrics]
+        #columns_to_drop = [col for col in result_df.columns if col.replace("outputs.", "").replace("_reasoning", "").replace("_score", "") not in metrics]
+        columns_to_drop = []
+        for col in result_df.columns:
+            is_col_to_delete = True
+            if col.startswith("outputs"):
+                for metric in metrics:
+                    if col.replace("outputs.", "").startswith(metric):
+                        is_col_to_delete = False
+                        break
+            if is_col_to_delete:
+                columns_to_drop.append(col)
         result_df.drop(columns_to_drop, axis=1, inplace=True)
 
         # Rename inputs/outputs columns. E.g. inputs.question -> question, outputs.gpt_fluency -> gpt_fluency
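The replaced list comprehension kept only output columns whose exact name was a selected metric, which also dropped companion columns such as per-metric reasoning. A minimal stand-alone sketch of the new prefix-based filter; the DataFrame columns and selected metric below are illustrative, not taken from a real run:

import pandas as pd

# Illustrative run output; a promptflow run yields similar "inputs.*" / "outputs.*" columns.
result_df = pd.DataFrame(columns=[
    "inputs.question",
    "outputs.gpt_fluency",
    "outputs.gpt_fluency_reasoning",
    "outputs.violence_score",
])
metrics = ["gpt_fluency"]  # metrics selected by the caller

columns_to_drop = []
for col in result_df.columns:
    is_col_to_delete = True
    if col.startswith("outputs"):
        for metric in metrics:
            # Prefix match keeps companion columns (e.g. *_reasoning) of a selected
            # metric, which the old exact-name comparison discarded.
            if col.replace("outputs.", "").startswith(metric):
                is_col_to_delete = False
                break
    if is_col_to_delete:
        columns_to_drop.append(col)

# columns_to_drop == ["inputs.question", "outputs.violence_score"]
result_df = result_df.drop(columns_to_drop, axis=1)
# Remaining columns: outputs.gpt_fluency, outputs.gpt_fluency_reasoning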

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/parse_chat.py

Lines changed: 8 additions & 2 deletions
@@ -17,7 +17,10 @@ def parse_chat(chat: list) -> dict:
         if "user" in each_turn and "assistant" in each_turn: # legacy rag-evaluation format
             question = each_turn["user"]["content"]
             answer = each_turn["assistant"]["content"]
-            retrieved_documents = each_turn["retrieved_documents"]
+            try:
+                retrieved_documents = each_turn["retrieved_documents"]
+            except KeyError:
+                retrieved_documents = None
 
             questions.append(question)
             answers.append(answer)
@@ -29,7 +32,10 @@ def parse_chat(chat: list) -> dict:
                 questions.append(content)
             elif persona == "assistant":
                 answers.append(content)
-                retrieved_documents = json.dumps(each_turn["context"]["citations"])
+                try:
+                    retrieved_documents = json.dumps(each_turn["context"]["citations"])
+                except KeyError:
+                    retrieved_documents = None
                 retrieved_documents_per_chat.append(retrieved_documents)
 
     parsed_chat["questions"] = questions

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/ada_cosine_similarity_score.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/aggregate_variants_results.py

Lines changed: 23 additions & 16 deletions
@@ -2,17 +2,22 @@
 from promptflow import tool, log_metric
 import numpy as np
 from utils import filter_metrics
+from constants import RAIService
+
 
 @tool
 def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
-    selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
-    selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
+    if selected_metrics:
+        selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
+        selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
+    else:
+        selected_safety_metrics = []
+        selected_quality_metrics = []
 
     if thresholds != [] and thresholds is not None:
         threshold = np.float16(thresholds[0])
     else:
-        threshold = RAIService.HARM_SEVERITY_THRESHOLD
-        print(threshold)
+        threshold = np.float16(RAIService.HARM_SEVERITY_THRESHOLD)
 
     aggregate_results = {}
     for result in results:
@@ -29,19 +34,21 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
                 float_val = float(metric_value)
             except Exception:
                 float_val = np.nan
-            aggregate_results[name].append(float_val)
-
+            if float_val >= 0:
+                aggregate_results[name].append(float_val)
     aggregate_output = {}
-    for name, value in aggregate_results.items():
+    for name, values in aggregate_results.items():
         metric_name = name
-        if name in selected_quality_metrics:
-            aggregate_output[metric_name] = np.nanmean(value)
-        elif name in selected_safety_metrics:
-            values = aggregate_results[name]
+        if name in selected_safety_metrics:
             metric_name = name+'_defect_rate'
-            aggregate_output[metric_name] = np.sum(values >= threshold) / len(values)
-
-        aggregate_output[metric_name] = round(aggregate_output[metric_name], 2)
+        if len(values) == 0:
+            aggregate_output[metric_name] = np.nan
+        else:
+            if name in selected_quality_metrics:
+                aggregate_output[metric_name] = round(np.nanmean(values), 2)
+            elif name in selected_safety_metrics:
+                aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
+            else:
+                aggregate_output[metric_name] = np.nan
         log_metric(metric_name, aggregate_output[metric_name])
-    return aggregate_output
-
+    return aggregate_output
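For safety metrics the aggregate is reported as a defect rate: the fraction of scored rows whose severity meets or exceeds the threshold, rounded to two decimals. Values that fail to parse or come back negative are now excluded before aggregation (reading negative values as a not-scored sentinel is an inference, not stated in the diff). A worked example with made-up scores:

import numpy as np

# Illustrative severity scores for one safety metric across rows; -1.0 stands in
# for a row the service could not score and is filtered out by the >= 0 guard.
raw_scores = [0.0, 4.0, 6.0, -1.0, 7.0]
threshold = np.float16(4)  # e.g. RAIService.HARM_SEVERITY_THRESHOLD; its exact value is not shown in this diff

values = np.array([v for v in raw_scores if v >= 0])
defect_rate = round(np.sum(values >= threshold) / len(values), 2)
print(defect_rate)  # 0.75 -> three of the four scored rows meet or exceed the threshold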

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/concat_quality_scores.py

Lines changed: 4 additions & 5 deletions
@@ -9,18 +9,17 @@ def concat_results(gpt_coherence_score: str = None,
                    gpt_fluency_score: str = None,
                    gpt_relevance_score: str = None,
                    gpt_groundedness_score: str = None,
-                   f1_score: float = None,
-                   ada_cosine_similarity: float = None) -> dict:
+                   f1_score: float = None) -> dict:
 
     load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
                  {'name': 'gpt_similarity', 'score': gpt_similarity_score},
                  {'name': 'gpt_fluency', 'score': gpt_fluency_score},
                  {'name': 'gpt_relevance', 'score': gpt_relevance_score},
                  {'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
-                 {'name': 'f1_score', 'score': f1_score},
-                 {'name': 'ada_similarity', 'score': ada_cosine_similarity}]
+                 {'name': 'f1_score', 'score': f1_score}
+                 ]
 
-    scalar_metrics = ["f1_score", "ada_similarity"]
+    scalar_metrics = ["f1_score"]
     score_list = []
     errors = []
     for item in load_list:

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/concat_results.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def default_quality_results():
 # Adding type to arguments and return value will help the system show the types properly
 # Please update the function name/signature per need
 @tool
-def concat_results(quality_results: dict = None, safety_results: dict = None) -> dict:
+def concat_results(selected_metrics: dict, quality_results: dict = None, safety_results: dict = None) -> dict:
     if quality_results:
         concated_results = quality_results.copy()
     else:

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/constants.py

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ class Metric:
         "gpt_fluency",
         "gpt_coherence",
         "gpt_relevance",
-        "f1_score",
-        "ada_similarity"
+        "f1_score"
     }
 
     # Content harm metric set

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_service_request.py

Lines changed: 3 additions & 7 deletions
@@ -1,20 +1,16 @@
 from promptflow import tool
-import json
-### helping fuction to check if the user_text is chat
-
+
 def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
 # The inputs section will change based on the arguments of the tool function, after you save the code
 # Adding type to arguments and return value will help the system show the types properly
 # Please update the function name/signature per need
 @tool
-def construct_request(question: str, answer:str, context: str, ground_truth: str, selected_metrics: dict) -> dict:
+def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
     selected_safety_metrics = selected_metrics["safety_metrics"]
     metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
-    user_text_json = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth}
-    user_text = json.dumps(user_text_json)
+    user_text = f"<Human>{question}</><System>{answer}</>"
     parsed_user_text = normalize_user_text(user_text)
     request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "content harm", "MetricList":metrics}
-    #request_body = {"UserTextList": [user_text], "AnnotationTask": annotation_task}
     return request_body
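The RAI service request now wraps the turn as <Human>...</><System>...</> text instead of a JSON blob of question, answer, context, and ground_truth, and only metrics flagged True in the selection are requested. A stand-alone sketch of the resulting payload; the selection dict and texts below are invented for illustration:

# Illustrative selection: only safety metrics flagged True are sent to the service.
selected_metrics = {"safety_metrics": {"violence": True, "self_harm": True, "hate_fairness": False}}
question = "How do I pitch this tent?"
answer = "Stake the corners first, then raise the center pole."

metrics = [m for m, selected in selected_metrics["safety_metrics"].items() if selected]
user_text = f"<Human>{question}</><System>{answer}</>".replace("'", "\\\"")
request_body = {"UserTextList": [user_text], "AnnotationTask": "content harm", "MetricList": metrics}
# request_body == {
#     "UserTextList": ["<Human>How do I pitch this tent?</><System>Stake the corners first, then raise the center pole.</>"],
#     "AnnotationTask": "content harm",
#     "MetricList": ["violence", "self_harm"],
# }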

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/flow.dag.yaml

Lines changed: 12 additions & 54 deletions
@@ -5,16 +5,15 @@ inputs:
     is_chat_input: false
   answer:
     type: string
-    default: The Alpine Explorer Tent is the most waterproof.
+    default: ""
     is_chat_input: false
   context:
     type: string
-    default: From the our product list, the alpine explorer tent is the most
-      waterproof. The Adventure Dining Tabbe has higher weight.
+    default: ""
     is_chat_input: false
   ground_truth:
     type: string
-    default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m
+    default: ""
     is_chat_input: false
   metrics:
     type: string
@@ -31,9 +30,6 @@ outputs:
   hate_fairness_score:
     type: string
     reference: ${concat_scores.output.hate_fairness_score}
-  ada_similarity:
-    type: string
-    reference: ${concat_scores.output.ada_similarity}
   gpt_relevance:
     type: string
     reference: ${concat_scores.output.gpt_relevance}
@@ -111,13 +107,12 @@
     type: code
     path: concat_quality_scores.py
   inputs:
-    ada_cosine_similarity: ${ada_similarity.output}
-    f1_score: ${f1_score.output}
     gpt_coherence_score: ${gpt_coherence.output}
+    gpt_similarity_score: ${gpt_similarity.output}
     gpt_fluency_score: ${gpt_fluency.output}
-    gpt_groundedness_score: ${gpt_groundedness.output}
     gpt_relevance_score: ${gpt_relevance.output}
-    gpt_similarity_score: ${gpt_similarity.output}
+    gpt_groundedness_score: ${gpt_groundedness.output}
+    f1_score: ${f1_score.output}
   use_variants: false
 - name: gpt_similarity
   type: llm
@@ -132,8 +127,8 @@
     presence_penalty: 0
     frequency_penalty: 0
     answer: ${inputs.answer}
-    ground_truth: ${inputs.ground_truth}
     question: ${inputs.question}
+    ground_truth: ${inputs.ground_truth}
   provider: AzureOpenAI
   connection: Default_AzureOpenAI
   api: chat
@@ -155,8 +150,8 @@
     presence_penalty: 0
     frequency_penalty: 0
     answer: ${inputs.answer}
-    context: ${inputs.context}
     question: ${inputs.question}
+    context: ${inputs.context}
   provider: AzureOpenAI
   connection: Default_AzureOpenAI
   api: chat
@@ -193,8 +188,8 @@
     type: code
     path: f1_score.py
   inputs:
-    answer: ${inputs.answer}
     ground_truth: ${inputs.ground_truth}
+    answer: ${inputs.answer}
   activate:
     when: ${validate_input.output.f1_score}
     is: true
@@ -240,44 +235,6 @@
   inputs:
     metrics: ${inputs.metrics}
   use_variants: false
-- name: embeded_ground_truth
-  type: python
-  source:
-    type: package
-    tool: promptflow.tools.embedding.embedding
-  inputs:
-    connection: Default_AzureOpenAI
-    deployment_name: text-embedding-ada-002
-    input: ${inputs.ground_truth}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
-- name: embeded_answer
-  type: python
-  source:
-    type: package
-    tool: promptflow.tools.embedding.embedding
-  inputs:
-    connection: Default_AzureOpenAI
-    deployment_name: text-embedding-ada-002
-    input: ${inputs.answer}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
-- name: ada_similarity
-  type: python
-  source:
-    type: code
-    path: ada_cosine_similarity_score.py
-  inputs:
-    a: ${embeded_ground_truth.output}
-    b: ${embeded_answer.output}
-  activate:
-    when: ${validate_input.output.ada_similarity}
-    is: true
-  use_variants: false
 - name: validate_input
   type: python
   source:
@@ -296,6 +253,8 @@
     type: code
     path: validate_safety_metric_input.py
   inputs:
+    answer: ${inputs.answer}
+    question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
   use_variants: false
 - name: construct_service_request
@@ -305,8 +264,6 @@
     path: construct_service_request.py
   inputs:
     answer: ${inputs.answer}
-    context: ${inputs.context}
-    ground_truth: ${inputs.ground_truth}
     question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
   activate:
@@ -355,6 +312,7 @@
   inputs:
     quality_results: ${concat_quality_scores.output}
     safety_results: ${format_service_output.output}
+    selected_metrics: ${select_metrics.output}
   use_variants: false
 node_variants: {}
 $schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
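validate_safety_metric_input now receives question and answer alongside the selected metrics, and the flow's answer, context, and ground_truth inputs default to empty strings; together these support the data validation for safety metrics mentioned in the commit message. The validator's source is not part of this diff, so the following is only a hypothetical sketch of the kind of check it could perform:

# Hypothetical sketch, not the actual validate_safety_metric_input.py:
# only request safety annotation when a safety metric is selected and there is
# real text to score (with "" as the new default, an unset input is just empty).
def validate_safety_metric_input(selected_metrics: dict, question: str = "", answer: str = "") -> bool:
    wants_safety = any(selected_metrics.get("safety_metrics", {}).values())
    return wants_safety and bool(question.strip()) and bool(answer.strip())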
