
Commit 53ba5aa

modify built-in chat eval flow: (#34511)
* modify rai service request format
* fix aggregation logic
* add input validation for rai service
* fix bug in validate_conversation of chat eval flow
1 parent b71f012 commit 53ba5aa

7 files changed: +139 −65 lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/aggregate.py

Lines changed: 23 additions & 14 deletions
@@ -7,17 +7,22 @@
 @tool
 def aggregate_variants_results(results: List[dict], metrics: List[dict],
-                               thresholds = None):
-    rag_metrics = filter_metrics(metrics[0]["rag_metrics"])
-    non_rag_metrics = filter_metrics(metrics[0]["non_rag_metrics"])
-    safety_metrics = filter_metrics(metrics[0]["safety_metrics"])
+                               thresholds = None):
+    if metrics:
+        rag_metrics = filter_metrics(metrics[0]["rag_metrics"])
+        non_rag_metrics = filter_metrics(metrics[0]["non_rag_metrics"])
+        safety_metrics = filter_metrics(metrics[0]["safety_metrics"])
+    else:
+        rag_metrics = []
+        non_rag_metrics = []
+        safety_metrics = []
     selected_quality_metrics = rag_metrics + non_rag_metrics
 
     # set threshold for safety metric defect_rate
     if thresholds != [] and thresholds is not None:
         threshold = np.float16(thresholds[0])
     else:
-        threshold = RAIService.HARM_SEVERITY_THRESHOLD
+        threshold = np.float16(RAIService.HARM_SEVERITY_THRESHOLD)
 
     aggregate_results = {}
     for result in results:
@@ -32,18 +37,22 @@ def aggregate_variants_results(results: List[dict], metrics: List[dict],
             float_val = float(metric_value)
         except Exception:
             float_val = np.nan
-        aggregate_results[name].append(float_val)
+        if float_val >= 0:
+            aggregate_results[name].append(float_val)
     aggregate_output = {}
 
-    for name, value in aggregate_results.items():
+    for name, values in aggregate_results.items():
         metric_name = name
-        if name in selected_quality_metrics:
-            aggregate_output[metric_name] = np.nanmean(value)
-        elif name in safety_metrics:
-            values = aggregate_results[name]
+        if name in safety_metrics:
             metric_name = name+'_defect_rate'
-            aggregate_output[metric_name] = np.sum(values >= threshold) / len(values)
-
-        aggregate_output[metric_name] = round(aggregate_output[metric_name], 2)
+        if len(values) == 0:
+            aggregate_output[metric_name] = np.nan
+        else:
+            if name in selected_quality_metrics:
+                aggregate_output[metric_name] = round(np.nanmean(values), 2)
+            elif name in safety_metrics:
+                aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
+            else:
+                aggregate_output[metric_name] = np.nan
         log_metric(metric_name, aggregate_output[metric_name])
     return aggregate_output
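For context, a minimal, self-contained sketch of the aggregation behavior after this change (metric names, scores, and the threshold value below are hypothetical; the real flow gets these from upstream nodes and RAIService.HARM_SEVERITY_THRESHOLD): quality metrics are averaged with NaNs ignored, safety metrics become a defect rate against the threshold, and NaN or negative sentinel scores are dropped before aggregation.

import numpy as np

results = [
    {"gpt_coherence": 4.0, "violence": 0.0},
    {"gpt_coherence": np.nan, "violence": 5.0},
    {"gpt_coherence": -1.0, "violence": 2.0},
]
quality_metrics, safety_metrics = ["gpt_coherence"], ["violence"]
threshold = np.float16(4)  # stand-in for RAIService.HARM_SEVERITY_THRESHOLD (actual value not shown here)

aggregated = {}
for name in quality_metrics + safety_metrics:
    # mirrors the new "float_val >= 0" filter: NaN and negative sentinel scores are skipped
    values = [r[name] for r in results if r[name] >= 0]
    key = name + "_defect_rate" if name in safety_metrics else name
    if not values:
        aggregated[key] = np.nan
    elif name in quality_metrics:
        aggregated[key] = round(float(np.nanmean(values)), 2)
    else:
        aggregated[key] = round(float(np.sum(np.array(values) >= threshold)) / len(values), 2)

print(aggregated)  # {'gpt_coherence': 4.0, 'violence_defect_rate': 0.33}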

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/concatenate_metrics.py

Lines changed: 10 additions & 3 deletions
@@ -7,8 +7,12 @@ def format_rag_results(rag_results: dict, supported_metrics):
     result_per_turn = {}
     if rag_results:
         for metric, value in rag_results['artifacts'].items():
-            result_per_chat[metric] = rag_results['metrics']["mean_" + metric]
-            result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']}
+            try:
+                result_per_chat[metric] = rag_results['metrics']["mean_" + metric]
+                result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']}
+            except KeyError:
+                result_per_chat[metric] = np.nan
+                result_per_turn[metric] = np.nan
     for metric in supported_metrics:
         if metric not in result_per_turn:
             result_per_chat[metric] = np.nan
@@ -21,7 +25,10 @@ def format_non_rag_results(non_rag_results: dict, supported_metrics):
     result_per_turn = {}
     if non_rag_results:
         for metric in non_rag_results['artifacts']:
-            result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric]
+            try:
+                result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric]
+            except KeyError:
+                result_per_chat[metric] = np.nan
         result_per_turn = non_rag_results['artifacts']
     for metric in supported_metrics:
         if metric not in result_per_turn:
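A small illustration of the new fallback (hypothetical metric name and payload): when the per-chat "mean_&lt;metric&gt;" key is missing from the results dict, the formatter now records NaN instead of raising and failing the node.

import numpy as np

rag_results = {"artifacts": {"gpt_groundedness": {"reason": ["ok"], "score_per_turn": [5]}},
               "metrics": {}}  # no "mean_gpt_groundedness" key present

result_per_chat = {}
for metric, value in rag_results["artifacts"].items():
    try:
        result_per_chat[metric] = rag_results["metrics"]["mean_" + metric]
    except KeyError:
        result_per_chat[metric] = np.nan  # previously this lookup raised

print(result_per_chat)  # {'gpt_groundedness': nan}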

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/construct_service_request.py

Lines changed: 7 additions & 2 deletions
@@ -7,11 +7,16 @@ def parse_chat(user_text: list):
         try:
             role = turn["role"]
             content = turn["content"]
-            content_str = "<" + role + ">" + content + "</>\n"
+            if role == "user":
+                content_str = "<Human>" + content + "</>\n"
+            elif role == "assistant":
+                content_str = "<System>" + content + "</>\n"
+            else:
+                content_str = "<" + role + ">" + content + "</>\n"
         except KeyError:
             content_str = json.dumps(turn) + "\n"
         parsed_chat.append(content_str)
-    return "{\"conversation\": \"" + "".join(parsed_chat) + "\"}"
+    return "".join(parsed_chat)
 
 
 def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
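A sketch of what parse_chat now produces for the RAI service request (turn contents are hypothetical): user and assistant turns are tagged as &lt;Human&gt; and &lt;System&gt;, and the result is returned as plain tagged text rather than being wrapped in a JSON "conversation" string.

import json

def parse_chat_sketch(turns):
    # mirrors the updated parse_chat logic from the diff above
    parsed = []
    for turn in turns:
        try:
            role, content = turn["role"], turn["content"]
            if role == "user":
                parsed.append("<Human>" + content + "</>\n")
            elif role == "assistant":
                parsed.append("<System>" + content + "</>\n")
            else:
                parsed.append("<" + role + ">" + content + "</>\n")
        except KeyError:
            parsed.append(json.dumps(turn) + "\n")
    return "".join(parsed)

print(parse_chat_sketch([{"role": "user", "content": "hi"},
                         {"role": "assistant", "content": "hello"}]))
# <Human>hi</>
# <System>hello</>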

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/flow.dag.yaml

Lines changed: 4 additions & 3 deletions
@@ -137,7 +137,7 @@ nodes:
     deployment_name: ${inputs.deployment_name}
     selected_metrics: ${select_metrics.output}
   activate:
-    when: ${validate_conversation.output}
+    when: ${validate_conversation.output.rag_metrics}
     is: true
   use_variants: false
 - name: evaluate_coherence_fluency
@@ -151,7 +151,7 @@
     parsed_qa: ${parse_chat.output}
     selected_metrics: ${select_metrics.output}
   activate:
-    when: ${validate_conversation.output}
+    when: ${validate_conversation.output.non_rag_metrics}
     is: true
   use_variants: false
 - name: parse_chat
@@ -162,7 +162,7 @@
   inputs:
     chat: ${inputs.messages}
   activate:
-    when: ${validate_conversation.output}
+    when: ${validate_conversation.output.non_rag_metrics}
     is: true
   use_variants: false
 - name: concatenate_metrics
@@ -191,6 +191,7 @@
     type: code
     path: validate_service.py
   inputs:
+    chat: ${inputs.messages}
     selected_metrics: ${select_metrics.output}
   use_variants: false
 - name: construct_service_request
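The activation conditions above now key off individual fields of the dict returned by validate_conversation rather than a single boolean. A minimal sketch with hypothetical values: the node gated in the first hunk keys on rag_metrics, while evaluate_coherence_fluency and parse_chat key on non_rag_metrics.

validate_output = {"rag_metrics": False, "non_rag_metrics": True}

run_rag_branch = validate_output["rag_metrics"] is True          # False: RAG-only node is skipped
run_non_rag_branch = validate_output["non_rag_metrics"] is True  # True: coherence/fluency and parse_chat run
print(run_rag_branch, run_non_rag_branch)                        # False True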

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/utils.py

Lines changed: 42 additions & 0 deletions
@@ -1,6 +1,9 @@
 from promptflow.connections import AzureOpenAIConnection
 import constants
 import numpy as np
+from azureml.metrics.common import _validation
+from azureml.metrics.common.contract import Contract
+from azureml.metrics.common.exceptions import InvalidOperationException
 
 def get_openai_parameters(connection: AzureOpenAIConnection, deployment_name: str) -> dict:
     openai_params = {
@@ -55,3 +58,42 @@ def get_harm_severity_level(harm_score: int) -> str:
         if harm_score >= harm_score_range[0] and harm_score <= harm_score_range[1]:
             return harm_level.name
     return np.nan
+
+
+def is_conversation_valid(chat: []) -> bool:
+    reference_code = "validate_conversation"
+    name = "chat_format"
+    # check if role and content keys exist in every turn
+    _validation._check_chat_conversation([chat], name, reference_code=reference_code)
+    return True
+
+
+def is_conversation_valid_with_context(chat: []) -> bool:
+    reference_code = "validate_conversation"
+    name = "chat_context_format"
+
+    # check if context/documents keys exist for rag evaluation
+    for turn_num, each_turn in enumerate(chat):
+        # to accept the legacy rag_evaluation format:
+        # [{"user": {"content": "<user_content>"},
+        #   "assistant": {"content": "<assistant_content>"},
+        #   "retrieved_documents": "<retrieved_documents>"}]
+        if "user" in each_turn and "assistant" in each_turn:  # legacy rag_evaluation format
+            Contract.assert_true("retrieved_documents" in each_turn,
+                                 message="Please ensure to have retrieved_documents key in each turn for rag_evaluation."
+                                 + " Please check turn_number: {}".format(turn_num),
+                                 target=name, log_safe=True,
+                                 reference_code=reference_code)
+        elif "role" in each_turn and each_turn["role"] == "assistant":
+            Contract.assert_true("context" in each_turn,
+                                 message="Please ensure to have context key in assistant turn for rag_evaluation."
+                                 + " Please check turn_number: {}".format(turn_num),
+                                 target=name, log_safe=True,
+                                 reference_code=reference_code)
+            if "context" in each_turn:
+                Contract.assert_true("citations" in each_turn["context"],
+                                     message="Please ensure to have citations key in assistant turn context for rag_evaluation."
+                                     + " Please check turn_number: {}".format(turn_num),
+                                     target=name, log_safe=True,
+                                     reference_code=reference_code)
+
+    return True
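For reference, conversations shaped like either of the hypothetical payloads below would satisfy the context checks added above: the current chat format requires a context with citations on assistant turns, while the legacy rag_evaluation format requires retrieved_documents per turn.

# current chat format: assistant turns carry context with citations
chat_with_context = [
    {"role": "user", "content": "What is the return policy?"},
    {"role": "assistant", "content": "Returns are accepted within 30 days.",
     "context": {"citations": [{"id": "doc1", "content": "Return policy ..."}]}},
]

# legacy rag_evaluation format: retrieved_documents present in each turn
legacy_chat = [
    {"user": {"content": "What is the return policy?"},
     "assistant": {"content": "Returns are accepted within 30 days."},
     "retrieved_documents": "[{\"id\": \"doc1\"}]"},
]
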
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/validate_conversation.py

Lines changed: 42 additions & 40 deletions
@@ -1,51 +1,53 @@
 from promptflow import tool
-from azureml.metrics.common import _validation
-from azureml.metrics.common.contract import Contract
-from azureml.metrics.common.exceptions import InvalidOperationException
-from utils import filter_metrics
+# from azureml.metrics.common import _validation
+# from azureml.metrics.common.contract import Contract
+# from azureml.metrics.common.exceptions import InvalidOperationException
+from utils import filter_metrics, is_conversation_valid, is_conversation_valid_with_context
 
 
-def is_conversation_valid(chat: [], selected_metrics: dict) -> bool:
-    reference_code = "validate_conversation"
-    name = "chat_format"
-    # check if role and content keys exist in every turn
-    _validation._check_chat_conversation([chat], name, reference_code=reference_code)
+def is_metric_group_selected(selected_metrics: dict) -> dict:
+    group_selected = {}
+    for metric_group in selected_metrics:
+        group_selected[metric_group] = False
+        for metric in selected_metrics[metric_group]:
+            if selected_metrics[metric_group][metric]:
+                group_selected[metric_group] = True
+                break
+    return group_selected
 
-    # check if context/documents keys exist for rag evaluation
-    rag_metrics = filter_metrics(selected_metrics["rag_metrics"])
-    if len(rag_metrics) > 0:
-        for turn_num, each_turn in enumerate(chat):
-            # to accept the legacy rag_evaluation format:
-            # [{"user": {"content": "<user_content>"},
-            #   "assistant": {"content": "<assistant_content>"},
-            #   "retrieved_documents": "<retrieved_documents>"}]
-            if "user" in each_turn and "assistant" in each_turn:  # legacy rag_evaluation format
-                Contract.assert_true("retrieved_documents" in each_turn,
-                                     message="Please ensure to have retrieved_documents key in each turn for rag_evaluation."
-                                     + " Please check turn_number: {}".format(turn_num),
-                                     target=name, log_safe=True,
-                                     reference_code=reference_code)
-            elif "role" in each_turn and each_turn["role"] == "assistant":
-                Contract.assert_true("context" in each_turn,
-                                     message="Please ensure to have context key in assistant turn for rag_evaluation."
-                                     + " Please check turn_number: {}".format(turn_num),
-                                     target=name, log_safe=True,
-                                     reference_code=reference_code)
-                if "context" in each_turn:
-                    Contract.assert_true("citations" in each_turn["context"],
-                                         message="Please ensure to have citations key in assistant turn context for rag_evaluation."
-                                         + " Please check turn_number: {}".format(turn_num),
-                                         target=name, log_safe=True,
-                                         reference_code=reference_code)
-    return True
 
 
 # The inputs section will change based on the arguments of the tool function, after you save the code
 # Adding type to arguments and return value will help the system show the types properly
 # Please update the function name/signature per need
 @tool
 def validate_conversation(chat: [], selected_metrics: dict) -> bool:
+    is_group_selected = is_metric_group_selected(selected_metrics)
+
+    # no quality metrics are selected
+    if (not is_group_selected['rag_metrics']) and (not is_group_selected['non_rag_metrics']):
+        print("no quality metrics selected.")
+        return {"non_rag_metrics": False,
+                "rag_metrics": False}
+
+    # check if chat format is valid
     try:
-        is_valid_chat = is_conversation_valid(chat, selected_metrics)
+        is_valid_chat = is_conversation_valid(chat)
     except Exception:
         is_valid_chat = False
-    return is_valid_chat
+
+    # chat format is not valid
+    if not is_valid_chat:
+        print("chat format is not valid")
+        return {"non_rag_metrics": False,
+                "rag_metrics": False}
+
+    non_rag_node = is_group_selected['non_rag_metrics'] and is_valid_chat
+    rag_node = False
+    if is_group_selected['rag_metrics'] and is_valid_chat:
+        try:
+            rag_node = is_conversation_valid_with_context(chat)
+        except Exception:
+            rag_node = False
+    print("non_rag_metrics:", non_rag_node, "rag_metrics:", rag_node)
+
+    return {"non_rag_metrics": non_rag_node, "rag_metrics": rag_node}

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/validate_service.py

Lines changed: 11 additions & 3 deletions
@@ -1,7 +1,7 @@
 from promptflow import tool
 import mlflow
 from mlflow.utils.rest_utils import http_request
-from utils import get_cred
+from utils import get_cred, is_conversation_valid
 
 def is_service_available():
     try:
@@ -35,13 +35,21 @@ def is_safety_metrics_selected(selected_metrics):
     print("No safety metrics are selected.")
     return False
 
+
+def is_chat_valid(chat) -> bool:
+    try:
+        is_valid_chat_format = is_conversation_valid(chat)
+    except Exception:
+        print("The chat format is not valid for safety metrics")
+        is_valid_chat_format = False
+    return is_valid_chat_format
+
 
 # check if RAI service is available in this region. If not, return False.
 # check if tracking_uri is set. If not, return False.
 # if tracking_uri is set, check if any safety metric is selected.
 # if no safety metric is selected, return False.
 @tool
-def validate_safety_metric_input(selected_metrics: dict) -> dict:
+def validate_safety_metric_input(selected_metrics: dict, chat: [dict]) -> dict:
     return is_safety_metrics_selected(selected_metrics) and \
         is_service_available() and \
-        is_tracking_uri_set()
+        is_tracking_uri_set() and is_chat_valid(chat)
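The safety branch is now gated on four conditions. A condensed sketch of the combined check, with stubbed flags standing in for the real service-availability and tracking-URI checks and a simplified structural test standing in for is_conversation_valid:

def validate_safety_metric_input_sketch(selected_metrics, chat,
                                        service_available=True, tracking_uri_set=True):
    safety_selected = any(selected_metrics.get("safety_metrics", {}).values())
    # stand-in for is_conversation_valid: every turn must carry role and content
    chat_valid = all("role" in t and "content" in t for t in chat)
    return safety_selected and service_available and tracking_uri_set and chat_valid

print(validate_safety_metric_input_sketch(
    {"safety_metrics": {"violence": True}},
    [{"role": "user", "content": "hi"}]))  # True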
