Azure
diff --git a/‎sdk/ai/azure-ai-generative/MANIFEST.in‎
Lines changed: 1 addition & 0 deletions b/‎sdk/ai/azure-ai-generative/MANIFEST.in‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py‎
Lines changed: 1 addition & 1 deletion b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py‎
Lines changed: 1 addition & 4 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py‎
Lines changed: 30 additions & 9 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py‎
Lines changed: 30 additions & 9 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_utils.py‎
Lines changed: 9 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_utils.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/aggregate.py‎
Lines changed: 46 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/aggregate.py‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/call_rai_service.py‎
Lines changed: 79 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/call_rai_service.py‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/concatenate_metrics.py‎
Lines changed: 60 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/concatenate_metrics.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/constants.py‎
Lines changed: 31 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/constants.py‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/construct_service_request.py‎
Lines changed: 30 additions & 0 deletions b/‎sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/construct_service_request.py‎
Lines changed: 30 additions & 0 deletions
@@ -6,5 +6,6 @@ include azure/ai/__init__.py
 include azure/ai/generative/py.typed
 include azure/ai/generative/index/_utils/encodings/*
 include azure/ai/generative/evaluate/metrics/templates/*
+recursive-include azure/ai/generative/evaluate/pf_templates/*
 recursive-include azure/ai/generative/synthetic/templates *.txt
 recursive-include azure/ai/generative/synthetic/simulator/templates *.md
@@ -11,7 +11,7 @@
 
 SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING = {
     QA: "qa",
-    CHAT: "rag-evaluation",
+    CHAT: "chat",
 }
 
 
 
@@ -483,10 +483,7 @@ def _get_chat_instance_table(metrics):
 
 def _get_instance_table(metrics, task_type, asset_handler):
 
-    if task_type == CHAT:
-        instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
-    else:
-        instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
+    instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
 
     combined_table = pd.concat(
         [asset_handler.input_output_data,
 
@@ -1,19 +1,25 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import mlflow
 import pandas as pd
 import logging
 
 from os import path
 from typing import Dict, Optional
 
-from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING
+from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING, CHAT
 from ._user_agent import USER_AGENT
 
 from ._utils import run_pf_flow_with_dict_list, df_to_dict_list, wait_for_pf_run_to_complete
 
 LOGGER = logging.getLogger(__name__)
 
+NODE_LIST_BY_TASK = {
+    "qa": ["gpt_coherence", "gpt_similarity", "gpt_relevance", "gpt_fluency", "gpt_groundedness"],
+    "chat": ["evaluate_chat_rag", "evaluate_coherence_fluency"]
+}
+
 
 class MetricHandler(object):
 
@@ -42,15 +48,27 @@ def _get_data_for_pf(self) -> pd.DataFrame:
         else:
             return self.input_output_data
 
+    def _get_data_for_pf_by_task_type(self, metrics):
+        metrics_calculation_data = self._get_data_for_pf()
+        metrics = metrics if metrics is not None else TASK_TYPE_TO_METRICS_MAPPING[
+            self.task_type].DEFAULT_LIST
+
+        extra_inputs = {"metrics": ','.join(metrics)}
+
+        if self.task_type == CHAT:
+            extra_inputs.update({"deployment_name": self.metrics_mapping["openai_params"]["deployment_id"]})
+
+        # The PF eval template expects metrics names to be passed in as a input parameter
+        return df_to_dict_list(metrics_calculation_data, extra_inputs)
+
+
     def calculate_metrics(self) -> Dict:
 
-        metrics_calculation_data = self._get_data_for_pf()
+        metrics = self.metrics if self.metrics is not None else TASK_TYPE_TO_METRICS_MAPPING[
+            self.task_type].DEFAULT_LIST
+        dict_list = self._get_data_for_pf_by_task_type(metrics)
 
-        metrics = self.metrics if self.metrics is not None else TASK_TYPE_TO_METRICS_MAPPING[self.task_type].DEFAULT_LIST
-        
-        dict_list = df_to_dict_list(metrics_calculation_data, {"metrics": ','.join(metrics)}) # The PF eval template expects metrics names to be passed in as a input parameter 
-        
-        flow_path = path.join(path.dirname(__file__), "pf_templates", "built_in_metrics")
+        flow_path = path.join(path.dirname(__file__), "pf_templates", "built_in_metrics", self.task_type)
 
         from promptflow import PFClient
         from promptflow.entities import AzureOpenAIConnection, OpenAIConnection
@@ -82,9 +100,12 @@ def calculate_metrics(self) -> Dict:
             "connection": conn_name,
             "deployment_name": deployment_id,
         }
-        nodes_list = ["gpt_coherence", "gpt_similarity", "gpt_relevance", "gpt_fluency", "gpt_groundedness"]
+        nodes_list = NODE_LIST_BY_TASK[self.task_type]
 
-        pf_run = run_pf_flow_with_dict_list(flow_path, dict_list, flow_params={"connections": {node: connection_override for node in nodes_list}})
+        if self.task_type == CHAT:
+            pf_run = run_pf_flow_with_dict_list(flow_path, dict_list)
+        else:
+            pf_run = run_pf_flow_with_dict_list(flow_path, dict_list, flow_params={"connections": {node: connection_override for node in nodes_list}})
         wait_for_pf_run_to_complete(pf_run.name)
 
         result_df = pf_client.get_details(pf_run.name, all_results=True)
 
@@ -9,6 +9,8 @@
 import time
 from pathlib import Path
 from typing import Optional, Dict, List
+
+import mlflow
 import pandas as pd
 import tempfile
 
@@ -54,10 +56,17 @@ def run_pf_flow_with_dict_list(flow_path, data: List[Dict], flow_params=None):
         if flow_params is None:
             flow_params = {}
 
+        env_vars = None
+        if mlflow.get_tracking_uri() and mlflow.get_tracking_uri().startswith("azureml:"):
+            env_vars = {
+                "MLFLOW_TRACKING_URI": mlflow.get_tracking_uri()
+            }
+
         return pf_client.run(
             flow=flow_path,
             data=tmp_path,
             column_mapping=column_mapping,
+            environment_variables=env_vars,
             **flow_params
         )
 
 
@@ -0,0 +1,46 @@
+from typing import List, Union, Optional
+from collections import defaultdict
+from promptflow import tool, log_metric
+import numpy as np
+from utils import filter_metrics
+
+
+@tool
+def aggregate_variants_results(results: List[dict], metrics: List[dict],
+                                thresholds):             
+    rag_metrics = filter_metrics(metrics[0]["rag_metrics"])
+    non_rag_metrics = filter_metrics(metrics[0]["non_rag_metrics"])
+    safety_metrics = filter_metrics(metrics[0]["safety_metrics"])
+    selected_quality_metrics =  rag_metrics + non_rag_metrics 
+
+    # set threshold for safety metric defect_rate
+    if thresholds != [] and thresholds is not None:
+        threshold = np.float16(thresholds[0])
+    else:
+        threshold = 3
+
+    aggregate_results = {}
+    for result in results:
+        for name, value in result["results_per_chat"].items():
+            if name in selected_quality_metrics or name in safety_metrics:
+                if name not in aggregate_results.keys():
+                    aggregate_results[name] = []
+                try:
+                    float_val = float(value)
+                except Exception:
+                    float_val = np.nan
+                aggregate_results[name].append(float_val)
+    aggregate_output = {}
+
+    for name, value in aggregate_results.items():
+        metric_name = name
+        if name in selected_quality_metrics:
+            aggregate_output[metric_name] = np.nanmean(value)
+        elif name in safety_metrics:
+            values = aggregate_results[name]
+            metric_name = name+'_defect_rate'
+            aggregate_output[metric_name] = np.sum(values >= threshold) / len(values)
+
+        aggregate_output[metric_name] = round(aggregate_output[metric_name], 2)
+        log_metric(metric_name, aggregate_output[metric_name])
+    return aggregate_output
@@ -0,0 +1,79 @@
+from promptflow import tool
+from mlflow.utils.rest_utils import http_request
+import time
+from utils import get_cred
+from constants import RAIService
+
+
+def submit_annotation(cred, request_body):
+    try:        
+        response = http_request(
+            host_creds=cred,
+            endpoint="/submitannotation",
+            method="POST",
+            json=request_body,
+        )
+
+        if response.status_code != 202:
+            print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], response.text)
+            response.raise_for_status()
+    except AttributeError as e:
+        response = None
+        print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], e)
+    if response is not None:
+        json_obj = response.json()
+    else:
+        json_obj = {}
+    return json_obj
+
+def check_status(cred, request_id):
+        try:
+            response = http_request(
+                host_creds = cred,
+                endpoint="/operations/" + request_id,
+                method="GET"
+            )
+        except AttributeError as e:
+            response = None
+        return response
+
+def retrieve_annotation_result(cred, submitannotation_response):
+        request_id = submitannotation_response["location"].split("/")[-1]
+        annotation_result = None
+        start = time.time()
+        time_elapsed = 0
+        request_count = 1
+        while True and time_elapsed <= RAIService.TIMEOUT:
+            try:
+                request_status = check_status(cred, request_id)
+            except Exception:
+                request_status = None
+            if request_status:
+                request_status_code = request_status.status_code
+                #if request_status_code >= 400:
+                    #request_status.raise_for_status()
+                if request_status_code == 200:
+                    annotation_result = request_status.json()
+                    break
+            else:
+                print("Failed to retrieve the status of RequestID: %s" % request_id)
+            request_count += 1
+            sleep_time = RAIService.SLEEPTIME ** request_count
+            time.sleep(sleep_time)
+            time_elapsed = time.time() - start
+    
+        if time_elapsed > RAIService.TIMEOUT:
+            raise TimeoutError("Request times out after %d seconds", RAIService.TIMEOUT)
+    
+        return annotation_result
+
+# The inputs section will change based on the arguments of the tool function, after you save the code
+# Adding type to arguments and return value will help the system show the types properly
+# Please update the function name/signature per need
+@tool
+def call_rai_service(request_body: dict) -> dict:
+    cred = get_cred()
+    submitannotation_response = submit_annotation(cred, request_body)
+    annotation_result = retrieve_annotation_result(cred, submitannotation_response)
+    return annotation_result
+    
@@ -0,0 +1,60 @@
+from promptflow import tool
+import numpy as np
+import constants
+
+def format_rag_results(rag_results: dict, supported_metrics):
+    result_per_chat = {}
+    result_per_turn = {}
+    if rag_results:
+        #result_per_chat = rag_results['metrics']
+        for metric, value in rag_results['artifacts'].items():
+            result_per_chat[metric] = rag_results['metrics']["mean_" + metric]
+            result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']}
+    for metric in supported_metrics:
+        if metric not in result_per_turn:
+            result_per_chat[metric] = np.nan
+            result_per_turn[metric] = np.nan
+    return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
+
+
+def format_non_rag_results(non_rag_results: dict, supported_metrics):
+    result_per_chat = {}
+    result_per_turn = {}
+    if non_rag_results:
+        for metric in non_rag_results['artifacts']:
+            result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric]
+        result_per_turn = non_rag_results['artifacts']
+    for metric in supported_metrics:
+        if metric not in result_per_turn:
+            result_per_turn[metric] = np.nan
+            result_per_chat[metric] = np.nan
+    return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
+
+def format_safety_results(safety_results: dict, supported_metrics):
+    result_per_chat = {}
+    if safety_results:
+        result_per_chat = safety_results
+    for metric in supported_metrics:
+        if metric not in result_per_chat:
+            result_per_chat[metric] = np.nan
+            result_per_chat[metric + "_reasoning"] = np.nan
+    return result_per_chat
+
+# The inputs section will change based on the arguments of the tool function, after you save the code
+# Adding type to arguments and return value will help the system show the types properly
+# Please update the function name/signature per need
+@tool
+def concatenate_metrics(rag_results: dict, non_rag_results: dict, 
+                       safety_results: dict,
+                       selected_metrics: dict) -> dict:
+    formatted_rag = format_rag_results(rag_results, selected_metrics['rag_metrics'])
+    formatted_non_rag = format_non_rag_results(non_rag_results, selected_metrics['non_rag_metrics'])
+    formatted_safety = format_safety_results(safety_results, selected_metrics['safety_metrics'])
+    results = {}
+    for key in ["results_per_turn", "results_per_chat"]:
+        result_concat = formatted_rag[key].copy()
+        result_concat.update(formatted_non_rag[key])
+        if key == "results_per_chat":
+            result_concat.update(formatted_safety)
+        results[key] = result_concat
+    return results
@@ -0,0 +1,31 @@
+from azureml.metrics import constants
+
+class RAIService:
+    """Define constants related to RAI service"""
+    API_BASE = "https://int.api.azureml-test.ms/raisvc/v1.0/subscriptions"
+    TIMEOUT = 1800
+    SLEEPTIME = 2
+
+class Metric:
+    """Defines all metrics supported by RAI service"""
+    # Content harm
+    SelfHarm = "self_harm"
+    Violence = "violence"
+    Sexual = "sexual"
+    HateFairness = "hate_fairness"
+
+class Tasks:
+    """Defines types of annotation tasks supported by RAI Service."""
+    CONTENT_HARM = "content harm"
+
+# Content harm metric set
+CONTENT_HARM_METRICS = {
+    Metric.SelfHarm,
+    Metric.Violence,
+    Metric.Sexual,
+    Metric.HateFairness
+}
+
+RAG_EVALUATION_SET = constants.Metric.RAG_EVALUATION_SET
+NON_RAG_EAVLUATION_SET = {constants.Metric.GPTCoherence, constants.Metric.GPTFluency}
+QUALITY_METRIC_SET = RAG_EVALUATION_SET | NON_RAG_EAVLUATION_SET
@@ -0,0 +1,30 @@
+from promptflow import tool
+import json
+
+def parse_chat(user_text: list):
+    parsed_chat = []
+    for turn in user_text:
+        try:
+            role = turn["role"]
+            content = turn["content"]
+            content_str = "<" + role + ">" + content + "</>\n"
+        except KeyError:
+            content_str = json.dumps(turn) + "\n"
+        parsed_chat.append(content_str)
+    return "{\"conversation\": \"" + "".join(parsed_chat) + "\"}"
+        
+def normalize_user_text(user_text):
+    return user_text.replace("'", "\\\"")
+
+# The inputs section will change based on the arguments of the tool function, after you save the code
+# Adding type to arguments and return value will help the system show the types properly
+# Please update the function name/signature per need
+@tool
+def construct_request(user_text: list, selected_metrics: dict) -> dict:
+    selected_safety_metrics = selected_metrics["safety_metrics"]
+    metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
+    parsed_user_text = parse_chat(user_text)
+    request_body = {"UserTextList": [parsed_user_text], 
+                    "AnnotationTask": "content harm", 
+                    "MetricList": metrics}
+    return request_body
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`
`12`	`12`	`SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING = {`
`13`	`13`	`QA: "qa",`
`14`		`- CHAT: "rag-evaluation",`
	`14`	`+ CHAT: "chat",`
`15`	`15`	`}`
`16`	`16`
`17`	`17`