
Commit 96fa1e3

Final Sync of evaluation Code (Azure#37233)

* Final sync of evaluation code
* fix up some user agents

Parent commit: 80e8d00

32 files changed (+1041, -57 lines)


.vscode/cspell.json

Lines changed: 6 additions & 1 deletion
@@ -1326,7 +1326,12 @@
       "tparam",
       "tqdm",
       "ncols",
-      "datas"
+      "datas",
+      "punkt",
+      "gleu",
+      "fmeasure",
+      "upia",
+      "xpia",
     ]
   },
   {

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py

Lines changed: 3 additions & 1 deletion
@@ -32,6 +32,7 @@ class Tasks:
 
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
+    XPIA = "xpia"
 
 
 class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    XPIA = "xpia"
 
 
 class _InternalEvaluationMetrics:
@@ -60,4 +62,4 @@ class _InternalEvaluationMetrics:
     enum over time.
     """
 
-    ECI = "eci"
+    ECI = "eci"

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 37 additions & 5 deletions
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     elif metric == _InternalEvaluationMetrics.ECI:
         task = _InternalAnnotationTasks.ECI
         include_metric = False
+    elif metric == EvaluationMetrics.XPIA:
+        task = Tasks.XPIA
+        include_metric = False
     return (
         {
             "UserTextList": [normalized_user_text],
@@ -207,21 +210,50 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
+    # non-numeric metrics
+    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
        response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        metric_prefix = _get_metric_prefix(metric_name)
+        # Use label instead of score since these are assumed to be boolean results.
+        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+
+        if metric_name == EvaluationMetrics.XPIA:
+            # Add "manipulated_content", "intrusion" and "information_gathering" to the result
+            # if present else set them to np.nan
+            result[metric_prefix + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+            )
+            result[metric_prefix + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+            )
+            result[metric_prefix + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+            )
         return result
     return _parse_content_harm_response(batch_response, metric_name)
 
 
+def _get_metric_prefix(metric_name: str) -> str:
+    """Get the prefix for the evaluation metric. This is usually the metric name.
+
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :return: The prefix for the evaluation metric.
+    :rtype: str
+    """
+    if metric_name == _InternalEvaluationMetrics.ECI:
+        return "ECI"
+    return metric_name
+
+
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -415,4 +447,4 @@ async def evaluate_with_rai_service(
     annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
     result = parse_response(annotation_response, metric_name)
 
-    return result
+    return result
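
For context, a minimal sketch of the result shape parse_response now builds for the XPIA metric. The raw service payload below is invented for illustration; only the key names and the false/True handling mirror the code in this hunk.

# Illustrative only: a hypothetical RAI service annotation for the XPIA metric.
# The exact payload format is an assumption; key names follow the parsing code above.
batch_response = [{
    "xpia": '{"label": false, "reasoning": "No injected instructions detected.", '
            '"manipulated_content": false, "intrusion": false, "information_gathering": false}'
}]

# parse_response(batch_response, EvaluationMetrics.XPIA) would then return roughly:
# {
#     "xpia_label": False,
#     "xpia_reason": "No injected instructions detected.",
#     "xpia_manipulated_content": False,
#     "xpia_intrusion": False,
#     "xpia_information_gathering": False,
# }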

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 28 additions & 0 deletions
@@ -5,8 +5,20 @@
     from . import constants
 except ImportError:
     import constants
+
+from typing import List, cast
+
+import nltk
 import numpy as np
 
+try:
+    from nltk.tokenize.nist import NISTTokenizer
+except LookupError:
+    nltk.download("perluniprops")
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+    from nltk.tokenize.nist import NISTTokenizer
+
 
 def get_harm_severity_level(harm_score: int) -> str:
     """Generate harm severity level based on harm score.
@@ -28,3 +40,19 @@ def get_harm_severity_level(harm_score: int) -> str:
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
     return np.nan
+
+
+def nltk_tokenize(text: str) -> List[str]:
+    """Tokenize the input text using the NLTK tokenizer."""
+
+    is_latin_or_numeric = all(
+        ("\u0020" <= c <= "\u007E")  # Basic Latin
+        or ("\u00A0" <= c <= "\u00FF")  # Latin-1 Supplement
+        or ("0" <= c <= "9")  # Digits
+        for c in text
+    )
+
+    if is_latin_or_numeric:
+        return cast(List[str], nltk.word_tokenize(text))
+
+    return list(NISTTokenizer().international_tokenize(text))
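
A quick usage sketch of the new helper, not part of this commit. The internal import path is taken from the file header above and may change; the module-level try/except attempts to fetch "punkt", "punkt_tab" and "perluniprops" if they are missing.

# Requires nltk to be installed.
from azure.ai.evaluation._common.utils import nltk_tokenize

# Latin/numeric text is routed through nltk.word_tokenize ...
print(nltk_tokenize("The quick brown fox jumps."))
# ... while text containing other scripts falls back to the NIST international tokenizer.
print(nltk_tokenize("これは例文です"))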

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 11 additions & 1 deletion
@@ -18,6 +18,16 @@ class EvaluationMetrics:
     VIOLENCE = "violence"
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
+    PROTECTED_MATERIAL = "protected_material"
+
+
+class _InternalEvaluationMetrics:
+    """Evaluation metrics that are not publicly supported.
+    These metrics are experimental and subject to potential change or migration to the main
+    enum over time.
+    """
+
+    ECI = "eci"
 
 
 class Prefixes:
@@ -36,4 +46,4 @@ class Prefixes:
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
-OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
+OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_eval_run.py

Lines changed: 2 additions & 2 deletions
@@ -170,9 +170,9 @@ def _start_run(self) -> None:
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
         body = {
             "experiment_id": "0",
-            "user_id": "promptflow-evals",
+            "user_id": "azure-ai-evaluation",
             "start_time": int(time.time() * 1000),
-            "tags": [{"key": "mlflow.user", "value": "promptflow-evals"}],
+            "tags": [{"key": "mlflow.user", "value": "azure-ai-evaluation"}],
         }
         if self._run_name:
             body["run_name"] = self._run_name

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py

Lines changed: 88 additions & 9 deletions
@@ -4,15 +4,20 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, Optional, Set, Tuple
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
 
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
+from .._constants import (
+    CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EvaluationMetrics,
+    Prefixes,
+    _InternalEvaluationMetrics,
+)
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._telemetry import log_evaluate_activity
@@ -24,10 +29,22 @@
 )
 
 
-def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
-    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
-
-    # Calculate defect rates for content safety metrics
+# pylint: disable=line-too-long
+def _aggregate_content_safety_metrics(
+    df: pd.DataFrame, evaluators: Dict[str, Type]
+) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for content safety metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
+        content safety metrics, since they should start with a string that matches an evaluator name.
+    :type evaluators: Dict[str, type]
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
     content_safety_metrics = [
         EvaluationMetrics.SEXUAL,
         EvaluationMetrics.SELF_HARM,
@@ -59,12 +76,74 @@ def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
             / col_with_numeric_values.count(),
             2,
         )
+    return content_safety_cols, defect_rates
+
+
+def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for label-based metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    handled_metrics = [
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+    ]
+    label_cols = []
+    for col in df.columns:
+        metric_name = col.split(".")[1]
+        if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
+            label_cols.append(col)
+
+    label_df = df[label_cols]
+    defect_rates = {}
+    for col in label_df.columns:
+        defect_rate_name = col.replace("_label", "_defect_rate")
+        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        defect_rates[defect_rate_name] = round(
+            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            2,
+        )
+    return label_cols, defect_rates
+
+
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+    """Aggregate metrics from the evaluation results.
+    On top of naively calculating the mean of most metrics, this function also identifies certain columns
+    that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
+    EX: protected_material_label -> protected_material_defect_rate
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes.
+    :type evaluators: Dict[str, Type]
+    :return: The aggregated metrics.
+    :rtype: Dict[str, float]
+    """
+    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+
+    handled_columns = []
+    defect_rates = {}
+    # Rename certain columns as defect rates if we know that's what their aggregates represent
+    # Content safety metrics
+    content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    handled_columns.extend(content_safety_cols)
+    defect_rates.update(cs_defect_rates)
+    # Label-based (true/false) metrics where 'true' means 'something is wrong'
+    label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
+    handled_columns.extend(label_cols)
+    defect_rates.update(label_defect_rates)
 
     # For rest of metrics, we will calculate mean
-    df.drop(columns=content_safety_cols, inplace=True)
+    df.drop(columns=handled_columns, inplace=True)
+
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
-
+    # Add defect rates back into metrics
     metrics.update(defect_rates)
     return metrics
 
@@ -522,4 +601,4 @@ def _evaluate( # pylint: disable=too-many-locals
     if output_path:
         _write_output(output_path, result)
 
-    return result
+    return result
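
As a standalone illustration of the new label aggregation (not part of the commit): boolean *_label columns are coerced to numeric and averaged into a defect rate, with nan labels skipped rather than counted as 0. The column name below is hypothetical but follows the "<evaluator>.<metric>_label" pattern the code expects after the "outputs." prefix is stripped.

import numpy as np
import pandas as pd

# Two flagged rows out of three scored rows (the nan is ignored) -> a 0.67 defect rate,
# mirroring the arithmetic in _aggregate_label_defect_metrics above.
col = "protected_material.protected_material_label"
df = pd.DataFrame({col: [True, False, np.nan, True]})

labels = pd.to_numeric(df[col], errors="coerce")
defect_rate = round(np.sum(labels) / labels.count(), 2)
defect_rate_name = col.replace("_label", "_defect_rate")
print(defect_rate_name, defect_rate)  # protected_material.protected_material_defect_rate 0.67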

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/__init__.py

Lines changed: 13 additions & 3 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+from ._bleu import BleuScoreEvaluator
 from ._chat import ChatEvaluator
 from ._coherence import CoherenceEvaluator
 from ._content_safety import (
@@ -14,10 +15,13 @@
 )
 from ._f1_score import F1ScoreEvaluator
 from ._fluency import FluencyEvaluator
+from ._gleu import GleuScoreEvaluator
 from ._groundedness import GroundednessEvaluator
-from ._protected_materials import ProtectedMaterialsEvaluator
+from ._meteor import MeteorScoreEvaluator
+from ._protected_material import ProtectedMaterialEvaluator
 from ._qa import QAEvaluator
 from ._relevance import RelevanceEvaluator
+from ._rouge import RougeScoreEvaluator, RougeType
 from ._similarity import SimilarityEvaluator
 
 __all__ = [
@@ -35,5 +39,11 @@
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
    "ContentSafetyChatEvaluator",
-    "ProtectedMaterialsEvaluator",
-]
+    "IndirectAttackEvaluator",
+    "BleuScoreEvaluator",
+    "GleuScoreEvaluator",
+    "MeteorScoreEvaluator",
+    "RougeScoreEvaluator",
+    "RougeType",
+    "ProtectedMaterialEvaluator",
+]
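
A rough usage sketch of the newly exported math-based evaluators, not taken from this diff. The import path matches this package's layout; the keyword names (answer, ground_truth) and result keys are assumptions, since the evaluator implementations are not shown in these hunks.

from azure.ai.evaluation.evaluators import BleuScoreEvaluator, MeteorScoreEvaluator

bleu = BleuScoreEvaluator()
meteor = MeteorScoreEvaluator()
# Keyword names below are assumed for illustration; check the evaluator docstrings.
sample = {"answer": "The cat sat on the mat.", "ground_truth": "A cat sat on the mat."}
print(bleu(**sample))    # e.g. {"bleu_score": ...}
print(meteor(**sample))  # e.g. {"meteor_score": ...}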

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._bleu import BleuScoreEvaluator
+
+__all__ = [
+    "BleuScoreEvaluator",
+]
