
Commit 96fa1e3

Final Sync of evaluation Code (Azure#37233)

* Final sync of evaluation code
* fix up some user agents

Parent commit: 80e8d00

32 files changed (+1041, -57 lines)


.vscode/cspell.json

Lines changed: 6 additions & 1 deletion
@@ -1326,7 +1326,12 @@
       "tparam",
       "tqdm",
       "ncols",
-      "datas"
+      "datas",
+      "punkt",
+      "gleu",
+      "fmeasure",
+      "upia",
+      "xpia",
     ]
   },
   {

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py

Lines changed: 3 additions & 1 deletion
@@ -32,6 +32,7 @@ class Tasks:
 
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
+    XPIA = "xpia"
 
 
 class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    XPIA = "xpia"
 
 
 class _InternalEvaluationMetrics:
@@ -60,4 +62,4 @@ class _InternalEvaluationMetrics:
     enum over time.
     """
 
-    ECI = "eci"
+    ECI = "eci"

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 37 additions & 5 deletions
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     elif metric == _InternalEvaluationMetrics.ECI:
         task = _InternalAnnotationTasks.ECI
         include_metric = False
+    elif metric == EvaluationMetrics.XPIA:
+        task = Tasks.XPIA
+        include_metric = False
     return (
         {
             "UserTextList": [normalized_user_text],
@@ -207,21 +210,50 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
+    # non-numeric metrics
+    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
        response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        metric_prefix = _get_metric_prefix(metric_name)
+        # Use label instead of score since these are assumed to be boolean results.
+        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+
+        if metric_name == EvaluationMetrics.XPIA:
+            # Add "manipulated_content", "intrusion" and "information_gathering" to the result
+            # if present else set them to np.nan
+            result[metric_prefix + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+            )
+            result[metric_prefix + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+            )
+            result[metric_prefix + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+            )
         return result
     return _parse_content_harm_response(batch_response, metric_name)
 
 
+def _get_metric_prefix(metric_name: str) -> str:
+    """Get the prefix for the evaluation metric. This is usually the metric name.
+
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :return: The prefix for the evaluation metric.
+    :rtype: str
+    """
+    if metric_name == _InternalEvaluationMetrics.ECI:
+        return "ECI"
+    return metric_name
+
+
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -415,4 +447,4 @@ async def evaluate_with_rai_service(
     annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
     result = parse_response(annotation_response, metric_name)
 
-    return result
+    return result
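
For context, a minimal sketch of the result shape parse_response now builds for the XPIA metric. The raw service payload below is invented for illustration; only the key names and the false/True handling mirror the code in this hunk.

# Illustrative only: a hypothetical RAI service annotation for the XPIA metric.
# The exact payload format is an assumption; key names follow the parsing code above.
batch_response = [{
    "xpia": '{"label": false, "reasoning": "No injected instructions detected.", '
            '"manipulated_content": false, "intrusion": false, "information_gathering": false}'
}]

# parse_response(batch_response, EvaluationMetrics.XPIA) would then return roughly:
# {
#     "xpia_label": False,
#     "xpia_reason": "No injected instructions detected.",
#     "xpia_manipulated_content": False,
#     "xpia_intrusion": False,
#     "xpia_information_gathering": False,
# }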

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 28 additions & 0 deletions
@@ -5,8 +5,20 @@
     from . import constants
 except ImportError:
     import constants
+
+from typing import List, cast
+
+import nltk
 import numpy as np
 
+try:
+    from nltk.tokenize.nist import NISTTokenizer
+except LookupError:
+    nltk.download("perluniprops")
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+    from nltk.tokenize.nist import NISTTokenizer
+
 
 def get_harm_severity_level(harm_score: int) -> str:
     """Generate harm severity level based on harm score.
@@ -28,3 +40,19 @@ def get_harm_severity_level(harm_score: int) -> str:
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
     return np.nan
+
+
+def nltk_tokenize(text: str) -> List[str]:
+    """Tokenize the input text using the NLTK tokenizer."""
+
+    is_latin_or_numeric = all(
+        ("\u0020" <= c <= "\u007E")  # Basic Latin
+        or ("\u00A0" <= c <= "\u00FF")  # Latin-1 Supplement
+        or ("0" <= c <= "9")  # Digits
+        for c in text
+    )
+
+    if is_latin_or_numeric:
+        return cast(List[str], nltk.word_tokenize(text))
+
+    return list(NISTTokenizer().international_tokenize(text))
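
A quick usage sketch of the new helper, not part of this commit. The internal import path is taken from the file header above and may change; the module-level try/except attempts to fetch "punkt", "punkt_tab" and "perluniprops" if they are missing.

# Requires nltk to be installed.
from azure.ai.evaluation._common.utils import nltk_tokenize

# Latin/numeric text is routed through nltk.word_tokenize ...
print(nltk_tokenize("The quick brown fox jumps."))
# ... while text containing other scripts falls back to the NIST international tokenizer.
print(nltk_tokenize("これは例文です"))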

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 11 additions & 1 deletion
@@ -18,6 +18,16 @@ class EvaluationMetrics:
     VIOLENCE = "violence"
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
+    PROTECTED_MATERIAL = "protected_material"
+
+
+class _InternalEvaluationMetrics:
+    """Evaluation metrics that are not publicly supported.
+    These metrics are experimental and subject to potential change or migration to the main
+    enum over time.
+    """
+
+    ECI = "eci"
 
 
 class Prefixes:
@@ -36,4 +46,4 @@ class Prefixes:
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
-OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
+OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_eval_run.py

Lines changed: 2 additions & 2 deletions
@@ -170,9 +170,9 @@ def _start_run(self) -> None:
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
         body = {
             "experiment_id": "0",
-            "user_id": "promptflow-evals",
+            "user_id": "azure-ai-evaluation",
             "start_time": int(time.time() * 1000),
-            "tags": [{"key": "mlflow.user", "value": "promptflow-evals"}],
+            "tags": [{"key": "mlflow.user", "value": "azure-ai-evaluation"}],
         }
         if self._run_name:
             body["run_name"] = self._run_name

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py

Lines changed: 88 additions & 9 deletions
@@ -4,15 +4,20 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, Optional, Set, Tuple
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
 
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
+from .._constants import (
+    CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EvaluationMetrics,
+    Prefixes,
+    _InternalEvaluationMetrics,
+)
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._telemetry import log_evaluate_activity
@@ -24,10 +29,22 @@
 )
 
 
-def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
-    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
-
-    # Calculate defect rates for content safety metrics
+# pylint: disable=line-too-long
+def _aggregate_content_safety_metrics(
+    df: pd.DataFrame, evaluators: Dict[str, Type]
+) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for content safety metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
+        content safety metrics, since they should start with a string that matches an evaluator name.
+    :type evaluators: Dict[str, type]
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
     content_safety_metrics = [
         EvaluationMetrics.SEXUAL,
         EvaluationMetrics.SELF_HARM,
@@ -59,12 +76,74 @@ def _aggregate_metrics(df, evaluators) -> Dict[str, float]:
             / col_with_numeric_values.count(),
             2,
         )
+    return content_safety_cols, defect_rates
+
+
+def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for label-based metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    handled_metrics = [
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+    ]
+    label_cols = []
+    for col in df.columns:
+        metric_name = col.split(".")[1]
+        if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
+            label_cols.append(col)
+
+    label_df = df[label_cols]
+    defect_rates = {}
+    for col in label_df.columns:
+        defect_rate_name = col.replace("_label", "_defect_rate")
+        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        defect_rates[defect_rate_name] = round(
+            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            2,
+        )
+    return label_cols, defect_rates
+
+
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+    """Aggregate metrics from the evaluation results.
+    On top of naively calculating the mean of most metrics, this function also identifies certain columns
+    that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
+    EX: protected_material_label -> protected_material_defect_rate
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes.
+    :type evaluators: Dict[str, Type]
+    :return: The aggregated metrics.
+    :rtype: Dict[str, float]
+    """
+    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+
+    handled_columns = []
+    defect_rates = {}
+    # Rename certain columns as defect rates if we know that's what their aggregates represent
+    # Content safety metrics
+    content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    handled_columns.extend(content_safety_cols)
+    defect_rates.update(cs_defect_rates)
+    # Label-based (true/false) metrics where 'true' means 'something is wrong'
+    label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
+    handled_columns.extend(label_cols)
+    defect_rates.update(label_defect_rates)
 
     # For rest of metrics, we will calculate mean
-    df.drop(columns=content_safety_cols, inplace=True)
+    df.drop(columns=handled_columns, inplace=True)
+
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
-
+    # Add defect rates back into metrics
     metrics.update(defect_rates)
     return metrics
 
@@ -522,4 +601,4 @@ def _evaluate( # pylint: disable=too-many-locals
     if output_path:
         _write_output(output_path, result)
 
-    return result
+    return result
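
As a standalone illustration of the new label aggregation (not part of the commit): boolean *_label columns are coerced to numeric and averaged into a defect rate, with nan labels skipped rather than counted as 0. The column name below is hypothetical but follows the "<evaluator>.<metric>_label" pattern the code expects after the "outputs." prefix is stripped.

import numpy as np
import pandas as pd

# Two flagged rows out of three scored rows (the nan is ignored) -> a 0.67 defect rate,
# mirroring the arithmetic in _aggregate_label_defect_metrics above.
col = "protected_material.protected_material_label"
df = pd.DataFrame({col: [True, False, np.nan, True]})

labels = pd.to_numeric(df[col], errors="coerce")
defect_rate = round(np.sum(labels) / labels.count(), 2)
defect_rate_name = col.replace("_label", "_defect_rate")
print(defect_rate_name, defect_rate)  # protected_material.protected_material_defect_rate 0.67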

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/__init__.py

Lines changed: 13 additions & 3 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+from ._bleu import BleuScoreEvaluator
 from ._chat import ChatEvaluator
 from ._coherence import CoherenceEvaluator
 from ._content_safety import (
@@ -14,10 +15,13 @@
 )
 from ._f1_score import F1ScoreEvaluator
 from ._fluency import FluencyEvaluator
+from ._gleu import GleuScoreEvaluator
 from ._groundedness import GroundednessEvaluator
-from ._protected_materials import ProtectedMaterialsEvaluator
+from ._meteor import MeteorScoreEvaluator
+from ._protected_material import ProtectedMaterialEvaluator
 from ._qa import QAEvaluator
 from ._relevance import RelevanceEvaluator
+from ._rouge import RougeScoreEvaluator, RougeType
 from ._similarity import SimilarityEvaluator
 
 __all__ = [
@@ -35,5 +39,11 @@
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
    "ContentSafetyChatEvaluator",
-    "ProtectedMaterialsEvaluator",
-]
+    "IndirectAttackEvaluator",
+    "BleuScoreEvaluator",
+    "GleuScoreEvaluator",
+    "MeteorScoreEvaluator",
+    "RougeScoreEvaluator",
+    "RougeType",
+    "ProtectedMaterialEvaluator",
+]
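
A rough usage sketch of the newly exported math-based evaluators, not taken from this diff. The import path matches this package's layout; the keyword names (answer, ground_truth) and result keys are assumptions, since the evaluator implementations are not shown in these hunks.

from azure.ai.evaluation.evaluators import BleuScoreEvaluator, MeteorScoreEvaluator

bleu = BleuScoreEvaluator()
meteor = MeteorScoreEvaluator()
# Keyword names below are assumed for illustration; check the evaluator docstrings.
sample = {"answer": "The cat sat on the mat.", "ground_truth": "A cat sat on the mat."}
print(bleu(**sample))    # e.g. {"bleu_score": ...}
print(meteor(**sample))  # e.g. {"meteor_score": ...}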

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._bleu import BleuScoreEvaluator
+
+__all__ = [
+    "BleuScoreEvaluator",
+]
