Commit 25970ad

Protected material metrics fix using defect rate instead of label. (Azure#39327)

* fix for label
* fix label
* updated tests and fix recordings
* fix test config and updated assets
* fix test config and updated assets
* updated assets
* revert
* revert
1 parent a8e12c2 commit 25970ad

File tree

4 files changed: +32 -21 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    ARTWORK = "artwork"
+    FICTIONAL_CHARACTERS = "fictional_characters"
+    LOGOS_AND_BRANDS = "logos_and_brands"
     XPIA = "xpia"

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 3 additions & 0 deletions
@@ -147,6 +147,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     """
     handled_metrics = [
         EvaluationMetrics.PROTECTED_MATERIAL,
+        EvaluationMetrics.FICTIONAL_CHARACTERS,
+        EvaluationMetrics.ARTWORK,
+        EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     ]
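
Adding the new constants to handled_metrics opts the protected-material sub-metrics into label-to-defect-rate aggregation: per-row boolean labels are summarized as the fraction of rows flagged as defective rather than surfaced as a raw label. A minimal sketch of that idea, not the SDK's actual implementation; the helper and column names below are illustrative:

import pandas as pd

def defect_rate(label_column: pd.Series) -> float:
    """Fraction of rows whose boolean label marks a defect (illustrative helper)."""
    labels = label_column.dropna().astype(bool)
    return float(labels.mean()) if len(labels) > 0 else float("nan")

# Hypothetical per-row output of a protected-material evaluation run.
df = pd.DataFrame({
    "outputs.protected_material.artwork_label": [False, True, False],
    "outputs.protected_material.logos_and_brands_label": [False, False, False],
})

# Turn each "*_label" column into a "*_defect_rate" aggregate metric.
metrics = {
    col.replace("outputs.", "").replace("_label", "_defect_rate"): defect_rate(df[col])
    for col in df.columns
}
print(metrics)
# e.g. {'protected_material.artwork_defect_rate': 0.33..., 'protected_material.logos_and_brands_defect_rate': 0.0}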

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 18 additions & 21 deletions
@@ -94,7 +94,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
             "similarity": SimilarityEvaluator(model_config),
             "qa": QAEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
-            # "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
             "eci": ECIEvaluator(azure_cred, project_scope),
             "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
@@ -108,7 +108,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope

         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]
-        assert len(row_result_df.keys()) == 61
+        assert len(row_result_df.keys()) == 63
         assert len(row_result_df["inputs.query"]) == 3
         assert len(row_result_df["inputs.context"]) == 3
         assert len(row_result_df["inputs.response"]) == 3
@@ -136,8 +136,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.similarity.gpt_similarity"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_reason"]) == 3
-        # assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
-        # assert len(row_result_df["outputs.protected_material.protected_material_reason"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_reason"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_reason"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 3
@@ -173,7 +173,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.qa.similarity"]) == 3
         assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3

-        assert len(metrics.keys()) == 38
+        assert len(metrics.keys()) == 39
         assert metrics["f1_score.f1_score"] >= 0
         assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
@@ -199,7 +199,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
         assert metrics["content_safety.violence_defect_rate"] >= 0
         assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
-        # assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
         assert metrics["qa.f1_score"] >= 0
@@ -221,7 +221,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
             "fluency": FluencyEvaluator(model_config),
             "relevance": RelevanceEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
-            # "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
             "eci": ECIEvaluator(azure_cred, project_scope),
             "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
@@ -237,7 +237,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]

-        assert len(row_result_df.keys()) == 30
+        assert len(row_result_df.keys()) == 32
         assert len(row_result_df["inputs.conversation"]) == 2
         assert len(row_result_df["outputs.grounded.groundedness"]) == 2
         assert len(row_result_df["outputs.grounded.gpt_groundedness"]) == 2
@@ -253,8 +253,8 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert len(row_result_df["outputs.relevance.evaluation_per_turn"]) == 2
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 2
         assert len(row_result_df["outputs.grounded_pro.evaluation_per_turn"]) == 2
-        # assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 2
-        # assert len(row_result_df["outputs.protected_material.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 2
+        assert len(row_result_df["outputs.protected_material.evaluation_per_turn"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_intrusion"]) == 2
@@ -271,7 +271,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert len(row_result_df["outputs.retrieval.gpt_retrieval"]) == 2
         assert len(row_result_df["outputs.retrieval.evaluation_per_turn"]) == 2

-        assert len(metrics.keys()) == 20
+        assert len(metrics.keys()) == 21
         assert metrics["coherence.coherence"] >= 0
         assert metrics["coherence.gpt_coherence"] >= 0
         assert metrics["fluency.fluency"] >= 0
@@ -290,7 +290,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
         assert metrics["content_safety.self_harm_defect_rate"] >= 0
         assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
-        # assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0

@@ -311,9 +311,6 @@ def test_evaluate_multimodal(
         project_scope,
         run_from_temp_dir,
     ):
-        # ContentSafetyMultimodalEvaluator is excluded due 2 reasons:
-        # - It fails in playback mode for some reason
-        # - It's imminently being removed in favor of the ContentSafetyEvaluator.
         evaluators = {
             "protected_material_old": ProtectedMaterialMultimodalEvaluator(
                 credential=azure_cred, azure_ai_project=project_scope
@@ -396,12 +393,12 @@ def test_evaluate_multimodal(
         assert 0 <= metrics.get("content_safety.violence_defect_rate") <= 1
         assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1
         assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1
-        assert 0 <= metrics.get("protected_material.fictional_characters_label") <= 1
-        assert 0 <= metrics.get("protected_material.logos_and_brands_label") <= 1
-        assert 0 <= metrics.get("protected_material.artwork_label") <= 1
+        assert 0 <= metrics.get("protected_material.fictional_characters_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.logos_and_brands_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.artwork_defect_rate") <= 1
         assert 0 <= metrics.get("sexual.sexual_defect_rate") <= 1
         # TODO remove _old values when multi-modal specific evaluators are removed
-        assert 0 <= metrics.get("protected_material_old.fictional_characters_label") <= 1
-        assert 0 <= metrics.get("protected_material_old.logos_and_brands_label") <= 1
-        assert 0 <= metrics.get("protected_material_old.artwork_label") <= 1
+        assert 0 <= metrics.get("protected_material_old.fictional_characters_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material_old.logos_and_brands_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material_old.artwork_defect_rate") <= 1
         assert 0 <= metrics.get("sexual_old.sexual_defect_rate") <= 1

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py

Lines changed: 8 additions & 0 deletions
@@ -216,6 +216,14 @@ async def callback(
         assert eval_output["rows"][0]["outputs.protected_material.artwork_label"] is not None
         assert eval_output["rows"][0]["outputs.protected_material.logos_and_brands_label"] is not None

+        assert "protected_material.fictional_characters_defect_rate" in metrics.keys()
+        assert "protected_material.logos_and_brands_defect_rate" in metrics.keys()
+        assert "protected_material.artwork_defect_rate" in metrics.keys()
+
+        assert 0 <= metrics.get("protected_material.fictional_characters_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.logos_and_brands_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.artwork_defect_rate") <= 1
+
         # Cleanup file
         os.remove(file_name)
