@@ -94,7 +94,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
             "similarity": SimilarityEvaluator(model_config),
             "qa": QAEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
-            # "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
             "eci": ECIEvaluator(azure_cred, project_scope),
             "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
@@ -108,7 +108,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
 
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]
-        assert len(row_result_df.keys()) == 61
+        assert len(row_result_df.keys()) == 63
         assert len(row_result_df["inputs.query"]) == 3
         assert len(row_result_df["inputs.context"]) == 3
         assert len(row_result_df["inputs.response"]) == 3
@@ -136,8 +136,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.similarity.gpt_similarity"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_reason"]) == 3
-        # assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
-        # assert len(row_result_df["outputs.protected_material.protected_material_reason"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_reason"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_reason"]) == 3
         assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 3
@@ -173,7 +173,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.qa.similarity"]) == 3
         assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
 
-        assert len(metrics.keys()) == 38
+        assert len(metrics.keys()) == 39
         assert metrics["f1_score.f1_score"] >= 0
         assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
@@ -199,7 +199,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
         assert metrics["content_safety.violence_defect_rate"] >= 0
         assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
-        # assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
         assert metrics["qa.f1_score"] >= 0
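
The count bumps in this test follow directly from re-enabling the evaluator: "protected_material" contributes two per-row columns (outputs.protected_material.protected_material_label and ...protected_material_reason), taking the row frame from 61 to 63 keys, and one aggregate metric (protected_material.protected_material_defect_rate), taking the metrics dict from 38 to 39. As a rough sketch of the evaluator in isolation (the project-scope values and the query/response pair below are placeholders, not taken from the test fixtures):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ProtectedMaterialEvaluator

# Placeholder project scope; the tests inject this via the `project_scope` fixture.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = ProtectedMaterialEvaluator(DefaultAzureCredential(), azure_ai_project)
# Yields the two fields asserted per row above, e.g.
# {"protected_material_label": False, "protected_material_reason": "..."}
result = evaluator(query="Tell me about a famous painting.", response="...")
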
@@ -221,7 +221,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
             "fluency": FluencyEvaluator(model_config),
             "relevance": RelevanceEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
-            # "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
             "eci": ECIEvaluator(azure_cred, project_scope),
             "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
@@ -237,7 +237,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]
 
-        assert len(row_result_df.keys()) == 30
+        assert len(row_result_df.keys()) == 32
         assert len(row_result_df["inputs.conversation"]) == 2
         assert len(row_result_df["outputs.grounded.groundedness"]) == 2
         assert len(row_result_df["outputs.grounded.gpt_groundedness"]) == 2
@@ -253,8 +253,8 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert len(row_result_df["outputs.relevance.evaluation_per_turn"]) == 2
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 2
         assert len(row_result_df["outputs.grounded_pro.evaluation_per_turn"]) == 2
-        # assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 2
-        # assert len(row_result_df["outputs.protected_material.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 2
+        assert len(row_result_df["outputs.protected_material.evaluation_per_turn"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 2
         assert len(row_result_df["outputs.indirect_attack.xpia_intrusion"]) == 2
@@ -271,7 +271,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert len(row_result_df["outputs.retrieval.gpt_retrieval"]) == 2
         assert len(row_result_df["outputs.retrieval.evaluation_per_turn"]) == 2
 
-        assert len(metrics.keys()) == 20
+        assert len(metrics.keys()) == 21
         assert metrics["coherence.coherence"] >= 0
         assert metrics["coherence.gpt_coherence"] >= 0
         assert metrics["fluency.fluency"] >= 0
@@ -290,7 +290,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
         assert metrics["content_safety.self_harm_defect_rate"] >= 0
         assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
-        # assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
 
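
The conversation test exercises the same evaluators over multi-turn inputs, where each per-row output also carries an evaluation_per_turn breakdown alongside the aggregate label. A minimal sketch of the conversation shape the SDK expects (message contents here are hypothetical; the test reads real rows from data_convo_file):

# One row of a conversation-mode JSONL file.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris.", "context": "France's capital is Paris."},
    ]
}
# Evaluators are then invoked as: evaluator(conversation=conversation)
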
@@ -311,9 +311,6 @@ def test_evaluate_multimodal(
         project_scope,
         run_from_temp_dir,
     ):
-        # ContentSafetyMultimodalEvaluator is excluded due 2 reasons:
-        # - It fails in playback mode for some reason
-        # - It's imminently being removed in favor of the ContentSafetyEvaluator.
         evaluators = {
             "protected_material_old": ProtectedMaterialMultimodalEvaluator(
                 credential=azure_cred, azure_ai_project=project_scope
@@ -396,12 +393,12 @@ def test_evaluate_multimodal(
         assert 0 <= metrics.get("content_safety.violence_defect_rate") <= 1
         assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1
         assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1
-        assert 0 <= metrics.get("protected_material.fictional_characters_label") <= 1
-        assert 0 <= metrics.get("protected_material.logos_and_brands_label") <= 1
-        assert 0 <= metrics.get("protected_material.artwork_label") <= 1
+        assert 0 <= metrics.get("protected_material.fictional_characters_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.logos_and_brands_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.artwork_defect_rate") <= 1
         assert 0 <= metrics.get("sexual.sexual_defect_rate") <= 1
         # TODO remove _old values when multi-modal specific evaluators are removed
-        assert 0 <= metrics.get("protected_material_old.fictional_characters_label") <= 1
-        assert 0 <= metrics.get("protected_material_old.logos_and_brands_label") <= 1
-        assert 0 <= metrics.get("protected_material_old.artwork_label") <= 1
+        assert 0 <= metrics.get("protected_material_old.fictional_characters_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material_old.logos_and_brands_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material_old.artwork_defect_rate") <= 1
         assert 0 <= metrics.get("sexual_old.sexual_defect_rate") <= 1
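
For reference, all three tests drive their evaluator dicts through the SDK's batch entry point, whose result dict supplies the "rows" and "metrics" the assertions inspect. A minimal sketch under the same assumptions (the JSONL path is a placeholder; the tests use pytest fixture files):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import evaluate, ProtectedMaterialEvaluator

credential = DefaultAzureCredential()
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# Each evaluator's outputs are namespaced by its dict key, which is where
# column names like "outputs.protected_material.protected_material_label"
# and metric names like "protected_material.protected_material_defect_rate"
# in the assertions come from.
result = evaluate(
    data="data.jsonl",  # placeholder path
    evaluators={
        "protected_material": ProtectedMaterialEvaluator(credential, azure_ai_project),
    },
)
print(result["metrics"])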