 import pathlib
 import pandas as pd
 import pytest
+from regex import F
 
 
 from azure.ai.evaluation import (
     F1ScoreEvaluator,
-    # GleuScoreEvaluator,
+    GleuScoreEvaluator,
     BleuScoreEvaluator,
     RougeScoreEvaluator,
     MeteorScoreEvaluator,
     CoherenceEvaluator,
     FluencyEvaluator,
     RelevanceEvaluator,
-    # SimilarityEvaluator,
+    SimilarityEvaluator,
     GroundednessEvaluator,
     # QAEvaluator,
     ContentSafetyEvaluator,
@@ -74,21 +75,20 @@ class TestMassEvaluate:
     """
 
     def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
-        # qa and similarity disabled due to being playback-unfriendly due to URL sanitization problems.
-        # glue disabled due to being unfriendly to CI playback for some reason.
-        # content safety disabled temporarily to test CI PF teardown race condition
+        # qa fails in playback but ONLY when using the pf proxy for some reason, and
+        # using it without pf proxy causes CI to hang and timeout after 3 hours.
         evaluators = {
             "f1_score": F1ScoreEvaluator(),
-            # "gleu": GleuScoreEvaluator(),
+            "gleu": GleuScoreEvaluator(),
             "bleu": BleuScoreEvaluator(),
             "rouge": RougeScoreEvaluator(RougeType.ROUGE_L),
             "meteor": MeteorScoreEvaluator(),
             "grounded": GroundednessEvaluator(model_config),
             "coherence": CoherenceEvaluator(model_config),
             "fluency": FluencyEvaluator(model_config),
             "relevance": RelevanceEvaluator(model_config),
-            # "similarity": SimilarityEvaluator(model_config),
-            # "qa" : QAEvaluator(model_config),
+            "similarity": SimilarityEvaluator(model_config),
+            # "qa": QAEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
             "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
@@ -105,13 +105,13 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]
 
-        assert len(row_result_df.keys()) == 45  # 63 with gleu, qa/similarity
+        assert len(row_result_df.keys()) == 48  # 63 with qa
         assert len(row_result_df["inputs.query"]) == 3
         assert len(row_result_df["inputs.context"]) == 3
         assert len(row_result_df["inputs.response"]) == 3
         assert len(row_result_df["inputs.ground_truth"]) == 3
         assert len(row_result_df["outputs.f1_score.f1_score"]) == 3
-        # assert len(row_result_df["outputs.gleu.gleu_score"]) == 3
+        assert len(row_result_df["outputs.gleu.gleu_score"]) == 3
         assert len(row_result_df["outputs.bleu.bleu_score"]) == 3
         assert len(row_result_df["outputs.rouge.rouge_precision"]) == 3
         assert len(row_result_df["outputs.rouge.rouge_recall"]) == 3
@@ -129,23 +129,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.relevance.relevance"]) == 3
         assert len(row_result_df["outputs.relevance.gpt_relevance"]) == 3
         assert len(row_result_df["outputs.relevance.relevance_reason"]) == 3
-        # assert len(row_result_df['outputs.similarity.similarity']) == 3
-        # assert len(row_result_df['outputs.similarity.gpt_similarity']) == 3
-        # assert len(row_result_df['outputs.qa.f1_score']) == 3
-        # assert len(row_result_df['outputs.qa.groundedness']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_groundedness']) == 3
-        # assert len(row_result_df['outputs.qa.groundedness_reason']) == 3
-        # assert len(row_result_df['outputs.qa.coherence']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_coherence']) == 3
-        # assert len(row_result_df['outputs.qa.coherence_reason']) == 3
-        # assert len(row_result_df['outputs.qa.fluency']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_fluency']) == 3
-        # assert len(row_result_df['outputs.qa.fluency_reason']) == 3
-        # assert len(row_result_df['outputs.qa.relevance']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_relevance']) == 3
-        # assert len(row_result_df['outputs.qa.relevance_reason']) == 3
-        # assert len(row_result_df['outputs.qa.similarity']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_similarity']) == 3
+        assert len(row_result_df["outputs.similarity.similarity"]) == 3
+        assert len(row_result_df["outputs.similarity.gpt_similarity"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_reason"]) == 3
         assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
@@ -169,10 +154,25 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.content_safety.violence"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_score"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.f1_score"]) == 3
+        # assert len(row_result_df["outputs.qa.groundedness"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
+        # assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.coherence"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
+        # assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.fluency"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
+        # assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.relevance"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
+        # assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.similarity"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
 
-        assert len(metrics.keys()) == 25  # 39 with gleu, qa, similarity
+        assert len(metrics.keys()) == 28  # 39 with qa
         assert metrics["f1_score.f1_score"] >= 0
-        # assert metrics["gleu.gleu_score"] >= 0
+        assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
         assert metrics["rouge.rouge_precision"] >= 0
         assert metrics["rouge.rouge_recall"] >= 0
@@ -186,8 +186,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["fluency.gpt_fluency"] >= 0
         assert metrics["relevance.relevance"] >= 0
         assert metrics["relevance.gpt_relevance"] >= 0
-        # assert metrics['similarity.similarity'] >= 0
-        # assert metrics['similarity.gpt_similarity'] >= 0
+        assert metrics["similarity.similarity"] >= 0
+        assert metrics["similarity.gpt_similarity"] >= 0
         assert metrics["indirect_attack.xpia_manipulated_content"] >= 0
         assert metrics["indirect_attack.xpia_intrusion"] >= 0
         assert metrics["indirect_attack.xpia_information_gathering"] >= 0
@@ -199,17 +199,17 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
-        # assert metrics['qa.f1_score'] >= 0
-        # assert metrics['qa.groundedness'] >= 0
-        # assert metrics['qa.gpt_groundedness'] >= 0
-        # assert metrics['qa.coherence'] >= 0
-        # assert metrics['qa.gpt_coherence'] >= 0
-        # assert metrics['qa.fluency'] >= 0
-        # assert metrics['qa.gpt_fluency'] >= 0
-        # assert metrics['qa.relevance'] >= 0
-        # assert metrics['qa.gpt_relevance'] >= 0
-        # assert metrics['qa.similarity'] >= 0
-        # assert metrics['qa.gpt_similarity'] >= 0
+        # assert metrics["qa.f1_score"] >= 0
+        # assert metrics["qa.groundedness"] >= 0
+        # assert metrics["qa.gpt_groundedness"] >= 0
+        # assert metrics["qa.coherence"] >= 0
+        # assert metrics["qa.gpt_coherence"] >= 0
+        # assert metrics["qa.fluency"] >= 0
+        # assert metrics["qa.gpt_fluency"] >= 0
+        # assert metrics["qa.relevance"] >= 0
+        # assert metrics["qa.gpt_relevance"] >= 0
+        # assert metrics["qa.similarity"] >= 0
+        # assert metrics["qa.gpt_similarity"] >= 0
 
     def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
         evaluators = {
@@ -291,7 +291,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred,
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
 
-    # Imagee urls with target is disabled due to being unstable in CI
+    # Image urls with target is disabled due to being unstable in CI
     @pytest.mark.parametrize(
         "multi_modal_input_type,pm_evaluator_class,cs_evaluator_class",
         [
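
For context on what the asserts above are checking: the test feeds its evaluators dict and a JSONL data file through azure.ai.evaluation's evaluate() entry point, which returns a dict whose "rows" become the "outputs.<alias>.<metric>" columns and whose "metrics" hold the "<alias>.<metric>" aggregates. The sketch below is a minimal, assumed-input illustration (the file name data.jsonl and the two-evaluator selection are placeholders, not part of this change).

    # Minimal sketch: run two of the evaluators from this test through the
    # same evaluate() API the test asserts against.
    from azure.ai.evaluation import evaluate, F1ScoreEvaluator, BleuScoreEvaluator

    result = evaluate(
        data="data.jsonl",  # hypothetical file with query/response/ground_truth/context columns
        evaluators={
            "f1_score": F1ScoreEvaluator(),
            "bleu": BleuScoreEvaluator(),
        },
    )
    rows = result["rows"]        # per-row columns such as "outputs.f1_score.f1_score"
    metrics = result["metrics"]  # aggregates such as "f1_score.f1_score"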