@@ -88,8 +88,18 @@ class TestMassEvaluate:
     - Multi-modal inputs: This one has some parameters for the different types of multi-modal inputs.
     """
 
-    @pytest.mark.skipif(not is_live(), reason="Skip in playback due to inconsistency in evaluation results.")
-    def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
+    @pytest.mark.parametrize(
+        ("proj_scope", "cred", "conv", "m_config"),
+        (
+            ("project_scope", "azure_cred", "data_file", "model_config"),
+            # ("project_scope_onedp", "azure_cred_onedp", "data_file", "model_config_onedp"),
+        )
+    )
+    def test_evaluate_singleton_inputs(self, request, proj_scope, cred, conv, m_config):
+        project_scope = request.getfixturevalue(proj_scope)
+        azure_cred = request.getfixturevalue(cred)
+        data_file = request.getfixturevalue(conv)
+        model_config = request.getfixturevalue(m_config)
         # qa fails in playback but ONLY when using the pf proxy for some reason, and
         # using it without pf proxy causes CI to hang and timeout after 3 hours.
         evaluators = {
@@ -184,7 +194,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.qa.similarity"]) == 3
         assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
 
-        assert len(metrics.keys()) == 62
+        assert len(metrics.keys()) == 76
         assert metrics["f1_score.f1_score"] >= 0
         assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
@@ -225,7 +235,19 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["qa.similarity"] >= 0
         assert metrics["qa.gpt_similarity"] >= 0
 
-    def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
+    @pytest.mark.parametrize(
+        ("proj_scope", "cred", "conv", "m_config"),
+        (
+            ("project_scope", "azure_cred", "data_convo_file", "model_config"),
+            # ("project_scope_onedp", "azure_cred_onedp", "data_convo_file", "model_config_onedp"),
+        )
+    )
+    def test_evaluate_conversation(self, request, proj_scope, cred, conv, m_config):
+        project_scope = request.getfixturevalue(proj_scope)
+        azure_cred = request.getfixturevalue(cred)
+        data_convo_file = request.getfixturevalue(conv)
+        model_config = request.getfixturevalue(m_config)
+
         evaluators = {
             "grounded": GroundednessEvaluator(model_config),
             "coherence": CoherenceEvaluator(model_config),