
Commit 33fe72a

jsondai authored and copybara-github committed
feat: GenAI Client(evals) - Add support for inference_configs in create_evaluation_run.
PiperOrigin-RevId: 856324409
1 parent b1b900e commit 33fe72a

2 files changed: +111 −12 lines

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 85 additions & 10 deletions
@@ -20,11 +20,11 @@
 import pytest

 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
-UNIVERSAL_AR_METRIC = types.EvaluationRunMetric(
-    metric="universal_ar_v1",
+GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
+    metric="general_quality_v1",
     metric_config=types.UnifiedMetric(
         predefined_metric_spec=types.PredefinedMetricSpec(
-            metric_spec_name="universal_ar_v1",
+            metric_spec_name="general_quality_v1",
         )
     ),
 )
@@ -71,7 +71,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
         ),
         dest=GCS_DEST,
         metrics=[
-            UNIVERSAL_AR_METRIC,
+            GENERAL_QUALITY_METRIC,
             types.RubricMetric.FINAL_RESPONSE_QUALITY,
             LLM_METRIC,
         ],
@@ -94,7 +94,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
     )
     assert evaluation_run.inference_configs[
         "agent-1"
@@ -131,7 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,7 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -161,6 +161,43 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


+def test_create_eval_run_with_inference_configs(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
+    client._api_client._http_options.api_version = "v1beta1"
+    inference_config = types.EvaluationRunInferenceConfig(
+        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test_inference_config",
+        display_name="test_inference_config",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        inference_configs={"model_1": inference_config},
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_inference_config"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs["model_1"] == inference_config
+    assert evaluation_run.labels == {
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 # Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
@@ -217,7 +254,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 # eval_dataset_df=input_df,
 # ),
 # dest=GCS_DEST,
-# metrics=[UNIVERSAL_AR_METRIC],
+# metrics=[GENERAL_QUALITY_METRIC],
 # )
 # assert isinstance(evaluation_run, types.EvaluationRun)
 # assert evaluation_run.display_name == "test6"
@@ -278,7 +315,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -295,7 +332,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
@@ -304,6 +341,44 @@ async def test_create_eval_run_async(client):
     assert evaluation_run.error is None


+@pytest.mark.asyncio
+async def test_create_eval_run_async_with_inference_configs(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
+    client._api_client._http_options.api_version = "v1beta1"
+    inference_config = types.EvaluationRunInferenceConfig(
+        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+    )
+    evaluation_run = await client.aio.evals.create_evaluation_run(
+        name="test_inference_config_async",
+        display_name="test_inference_config_async",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        inference_configs={"model_1": inference_config},
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_inference_config_async"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs["model_1"] == inference_config
+    assert evaluation_run.labels == {
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),

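For orientation, a minimal usage sketch of the new inference_configs argument, mirroring test_create_eval_run_with_inference_configs above. This is not part of the commit: it assumes the Vertex AI GenAI SDK preview client (vertexai.Client) and that the types module is importable as vertexai.types; the project, location, bucket, evaluation set ID, and labels are placeholders.

# Hedged sketch, not from this commit: pass a candidate-name -> inference-config
# map instead of agent_info. All resource names below are placeholders.
import vertexai
from vertexai import types  # assumption: types re-exported at this path

client = vertexai.Client(project="my-project", location="us-central1")

inference_config = types.EvaluationRunInferenceConfig(
    model="publishers/google/models/gemini-2.5-flash"  # placeholder model name
)

evaluation_run = client.evals.create_evaluation_run(
    name="my-eval-run",
    display_name="my-eval-run",
    dataset=types.EvaluationRunDataSource(
        evaluation_set="projects/PROJECT/locations/us-central1/evaluationSets/EVAL_SET_ID"
    ),
    dest="gs://my-bucket/eval_run_output",
    metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],
    inference_configs={"candidate-1": inference_config},  # candidate name -> config
    labels={"team": "evals"},
)
print(evaluation_run.state)  # expected to start as PENDING per the tests above
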
vertexai/_genai/evals.py

Lines changed: 26 additions & 2 deletions
@@ -1581,6 +1581,9 @@ def create_evaluation_run(
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[types.evals.AgentInfoOrDict] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
@@ -1593,12 +1596,21 @@ def create_evaluation_run(
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
             agent_info: The agent info to evaluate.
+            inference_configs: The candidate to inference config map for the evaluation run.
+                The key is the candidate name, and the value is the inference config.
+                If provided, agent_info must be None.
+                Example:
+                    {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")}
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and inference_configs:
+            raise ValueError(
+                "At most one of agent_info or inference_configs can be provided."
+            )
         if agent_info and isinstance(agent_info, dict):
             agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
@@ -1630,8 +1642,8 @@ def create_evaluation_run(
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        inference_configs = {}
         if agent_info:
+            inference_configs = {}
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(
@@ -2429,6 +2441,9 @@ async def create_evaluation_run(
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[types.evals.AgentInfo] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
@@ -2441,12 +2456,21 @@ async def create_evaluation_run(
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
             agent_info: The agent info to evaluate.
+            inference_configs: The candidate to inference config map for the evaluation run.
+                The key is the candidate name, and the value is the inference config.
+                If provided, agent_info must be None.
+                Example:
+                    {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")}
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and inference_configs:
+            raise ValueError(
+                "At most one of agent_info or inference_configs can be provided."
+            )
         if agent_info and isinstance(agent_info, dict):
             agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
@@ -2477,8 +2501,8 @@ async def create_evaluation_run(
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        inference_configs = {}
         if agent_info:
+            inference_configs = {}
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(

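A second hedged sketch (same client and import assumptions, placeholder resource names): per the diff above, agent_info and inference_configs are now mutually exclusive, and the async surface (client.aio.evals) accepts the same argument.

# Hedged sketch, not from this commit: the async path and the new guard.
import asyncio

import vertexai
from vertexai import types  # assumption: types re-exported at this path

client = vertexai.Client(project="my-project", location="us-central1")

# Supplying both agent_info and inference_configs now raises:
#   ValueError: At most one of agent_info or inference_configs can be provided.

async def main() -> None:
    run = await client.aio.evals.create_evaluation_run(
        name="my-async-eval-run",
        display_name="my-async-eval-run",
        dataset=types.EvaluationRunDataSource(
            evaluation_set="projects/PROJECT/locations/us-central1/evaluationSets/EVAL_SET_ID"
        ),
        dest="gs://my-bucket/eval_run_output",
        metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],
        inference_configs={
            "candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")
        },
    )
    print(run.name, run.state)

asyncio.run(main())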