diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
index d130bac190..392de18fc2 100644
--- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
+++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
@@ -20,11 +20,11 @@ import pytest
 
 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 
-UNIVERSAL_AR_METRIC = types.EvaluationRunMetric(
-    metric="universal_ar_v1",
+GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
+    metric="general_quality_v1",
     metric_config=types.UnifiedMetric(
         predefined_metric_spec=types.PredefinedMetricSpec(
-            metric_spec_name="universal_ar_v1",
+            metric_spec_name="general_quality_v1",
         )
     ),
 )
@@ -71,7 +71,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
         ),
         dest=GCS_DEST,
         metrics=[
-            UNIVERSAL_AR_METRIC,
+            GENERAL_QUALITY_METRIC,
             types.RubricMetric.FINAL_RESPONSE_QUALITY,
             LLM_METRIC,
         ],
@@ -94,7 +94,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
     )
     assert evaluation_run.inference_configs[
         "agent-1"
     ]
@@ -131,7 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,7 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -161,6 +161,43 @@
     assert evaluation_run.error is None
 
 
+def test_create_eval_run_with_inference_configs(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
+    client._api_client._http_options.api_version = "v1beta1"
+    inference_config = types.EvaluationRunInferenceConfig(
+        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test_inference_config",
+        display_name="test_inference_config",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        inference_configs={"model_1": inference_config},
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_inference_config"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs["model_1"] == inference_config
+    assert evaluation_run.labels == {
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 # Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
@@ -217,7 +254,7 @@
 #             eval_dataset_df=input_df,
 #         ),
 #         dest=GCS_DEST,
-#         metrics=[UNIVERSAL_AR_METRIC],
+#         metrics=[GENERAL_QUALITY_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -278,7 +315,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -295,7 +332,7 @@
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
-        metrics=[UNIVERSAL_AR_METRIC],
+        metrics=[GENERAL_QUALITY_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
@@ -304,6 +341,44 @@
     assert evaluation_run.error is None
 
 
+@pytest.mark.asyncio
+async def test_create_eval_run_async_with_inference_configs(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
+    client._api_client._http_options.api_version = "v1beta1"
+    inference_config = types.EvaluationRunInferenceConfig(
+        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+    )
+    evaluation_run = await client.aio.evals.create_evaluation_run(
+        name="test_inference_config_async",
+        display_name="test_inference_config_async",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        inference_configs={"model_1": inference_config},
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_inference_config_async"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs["model_1"] == inference_config
+    assert evaluation_run.labels == {
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index d968b4e7c3..afbf148025 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -1581,6 +1581,9 @@ def create_evaluation_run(
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[types.evals.AgentInfoOrDict] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
@@ -1593,12 +1596,18 @@
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
             agent_info: The agent info to evaluate.
+            inference_configs: The candidate to inference config map for the evaluation run.
+                If provided, agent_info must be None.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.
 
         Returns:
             The created evaluation run.
         """
+        if agent_info and inference_configs:
+            raise ValueError(
+                "At most one of agent_info or inference_configs can be provided."
+            )
         if agent_info and isinstance(agent_info, dict):
             agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
@@ -1630,8 +1639,8 @@
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        inference_configs = {}
         if agent_info:
+            inference_configs = {}
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(
@@ -2429,6 +2438,9 @@
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[types.evals.AgentInfo] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
@@ -2441,12 +2453,18 @@
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
             agent_info: The agent info to evaluate.
+            inference_configs: The candidate to inference config map for the evaluation run.
+                If provided, agent_info must be None.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.
 
         Returns:
             The created evaluation run.
         """
+        if agent_info and inference_configs:
+            raise ValueError(
+                "At most one of agent_info or inference_configs can be provided."
+            )
         if agent_info and isinstance(agent_info, dict):
             agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
@@ -2477,8 +2495,8 @@
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        inference_configs = {}
         if agent_info:
+            inference_configs = {}
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
                     developer_instruction=genai_types.Content(
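For context on the API change above: inference_configs maps a candidate name to an EvaluationRunInferenceConfig (or an equivalent dict) and is mutually exclusive with agent_info. Below is a minimal usage sketch mirroring the new tests, assuming a client created via vertexai.Client as in the SDK's test setup; the project, bucket, model, and evaluation-set resource names are placeholders.

# Usage sketch for the new inference_configs parameter (illustrative only).
# Project, bucket, model, and evaluation-set names are placeholders; the
# replay tests above pin the API version to "v1beta1".
import vertexai
from vertexai._genai import types

# Assumes the client-based Vertex AI SDK surface used by these tests.
client = vertexai.Client(project="my-project", location="us-central1")

general_quality_metric = types.EvaluationRunMetric(
    metric="general_quality_v1",
    metric_config=types.UnifiedMetric(
        predefined_metric_spec=types.PredefinedMetricSpec(
            metric_spec_name="general_quality_v1",
        )
    ),
)

# Each key names a candidate; each value says how its responses are generated.
# Per the new validation, passing both inference_configs and agent_info
# raises ValueError.
inference_configs = {
    "model_1": types.EvaluationRunInferenceConfig(
        model=(
            "projects/my-project/locations/us-central1/"
            "publishers/google/models/gemini-2.5-flash"
        )
    ),
}

evaluation_run = client.evals.create_evaluation_run(
    name="example-eval-run",
    display_name="example-eval-run",
    dataset=types.EvaluationRunDataSource(
        evaluation_set=(
            "projects/my-project/locations/us-central1/evaluationSets/1234567890"
        )
    ),
    dest="gs://my-bucket/eval_run_output",
    metrics=[general_quality_metric],
    inference_configs=inference_configs,
    labels={"team": "eval"},
)
print(evaluation_run.state)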