Skip to content

Commit cb1506c

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client (evals) - Add retry to predefined metric
PiperOrigin-RevId: 825757088
1 parent e600277 commit cb1506c

File tree

2 files changed

+136
-3
lines changed

2 files changed

+136
-3
lines changed

tests/unit/vertexai/genai/test_evals.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from vertexai._genai import evals
3434
from vertexai._genai import types as vertexai_genai_types
3535
from google.genai import client
36+
from google.genai import errors as genai_errors
3637
from google.genai import types as genai_types
3738
import pandas as pd
3839
import pytest
@@ -4861,6 +4862,114 @@ def test_execute_evaluation_adds_creation_timestamp(
48614862
assert result.metadata is not None
48624863
assert result.metadata.creation_timestamp == mock_now
48634864

4865+
@mock.patch(
4866+
"vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
4867+
frozenset(["summarization_quality"]),
4868+
)
4869+
@mock.patch("time.sleep", return_value=None)
4870+
@mock.patch(
4871+
"vertexai._genai.evals.Evals._evaluate_instances"
4872+
)
4873+
def test_predefined_metric_retry_on_resource_exhausted(
4874+
self,
4875+
mock_private_evaluate_instances,
4876+
mock_sleep,
4877+
mock_api_client_fixture,
4878+
):
4879+
dataset_df = pd.DataFrame(
4880+
[{"prompt": "Test prompt", "response": "Test response"}]
4881+
)
4882+
input_dataset = vertexai_genai_types.EvaluationDataset(
4883+
eval_dataset_df=dataset_df
4884+
)
4885+
metric = vertexai_genai_types.Metric(name="summarization_quality")
4886+
metric_result = vertexai_genai_types.MetricResult(
4887+
score=0.9,
4888+
explanation="Mocked predefined explanation",
4889+
rubric_verdicts=[],
4890+
error=None,
4891+
)
4892+
error_response_json = {
4893+
"error": {
4894+
"code": 429,
4895+
"message": ("Judge model resource exhausted. Please try again later."),
4896+
"status": "RESOURCE_EXHAUSTED",
4897+
}
4898+
}
4899+
mock_private_evaluate_instances.side_effect = [
4900+
genai_errors.ClientError(code=429, response_json=error_response_json),
4901+
genai_errors.ClientError(code=429, response_json=error_response_json),
4902+
vertexai_genai_types.EvaluateInstancesResponse(
4903+
metric_results=[metric_result]
4904+
),
4905+
]
4906+
4907+
result = _evals_common._execute_evaluation(
4908+
api_client=mock_api_client_fixture,
4909+
dataset=input_dataset,
4910+
metrics=[metric],
4911+
)
4912+
4913+
assert mock_private_evaluate_instances.call_count == 3
4914+
assert mock_sleep.call_count == 2
4915+
assert len(result.summary_metrics) == 1
4916+
summary_metric = result.summary_metrics[0]
4917+
assert summary_metric.metric_name == "summarization_quality"
4918+
assert summary_metric.mean_score == 0.9
4919+
4920+
@mock.patch(
4921+
"vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
4922+
frozenset(["summarization_quality"]),
4923+
)
4924+
@mock.patch("time.sleep", return_value=None)
4925+
@mock.patch(
4926+
"vertexai._genai.evals.Evals._evaluate_instances"
4927+
)
4928+
def test_predefined_metric_retry_fail_on_resource_exhausted(
4929+
self,
4930+
mock_private_evaluate_instances,
4931+
mock_sleep,
4932+
mock_api_client_fixture,
4933+
):
4934+
dataset_df = pd.DataFrame(
4935+
[{"prompt": "Test prompt", "response": "Test response"}]
4936+
)
4937+
input_dataset = vertexai_genai_types.EvaluationDataset(
4938+
eval_dataset_df=dataset_df
4939+
)
4940+
error_response_json = {
4941+
"error": {
4942+
"code": 429,
4943+
"message": ("Judge model resource exhausted. Please try again later."),
4944+
"status": "RESOURCE_EXHAUSTED",
4945+
}
4946+
}
4947+
metric = vertexai_genai_types.Metric(name="summarization_quality")
4948+
mock_private_evaluate_instances.side_effect = [
4949+
genai_errors.ClientError(code=429, response_json=error_response_json),
4950+
genai_errors.ClientError(code=429, response_json=error_response_json),
4951+
genai_errors.ClientError(code=429, response_json=error_response_json),
4952+
]
4953+
4954+
result = _evals_common._execute_evaluation(
4955+
api_client=mock_api_client_fixture,
4956+
dataset=input_dataset,
4957+
metrics=[metric],
4958+
)
4959+
4960+
assert mock_private_evaluate_instances.call_count == 3
4961+
assert mock_sleep.call_count == 2
4962+
assert len(result.summary_metrics) == 1
4963+
summary_metric = result.summary_metrics[0]
4964+
assert summary_metric.metric_name == "summarization_quality"
4965+
assert summary_metric.mean_score is None
4966+
assert summary_metric.num_cases_error == 1
4967+
assert (
4968+
"Judge model resource exhausted after 3 retries"
4969+
) in result.eval_case_results[0].response_candidate_results[0].metric_results[
4970+
"summarization_quality"
4971+
].error_message
4972+
48644973

48654974
class TestEvaluationDataset:
48664975
"""Contains set of tests for the EvaluationDataset class methods."""

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
import json
2121
import logging
2222
import statistics
23+
import time
2324
from typing import Any, Callable, Optional, TypeVar, Union
2425

26+
from google.genai import errors as genai_errors
2527
from google.genai import _common
2628
from google.genai import types as genai_types
2729
from tqdm import tqdm
@@ -34,6 +36,7 @@
3436

3537

3638
logger = logging.getLogger(__name__)
39+
_MAX_RETRIES = 3
3740

3841

3942
def _extract_text_from_content(
@@ -964,9 +967,30 @@ def get_metric_result(
964967
metric_name = self.metric.name
965968
try:
966969
payload = self._build_request_payload(eval_case, response_index)
967-
api_response = self.module._evaluate_instances(
968-
metrics=[self.metric], instance=payload.get("instance")
969-
)
970+
for attempt in range(_MAX_RETRIES):
971+
try:
972+
api_response = self.module._evaluate_instances(
973+
metrics=[self.metric], instance=payload.get("instance")
974+
)
975+
break
976+
except genai_errors.ClientError as e:
977+
if e.code == 429:
978+
logger.warning(
979+
"Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
980+
" seconds...",
981+
attempt + 1,
982+
_MAX_RETRIES,
983+
e,
984+
2**attempt,
985+
)
986+
if attempt == _MAX_RETRIES - 1:
987+
return types.EvalCaseMetricResult(
988+
metric_name=metric_name,
989+
error_message=f"Judge model resource exhausted after {_MAX_RETRIES} retries: {e}",
990+
)
991+
time.sleep(2**attempt)
992+
else:
993+
raise e
970994

971995
if (
972996
api_response

0 commit comments

Comments (0)