From 9f05e09e62a9b7749e0d4c54104189c7b0a6028f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Mon, 20 Oct 2025 14:14:56 +0000 Subject: [PATCH 1/6] chore(ci_visibility): retry Test Optimization API calls on server errors --- ddtrace/internal/ci_visibility/_api_client.py | 22 ++++++++++++++++++- .../ci_visibility/telemetry/api_request.py | 4 ++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/ddtrace/internal/ci_visibility/_api_client.py b/ddtrace/internal/ci_visibility/_api_client.py index e2a010dc676..e34a4b9491f 100644 --- a/ddtrace/internal/ci_visibility/_api_client.py +++ b/ddtrace/internal/ci_visibility/_api_client.py @@ -51,6 +51,7 @@ from ddtrace.internal.utils.http import Response from ddtrace.internal.utils.http import get_connection from ddtrace.internal.utils.http import verify_url +from ddtrace.internal.utils.retry import fibonacci_backoff_with_jitter from ddtrace.internal.utils.time import StopWatch @@ -81,6 +82,22 @@ class TestVisibilitySkippableItemsError(Exception): pass +class CIVisibilityAPIError(Exception): + def __init__(self, status: int) -> None: + self.status = status + + +class CIVisibilityAPIClientError(CIVisibilityAPIError): + pass + + +class CIVisibilityAPIServerError(CIVisibilityAPIError): + pass + + +_RETRIABLE_ERRORS = (*_NETWORK_ERRORS, CIVisibilityAPIServerError) + + @dataclasses.dataclass(frozen=True) class EarlyFlakeDetectionSettings: enabled: bool = False @@ -300,6 +317,7 @@ def _do_request(self, method: str, endpoint: str, payload: str, timeout: t.Optio if conn is not None: conn.close() + @fibonacci_backoff_with_jitter(attempts=5, until=lambda e: not isinstance(e, _RETRIABLE_ERRORS)) def _do_request_with_telemetry( self, method: str, @@ -342,7 +360,9 @@ def _do_request_with_telemetry( error_type = ERROR_TYPES.CODE_4XX if response.status < 500 else ERROR_TYPES.CODE_5XX if response.status == 403: raise CIVisibilityAuthenticationException() - raise ValueError("API response status code: %d", response.status) + if response.status >= 500: + raise CIVisibilityAPIServerError(response.status) + raise CIVisibilityAPIClientError(response.status) try: sw.stop() # Stop the timer before parsing the response response_body = response.body diff --git a/ddtrace/internal/ci_visibility/telemetry/api_request.py b/ddtrace/internal/ci_visibility/telemetry/api_request.py index 77f3ea5f626..cb1caab20d2 100644 --- a/ddtrace/internal/ci_visibility/telemetry/api_request.py +++ b/ddtrace/internal/ci_visibility/telemetry/api_request.py @@ -25,7 +25,7 @@ def record_api_request( error: Optional[ERROR_TYPES] = None, ): log.debug( - "Recording early flake detection telemetry for %s: %s, %s, %s", + "Recording Test Optimization telemetry for %s: %s, %s, %s", metric_names.count, duration, response_bytes, @@ -47,5 +47,5 @@ def record_api_request( def record_api_request_error(error_metric_name: str, error: ERROR_TYPES): - log.debug("Recording early flake detection request error telemetry: %s", error) + log.debug("Recording Test Optimization request error telemetry: %s", error) telemetry_writer.add_count_metric(TELEMETRY_NAMESPACE.CIVISIBILITY, error_metric_name, 1, (("error_type", error),)) From 4c6c872093634195a4a594eadc4b1d19dd95ec29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Mon, 20 Oct 2025 14:47:55 +0000 Subject: [PATCH 2/6] add tests --- ddtrace/internal/ci_visibility/_api_client.py | 3 + .../test_ci_visibility_api_client.py | 63 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/ddtrace/internal/ci_visibility/_api_client.py b/ddtrace/internal/ci_visibility/_api_client.py index e34a4b9491f..a27fecfa77f 100644 --- a/ddtrace/internal/ci_visibility/_api_client.py +++ b/ddtrace/internal/ci_visibility/_api_client.py @@ -86,6 +86,9 @@ class CIVisibilityAPIError(Exception): def __init__(self, status: int) -> None: self.status = status + def __str__(self) -> str: + return f"Error calling Test Optimization API (status: {self.status})" + class CIVisibilityAPIClientError(CIVisibilityAPIError): pass diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client.py index ca085a85ed4..5ce4ba973e3 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client.py @@ -8,6 +8,7 @@ from ddtrace.ext.test_visibility import _get_default_test_visibility_contrib_config from ddtrace.internal.ci_visibility import CIVisibility from ddtrace.internal.ci_visibility._api_client import AgentlessTestVisibilityAPIClient +from ddtrace.internal.ci_visibility._api_client import CIVisibilityAPIServerError from ddtrace.internal.ci_visibility._api_client import EVPProxyTestVisibilityAPIClient from ddtrace.internal.ci_visibility._api_client import ITRData from ddtrace.internal.ci_visibility._api_client import TestVisibilityAPISettings @@ -222,6 +223,68 @@ def test_civisibility_api_client_settings_do_request_call_optionals( itr_skipping_level, git_data=git_data, dd_service=dd_service, dd_env=dd_env ) + def test_civisibility_api_client_settings_retry_on_errors(self): + """Tests that the API call to the settings endpoint is retried in case of server errors.""" + client = self._get_test_client( + itr_skipping_level=ITR_SKIPPING_LEVEL.TEST, + api_key="my_api_key", + dd_service=None, + dd_env=None, + git_data=self.git_data_parameters[0], + ) + with mock.patch.object( + client, + "_do_request", + side_effect=[ + _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=500), + _get_setting_api_response(), + ], + ) as mock_do_request: + with mock.patch("ddtrace.internal.utils.retry.sleep"): + settings = client.fetch_settings(read_from_cache=False) + + assert settings == TestVisibilityAPISettings() + + assert mock_do_request.call_count == 3 + for call_args, _ in mock_do_request.call_args_list: + assert call_args[0] == "POST" + assert json.loads(call_args[2]) == self._get_expected_do_request_setting_payload( + ITR_SKIPPING_LEVEL.TEST, git_data=self.git_data_parameters[0], dd_service=None, dd_env=None + ) + + def test_civisibility_api_client_settings_fail_after_5_retries(self): + """Tests that the API call to the settings endpoint is retried in case of server errors.""" + client = self._get_test_client( + itr_skipping_level=ITR_SKIPPING_LEVEL.TEST, + api_key="my_api_key", + dd_service=None, + dd_env=None, + git_data=self.git_data_parameters[0], + ) + with mock.patch.object( + client, + "_do_request", + side_effect=[ + _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=500), + _get_setting_api_response(), + ], + ) as mock_do_request: + with mock.patch("ddtrace.internal.utils.retry.sleep"): + with pytest.raises(CIVisibilityAPIServerError): + _ = client.fetch_settings(read_from_cache=False) + + assert mock_do_request.call_count == 5 + for call_args, _ in mock_do_request.call_args_list: + assert call_args[0] == "POST" + assert json.loads(call_args[2]) == self._get_expected_do_request_setting_payload( + ITR_SKIPPING_LEVEL.TEST, git_data=self.git_data_parameters[0], dd_service=None, dd_env=None + ) + @pytest.mark.parametrize("client_timeout", [None, 5]) @pytest.mark.parametrize("request_timeout", [None, 10]) @pytest.mark.parametrize( From fbf4fae096f54d839d47e3f70018e2c1e8999a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Wed, 22 Oct 2025 10:04:28 +0000 Subject: [PATCH 3/6] test with more error types --- .../api_client/test_ci_visibility_api_client.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client.py index 5ce4ba973e3..7ac44254cfa 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client.py @@ -1,5 +1,7 @@ from contextlib import contextmanager +from http.client import RemoteDisconnected import json +import socket from unittest import mock import pytest @@ -237,7 +239,7 @@ def test_civisibility_api_client_settings_retry_on_errors(self): "_do_request", side_effect=[ _get_setting_api_response(status_code=500), - _get_setting_api_response(status_code=500), + TimeoutError(), _get_setting_api_response(), ], ) as mock_do_request: @@ -267,10 +269,10 @@ def test_civisibility_api_client_settings_fail_after_5_retries(self): "_do_request", side_effect=[ _get_setting_api_response(status_code=500), - _get_setting_api_response(status_code=500), - _get_setting_api_response(status_code=500), - _get_setting_api_response(status_code=500), - _get_setting_api_response(status_code=500), + _get_setting_api_response(status_code=504), + TimeoutError(), + RemoteDisconnected(), + socket.timeout(), _get_setting_api_response(), ], ) as mock_do_request: From 2c18cccb63ac5078274007122cf8271b8f26395f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Wed, 22 Oct 2025 12:55:28 +0000 Subject: [PATCH 4/6] fix some tests --- .../test_ci_visibility_api_client_setting_responses.py | 4 ++-- .../test_ci_visibility_api_client_skippable_responses.py | 4 +++- ...test_ci_visibility_api_client_test_management_responses.py | 4 +++- .../test_ci_visibility_api_client_unique_tests_responses.py | 4 +++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client_setting_responses.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client_setting_responses.py index d42e143bf20..e8e0cf06711 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client_setting_responses.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client_setting_responses.py @@ -138,8 +138,8 @@ def test_civisibility_api_client_setting_parsed(self, setting_response, expected def test_civisibility_api_client_setting_errors(self, do_request_side_effect, expected_exception): """Tests that the client reports errors correctly based on the API response""" client = self._get_test_client() - with mock.patch.object(client, "_do_request", side_effect=[do_request_side_effect]), pytest.raises( + with mock.patch.object(client, "_do_request", side_effect=do_request_side_effect), pytest.raises( expected_exception - ): + ), mock.patch("ddtrace.internal.utils.retry.sleep"): settings = client.fetch_settings(read_from_cache=False) assert settings is None diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client_skippable_responses.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client_skippable_responses.py index 5ebe8667319..6b78d986c24 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client_skippable_responses.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client_skippable_responses.py @@ -577,6 +577,8 @@ def test_civisibility_api_client_skippable_parsed_covered_files(self, skippable_ def test_civisibility_api_client_skippable_errors(self, do_request_side_effect): """Tests that the client reports errors correctly gives a None item without crashing""" client = self._get_test_client() - with mock.patch.object(client, "_do_request", side_effect=[do_request_side_effect]): + with mock.patch.object(client, "_do_request", side_effect=do_request_side_effect), mock.patch( + "ddtrace.internal.utils.retry.sleep" + ): skippable_items = client.fetch_skippable_items(read_from_cache=False) assert skippable_items is None diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client_test_management_responses.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client_test_management_responses.py index 3d95f0cafe4..3b81192e09e 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client_test_management_responses.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client_test_management_responses.py @@ -119,6 +119,8 @@ def test_api_client_test_management_tests_parsed(self): def test_api_client_test_management_tests_errors(self, do_request_side_effect): """Tests that the client correctly handles errors in the Test Management test API response""" client = self._get_test_client() - with mock.patch.object(client, "_do_request", side_effect=[do_request_side_effect]): + with mock.patch.object(client, "_do_request", side_effect=do_request_side_effect), mock.patch( + "ddtrace.internal.utils.retry.sleep" + ): settings = client.fetch_test_management_tests(read_from_cache=False) assert settings is None diff --git a/tests/ci_visibility/api_client/test_ci_visibility_api_client_unique_tests_responses.py b/tests/ci_visibility/api_client/test_ci_visibility_api_client_unique_tests_responses.py index 04b260947ff..48ce759775b 100644 --- a/tests/ci_visibility/api_client/test_ci_visibility_api_client_unique_tests_responses.py +++ b/tests/ci_visibility/api_client/test_ci_visibility_api_client_unique_tests_responses.py @@ -102,6 +102,8 @@ def test_civisibility_api_client_known_tests_parsed(self, known_test_response, e def test_civisibility_api_client_known_tests_errors(self, do_request_side_effect): """Tests that the client correctly handles errors in the known test API response""" client = self._get_test_client() - with mock.patch.object(client, "_do_request", side_effect=[do_request_side_effect]): + with mock.patch.object(client, "_do_request", side_effect=do_request_side_effect), mock.patch( + "ddtrace.internal.utils.retry.sleep" + ): settings = client.fetch_known_tests(read_from_cache=False) assert settings is None From 50d55924e993fd52d7471d79fc6b4579fc1dc403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Wed, 22 Oct 2025 12:55:59 +0000 Subject: [PATCH 5/6] itr:noskip From 0e467e65ce5b55517bc40852de4c0c4bc8096399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20De=20Ara=C3=BAjo?= Date: Wed, 22 Oct 2025 12:56:25 +0000 Subject: [PATCH 6/6] itr:noskip