PR #770 only auto-retry on 429 for now (#441, #764)

soxofaan · soxofaan · commit b97343a52598 · 2025-05-23T14:56:41.000+02:00
Automatically retrying on 5xx codes as well might interfere
too much with the soft-error handling features of existing poll loops.

Also introduce HTTP status constants for more self-descriptive code.
diff --git a/openeo/rest/_connection.py b/openeo/rest/_connection.py
@@ -13,7 +13,7 @@
 from openeo.rest import OpenEoApiError, OpenEoApiPlainError, OpenEoRestError
 from openeo.rest.auth.auth import NullAuth
 from openeo.util import ContextTimer, ensure_list, str_truncate, url_join
-from openeo.utils.http import session_with_retries
+from openeo.utils.http import HTTP_502_BAD_GATEWAY, session_with_retries
 
 _log = logging.getLogger(__name__)
 
@@ -174,7 +174,7 @@ def _raise_api_error(self, response: requests.Response):
         _log.warning(f"Failed to parse API error response: [{status_code}] {text!r} (headers: {response.headers})")
 
         # TODO: eliminate this VITO-backend specific error massaging?
-        if status_code == 502 and "Proxy Error" in text:
+        if status_code == HTTP_502_BAD_GATEWAY and "Proxy Error" in text:
             error_message = (
                 "Received 502 Proxy Error."
                 " This typically happens when a synchronous openEO processing request takes too long and is aborted."
diff --git a/openeo/rest/_testing.py b/openeo/rest/_testing.py
@@ -17,6 +17,7 @@
 
 from openeo import Connection, DataCube
 from openeo.rest.vectorcube import VectorCube
+from openeo.utils.http import HTTP_201_CREATED, HTTP_202_ACCEPTED, HTTP_204_NO_CONTENT
 
 OPENEO_BACKEND = "https://openeo.test/"
 
@@ -209,7 +210,7 @@ def _handle_post_jobs(self, request, context):
         for field in self.extra_job_metadata_fields:
             job_data[field] = post_data.get(field)
         self.batch_jobs[job_id] = job_data
-        context.status_code = 201
+        context.status_code = HTTP_201_CREATED
         context.headers["openeo-identifier"] = job_id
 
     def _get_job_id(self, request) -> str:
@@ -232,7 +233,7 @@ def _handle_post_job_results(self, request, context):
         self.batch_jobs[job_id]["status"] = self._get_job_status(
             job_id=job_id, current_status=self.batch_jobs[job_id]["status"]
         )
-        context.status_code = 202
+        context.status_code = HTTP_202_ACCEPTED
 
     def _handle_get_job(self, request, context):
         """Handler of `GET /job/{job_id}` (get batch job status and metadata)."""
@@ -270,7 +271,7 @@ def _handle_delete_job_results(self, request, context):
         job_id = self._get_job_id(request)
         self.batch_jobs[job_id]["status"] = "canceled"
         self._forced_job_status[job_id] = "canceled"
-        context.status_code = 204
+        context.status_code = HTTP_204_NO_CONTENT
 
     def _handle_get_job_result_asset(self, request, context):
         """Handler of `GET /job/{job_id}/results/result.data` (get batch job result asset)."""
diff --git a/openeo/rest/connection.py b/openeo/rest/connection.py
@@ -86,6 +86,11 @@
     load_json_resource,
     rfc3339,
 )
+from openeo.utils.http import (
+    HTTP_201_CREATED,
+    HTTP_401_UNAUTHORIZED,
+    HTTP_403_FORBIDDEN,
+)
 from openeo.utils.version import ComparableVersion
 
 __all__ = ["Connection", "connect"]
@@ -676,7 +681,10 @@ def _request():
             # Initial request attempt
             return _request()
         except OpenEoApiError as api_exc:
-            if api_exc.http_status_code in {401, 403} and api_exc.code == "TokenInvalid":
+            if (
+                api_exc.http_status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN}
+                and api_exc.code == "TokenInvalid"
+            ):
                 # Auth token expired: can we refresh?
                 if isinstance(self.auth, OidcBearerAuth) and self._oidc_auth_renewer:
                     msg = f"OIDC access token expired ({api_exc.http_status_code} {api_exc.code})."
@@ -1763,7 +1771,7 @@ def create_job(
         )
 
         self._preflight_validation(pg_with_metadata=pg_with_metadata, validate=validate)
-        response = self.post("/jobs", json=pg_with_metadata, expected_status=201)
+        response = self.post("/jobs", json=pg_with_metadata, expected_status=HTTP_201_CREATED)
 
         job_id = None
         if "openeo-identifier" in response.headers:
diff --git a/openeo/rest/job.py b/openeo/rest/job.py
@@ -27,6 +27,15 @@
 from openeo.rest.models.general import LogsResponse
 from openeo.rest.models.logs import log_level_name
 from openeo.util import ensure_dir
+from openeo.utils.http import (
+    HTTP_408_REQUEST_TIMEOUT,
+    HTTP_429_TOO_MANY_REQUESTS,
+    HTTP_500_INTERNAL_SERVER_ERROR,
+    HTTP_501_NOT_IMPLEMENTED,
+    HTTP_502_BAD_GATEWAY,
+    HTTP_503_SERVICE_UNAVAILABLE,
+    HTTP_504_GATEWAY_TIMEOUT,
+)
 
 if typing.TYPE_CHECKING:
     # Imports for type checking only (circular import issue at runtime).
@@ -37,7 +46,16 @@
 
 DEFAULT_JOB_RESULTS_FILENAME = "job-results.json"
 MAX_RETRIES_PER_RANGE = 3
-RETRIABLE_STATUSCODES = [408, 429, 500, 501, 502, 503, 504]
+RETRIABLE_STATUSCODES = [
+    HTTP_408_REQUEST_TIMEOUT,
+    HTTP_429_TOO_MANY_REQUESTS,
+    HTTP_500_INTERNAL_SERVER_ERROR,
+    HTTP_501_NOT_IMPLEMENTED,
+    HTTP_502_BAD_GATEWAY,
+    HTTP_503_SERVICE_UNAVAILABLE,
+    HTTP_504_GATEWAY_TIMEOUT,
+]
+
 
 class BatchJob:
     """
@@ -313,7 +331,7 @@ def soft_error(message: str):
                 soft_error("Connection error while polling job status: {e}".format(e=e))
                 continue
             except OpenEoApiPlainError as e:
-                if e.http_status_code in [502, 503]:
+                if e.http_status_code in [HTTP_502_BAD_GATEWAY, HTTP_503_SERVICE_UNAVAILABLE]:
                     soft_error("Service availability error while polling job status: {e}".format(e=e))
                     continue
                 else:
diff --git a/openeo/utils/http.py b/openeo/utils/http.py
@@ -8,7 +8,29 @@
 import requests.adapters
 from urllib3.util import Retry
 
-DEFAULT_RETRIES_TOTAL = 5
+# Commonly used subset of HTTP response status codes
+HTTP_100_CONTINUE = 100
+HTTP_200_OK = 200
+HTTP_201_CREATED = 201
+HTTP_202_ACCEPTED = 202
+HTTP_204_NO_CONTENT = 204
+HTTP_301_MOVED_PERMANENTLY = 301
+HTTP_302_FOUND = 302
+HTTP_400_BAD_REQUEST = 400
+HTTP_401_UNAUTHORIZED = 401
+HTTP_402_PAYMENT_REQUIRED = 402
+HTTP_403_FORBIDDEN = 403
+HTTP_404_NOT_FOUND = 404
+HTTP_408_REQUEST_TIMEOUT = 408
+HTTP_429_TOO_MANY_REQUESTS = 429
+HTTP_500_INTERNAL_SERVER_ERROR = 500
+HTTP_501_NOT_IMPLEMENTED = 501
+HTTP_502_BAD_GATEWAY = 502
+HTTP_503_SERVICE_UNAVAILABLE = 503
+HTTP_504_GATEWAY_TIMEOUT = 504
+
+
+DEFAULT_RETRIES_TOTAL = 3
 
 # On `backoff_factor`: it influences how much to sleep according to the formula:
 #     sleep = {backoff factor} * (2 ** ({consecutive errors - 1}))
@@ -20,10 +42,7 @@
 
 DEFAULT_RETRY_FORCELIST = frozenset(
     [
-        429,  # Too Many Requests
-        502,  # Bad Gateway
-        503,  # Service Unavailable
-        504,  # Gateway Timeout
+        HTTP_429_TOO_MANY_REQUESTS,
     ]
 )
 
diff --git a/tests/rest/test_job.py b/tests/rest/test_job.py
@@ -18,6 +18,13 @@
 from openeo.rest.job import BatchJob, ResultAsset
 from openeo.rest.models.general import Link
 from openeo.rest.models.logs import LogEntry
+from openeo.utils.http import (
+    HTTP_402_PAYMENT_REQUIRED,
+    HTTP_429_TOO_MANY_REQUESTS,
+    HTTP_500_INTERNAL_SERVER_ERROR,
+    HTTP_502_BAD_GATEWAY,
+    HTTP_503_SERVICE_UNAVAILABLE,
+)
 
 API_URL = "https://oeo.test"
 
@@ -320,28 +327,35 @@ def test_execute_batch_with_excessive_soft_errors(con100, requests_mock, tmpdir,
     [
         (  # Default retry settings
             None,
-            [
-                httpretty.Response(status=502, body="Bad Gateway"),
-                httpretty.Response(status=504, body="Service Unavailable"),
-            ],
+            [],
             contextlib.nullcontext(),
-            [0.1, 23, 34],
+            [23, 34],
         ),
-        (
-            # Only retry on 429 (and fail on 500)
-            {"status_forcelist": [429]},
-            [
-                httpretty.Response(status=500, body="Internal Server Error"),
-            ],
+        (  # Default config with a generic 500 error
+            None,
+            [httpretty.Response(status=HTTP_500_INTERNAL_SERVER_ERROR, body="Internal Server Error")],
             pytest.raises(OpenEoApiPlainError, match=re.escape("[500] Internal Server Error")),
-            [0.1, 23],
+            [23],
+        ),
+        (  # Default config with a 503 error (skipped by soft error feature of execute_batch poll loop)
+            None,
+            [httpretty.Response(status=HTTP_503_SERVICE_UNAVAILABLE, body="Service Unavailable")],
+            contextlib.nullcontext(),
+            [23, 12.34, 34],
         ),
         (
-            # No retry setup
+            # Explicit status_forcelist with custom status code to retry
+            {"status_forcelist": [HTTP_429_TOO_MANY_REQUESTS, HTTP_402_PAYMENT_REQUIRED]},
+            [httpretty.Response(status=HTTP_402_PAYMENT_REQUIRED, body="Payment Required")],
+            contextlib.nullcontext(),
+            [23, 34],
+        ),
+        (
+            # No retry setup: also fail on 429
             False,
             [],
             pytest.raises(OpenEoApiPlainError, match=re.escape("[429] Too Many Requests")),
-            [0.1],
+            [],
         ),
     ],
 )
@@ -401,12 +415,17 @@ def test_execute_batch_retry_after_429_too_many_requests(
 
     con = openeo.connect(API_URL, retry=retry_config)
 
+    max_poll_interval = 0.1
+    connection_retry_interval = 12.34
     with mock.patch("time.sleep") as sleep_mock:
         job = con.load_collection("SENTINEL2").create_job()
         with expectation_context:
-            job.start_and_wait(max_poll_interval=0.1)
+            job.start_and_wait(max_poll_interval=max_poll_interval, connection_retry_interval=connection_retry_interval)
 
-    assert sleep_mock.call_args_list == dirty_equals.Contains(*(mock.call(s) for s in expected_sleeps))
+    # Check retry related sleeps
+    actual_sleeps = [args[0] for args, kwargs in sleep_mock.call_args_list]
+    actual_sleeps = [s for s in actual_sleeps if s != max_poll_interval]
+    assert actual_sleeps == expected_sleeps
 
 
 class LogGenerator:
diff --git a/tests/utils/test_http.py b/tests/utils/test_http.py
@@ -40,8 +40,7 @@ def test_default_basic(self, time_sleep):
             (1, [], True),
             (2, [5], True),
             (3, [5, 10], True),
-            (5, [5, 10, 20, 40], True),
-            (6, [5, 10, 20, 40], False),
+            (4, [5, 10], False),
         ],
     )
     def test_default_multiple_attempts(self, time_sleep, fail_count, expected_sleeps, success):