|
| 1 | +# Original: https://github.com/googleapis/python-bigquery/blob/main/google/cloud/bigquery/retry.py |
| 2 | +# Copyright 2018 Google LLC |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | + |
| 16 | +from google.api_core import exceptions, retry |
| 17 | +import google.api_core.future.polling |
| 18 | +from google.auth import exceptions as auth_exceptions # type: ignore |
| 19 | +import requests.exceptions |
| 20 | + |
# Structured error "reason" codes that indicate a transient, retryable
# condition (server hiccups or rate limiting) rather than a bad request.
_RETRYABLE_REASONS = frozenset(
    ["rateLimitExceeded", "backendError", "internalError", "badGateway"]
)

# Exception types that are retryable even when no structured error payload
# is available, e.g. failures raised by the transport layer or the Google
# Front End (GFE) before the BigQuery API produced a response body.
_UNSTRUCTURED_RETRYABLE_TYPES = (
    ConnectionError,
    exceptions.TooManyRequests,
    exceptions.InternalServerError,
    exceptions.BadGateway,
    exceptions.ServiceUnavailable,
    requests.exceptions.ChunkedEncodingError,
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,
    auth_exceptions.TransportError,
)

# Time-unit helpers (seconds) used to express the deadlines below.
_MINUTE_IN_SECONDS = 60.0
_HOUR_IN_SECONDS = 60.0 * _MINUTE_IN_SECONDS

# Total wall-clock time a single API request may be retried.
_DEFAULT_RETRY_DEADLINE = 10.0 * _MINUTE_IN_SECONDS

# Ambiguous errors (e.g. internalError, backendError, rateLimitExceeded) retry
# until the full `_DEFAULT_RETRY_DEADLINE`. This is because the
# `jobs.getQueryResults` REST API translates a job failure into an HTTP error.
#
# TODO(https://github.com/googleapis/python-bigquery/issues/1903): Investigate
# if we can fail early for ambiguous errors in `QueryJob.result()`'s call to
# the `jobs.getQueryResult` API.
#
# We need `_DEFAULT_JOB_DEADLINE` to be some multiple of
# `_DEFAULT_RETRY_DEADLINE` to allow for a few retries after the retry
# timeout is reached.
#
# Note: This multiple should actually be a multiple of
# (2 * _DEFAULT_RETRY_DEADLINE). After an ambiguous exception, the first
# call from `job_retry()` refreshes the job state without actually restarting
# the query. The second `job_retry()` actually restarts the query. For a more
# detailed explanation, see the comments where we set `restart_query_job = True`
# in `QueryJob.result()`'s inner `is_job_done()` function.
_DEFAULT_JOB_DEADLINE = 2.0 * (2.0 * _DEFAULT_RETRY_DEADLINE)
| 60 | + |
| 61 | + |
def _should_retry(exc):
    """Predicate for determining when to retry an API request.

    Retry when either:

    * the exception carries structured errors and the first error's
      ``reason`` is one of ``_RETRYABLE_REASONS`` (``rateLimitExceeded``,
      ``backendError``, ``internalError``, ``badGateway``), or
    * the exception carries no structured errors but is one of the known
      transport-level retryable types in ``_UNSTRUCTURED_RETRYABLE_TYPES``.

    Args:
        exc (Exception): the exception raised by the API call.

    Returns:
        bool: True if the request should be retried.
    """
    if not hasattr(exc, "errors") or len(exc.errors) == 0:
        # Check for unstructured error returns, e.g. from GFE
        return isinstance(exc, _UNSTRUCTURED_RETRYABLE_TYPES)

    # Use .get() so a structured error that is missing the "reason" key is
    # treated as non-retryable instead of raising KeyError from inside the
    # retry predicate (which would mask the original API error).
    reason = exc.errors[0].get("reason")
    return reason in _RETRYABLE_REASONS
| 74 | + |
| 75 | + |
# Per-request retry: only transient errors (see `_should_retry`) are
# retried, for at most `_DEFAULT_RETRY_DEADLINE` seconds overall.
DEFAULT_RETRY = retry.Retry(predicate=_should_retry, deadline=_DEFAULT_RETRY_DEADLINE)
"""The default retry object.

Any method with a ``retry`` parameter will be retried automatically,
with reasonable defaults. To disable retry, pass ``retry=None``.
To modify the default retry behavior, call a ``with_XXX`` method
on ``DEFAULT_RETRY``. For example, to change the deadline to 30 seconds,
pass ``retry=bigquery.DEFAULT_RETRY.with_deadline(30)``.
"""
| 85 | + |
| 86 | + |
def _should_retry_get_job_conflict(exc):
    """Decide whether to retry a jobs.get call made after a Conflict error.

    A Conflict is sometimes followed by a 404 for a job that does exist.
    Retrying the 404 gives the backend a chance to make the job visible,
    so we treat NotFound as retryable here in addition to the usual
    transient errors.
    https://github.com/googleapis/python-bigquery/issues/2134

    Note: we may be able to extend this to user-specified predicates
    after https://github.com/googleapis/python-api-core/issues/796
    to tweak existing Retry object predicates.
    """
    if isinstance(exc, exceptions.NotFound):
        return True
    return _should_retry(exc)
| 100 | + |
| 101 | + |
# Pick a deadline smaller than our other deadlines since we want to timeout
# before those expire. One third of `_DEFAULT_RETRY_DEADLINE` leaves room
# for the surrounding operation's own retries.
_DEFAULT_GET_JOB_CONFLICT_DEADLINE = _DEFAULT_RETRY_DEADLINE / 3.0
_DEFAULT_GET_JOB_CONFLICT_RETRY = retry.Retry(
    predicate=_should_retry_get_job_conflict,
    deadline=_DEFAULT_GET_JOB_CONFLICT_DEADLINE,
)
"""Private, may be removed in future."""
| 110 | + |
| 111 | + |
# Note: Take care when updating DEFAULT_TIMEOUT to anything but None. We
# briefly had a default timeout, but even setting it at more than twice the
# theoretical server-side default timeout of 2 minutes was not enough for
# complex queries. See:
# https://github.com/googleapis/python-bigquery/issues/970#issuecomment-921934647
DEFAULT_TIMEOUT = None
"""The default API timeout.

This is the time to wait per request. To adjust the total wait time, set a
deadline on the retry object.
"""

# Structured error "reason" codes for which a *job* is considered
# retryable. Includes the transient API reasons plus their job-scoped
# variants (jobBackendError, jobInternalError, jobRateLimitExceeded).
job_retry_reasons = (
    "rateLimitExceeded",
    "backendError",
    "internalError",
    "jobBackendError",
    "jobInternalError",
    "jobRateLimitExceeded",
)
| 132 | + |
| 133 | + |
def _job_should_retry(exc):
    """Predicate for determining when to retry a failed job.

    Args:
        exc (Exception): the exception raised while running or polling
            the job; may be a ``RetryError`` wrapping the real cause.

    Returns:
        bool: True if the job should be retried.
    """
    # Sometimes we have ambiguous errors, such as 'backendError' which could
    # be due to an API problem or a job problem. For these, make sure we retry
    # our is_job_done() function.
    #
    # Note: This won't restart the job unless we know for sure it's because of
    # the job status and set restart_query_job = True in that loop. This means
    # that we might end up calling this predicate twice for the same job
    # but from different paths: (1) from jobs.getQueryResults RetryError and
    # (2) from translating the job error from the body of a jobs.get response.
    #
    # Note: If we start retrying job types other than queries where we don't
    # call the problematic getQueryResults API to check the status, we need
    # to provide a different predicate, as there shouldn't be ambiguous
    # errors in those cases.
    if isinstance(exc, exceptions.RetryError):
        exc = exc.cause

    # Per https://github.com/googleapis/python-bigquery/issues/1929, sometimes
    # retriable errors make their way here. Because of the separate
    # `restart_query_job` logic to make sure we aren't restarting non-failed
    # jobs, it should be safe to continue and not totally fail our attempt at
    # waiting for the query to complete.
    if _should_retry(exc):
        return True

    if not hasattr(exc, "errors") or len(exc.errors) == 0:
        return False

    # Use .get() so a malformed structured error (missing the "reason" key)
    # is treated as non-retryable instead of raising KeyError from inside
    # the retry predicate.
    reason = exc.errors[0].get("reason")
    return reason in job_retry_reasons
| 165 | + |
| 166 | + |
# Job-level retry: uses the longer `_DEFAULT_JOB_DEADLINE` because a job
# may need to be restarted (not just polled again) after ambiguous errors.
DEFAULT_JOB_RETRY = retry.Retry(
    predicate=_job_should_retry, deadline=_DEFAULT_JOB_DEADLINE
)
"""
The default job retry object.
"""


# Same predicate as DEFAULT_JOB_RETRY, but with a one-hour deadline.
DEFAULT_ML_JOB_RETRY = retry.Retry(
    predicate=_job_should_retry, deadline=_HOUR_IN_SECONDS
)
"""
The default job retry object for AI/ML jobs.

Such jobs can take a long time to fail. See: b/436586523.
"""
| 183 | + |
| 184 | + |
def _query_job_insert_should_retry(exc):
    """Decide whether to retry a jobs.insert call for a query job.

    Per https://github.com/googleapis/python-bigquery/issues/2134, sometimes
    we get a 404 error. In this case, if we get this far, assume that the job
    doesn't actually exist and try again. We can't add 404 to the default
    job_retry because that happens for errors like "this table does not
    exist", which probably won't resolve with a retry.
    """
    if isinstance(exc, exceptions.RetryError):
        exc = exc.cause

    if not isinstance(exc, exceptions.NotFound):
        # Not a 404: defer to the standard job retry predicate.
        return _job_should_retry(exc)

    message = exc.message
    if message is None:
        return False
    # Don't try to retry table/dataset not found, just job not found.
    # The URL contains "jobs", so use the leading space to disambiguate.
    return " job" in message.lower()
| 201 | + |
| 202 | + |
_DEFAULT_QUERY_JOB_INSERT_RETRY = retry.Retry(
    predicate=_query_job_insert_should_retry,
    # jobs.insert doesn't wait for the job to complete, so we don't need the
    # long _DEFAULT_JOB_DEADLINE for this part.
    deadline=_DEFAULT_RETRY_DEADLINE,
)
"""Private, may be removed in future."""
| 210 | + |
| 211 | + |
# Per-request timeout (seconds) for jobs.get calls.
DEFAULT_GET_JOB_TIMEOUT = 128
"""
Default timeout for Client.get_job().
"""

# Sentinel distinguishing "no timeout argument supplied" from an explicit
# ``None``; re-exported from api_core's polling-future implementation.
POLLING_DEFAULT_VALUE = google.api_core.future.polling.PollingFuture._DEFAULT_VALUE
"""
Default value defined in google.api_core.future.polling.PollingFuture.
"""
0 commit comments