|
| 1 | +import asyncio |
1 | 2 | from datetime import datetime, timezone |
2 | 3 | import hashlib |
3 | | -import traceback |
| 4 | +import random |
4 | 5 | import urllib.parse |
5 | 6 |
|
6 | 7 | import boto3 |
|
14 | 15 | HTTP_400_BAD_REQUEST, |
15 | 16 | HTTP_401_UNAUTHORIZED, |
16 | 17 | HTTP_403_FORBIDDEN, |
| 18 | + HTTP_404_NOT_FOUND, |
17 | 19 | ) |
18 | 20 |
|
19 | 21 | from gen3workflow import aws_utils, logger |
20 | 22 | from gen3workflow.auth import Auth |
21 | 23 | from gen3workflow.config import config |
22 | | -from gen3workflow.routes.system import get_status |
23 | 24 |
|
24 | 25 |
|
25 | 26 | s3_root_router = APIRouter(include_in_schema=False) |
26 | 27 | s3_router = APIRouter(prefix="/s3") |
27 | 28 |
|
28 | 29 |
|
| 30 | +S3_MAX_RETRIES = 3 |
| 31 | +S3_RETRY_BASE_DELAY = 0.5 |
| 32 | +S3_RETRY_BACKOFF_FACTOR = 2 |
| 33 | + |
| 34 | + |
29 | 35 | async def set_access_token_and_get_user_id(auth: Auth, headers: Headers) -> str: |
30 | 36 | """ |
31 | 37 | Extract the user's access token and (in some cases) the user's ID, which should have been |
@@ -336,28 +342,65 @@ async def s3_endpoint(path: str, request: Request): |
336 | 342 | signing_key, string_to_sign.encode("utf-8"), hashlib.sha256 |
337 | 343 | ).hexdigest() |
338 | 344 |
|
339 | | - # construct the Authorization header from the credentials and the signature, and forward the |
340 | | - # call to AWS S3 with the new Authorization header |
| 345 | + # construct the Authorization header from the credentials and the signature |
341 | 346 | headers["authorization"] = ( |
342 | 347 | f"AWS4-HMAC-SHA256 Credential={credentials.access_key}/{date}/{region}/{service}/aws4_request, SignedHeaders={signed_headers}, Signature={signature}" |
343 | 348 | ) |
344 | 349 | s3_api_url = f"https://{user_bucket}.s3.{region}.amazonaws.com/{api_endpoint}" |
345 | 350 | logger.debug(f"Outgoing S3 request: '{request.method} {s3_api_url}'") |
346 | | - response = await request.app.async_client.request( |
347 | | - method=request.method, |
348 | | - url=s3_api_url, |
349 | | - headers=headers, |
350 | | - params=query_params, |
351 | | - data=body, |
352 | | - ) |
353 | 351 |
|
354 | | - if response.status_code >= 300: |
355 | | - logger.debug(f"Received a failure status code from AWS: {response.status_code}") |
356 | | - # no need to log 404 errors except in debug mode: they are are expected when running |
357 | | - # workflows (e.g. for Nextflow workflows, error output files may not be present when there |
358 | | - # were no errors) |
359 | | - if response.status_code != 404: |
360 | | - logger.error(f"Error from AWS: {response.status_code} {response.text}") |
| 352 | + # forward the call to AWS S3 with the new Authorization header. |
| 353 | + # this call is retried with exponential backoff in case of an unexpected error from S3.
| 354 | + for attempt in range(1, S3_MAX_RETRIES + 1): |
| 355 | + proceed = True |
| 356 | + exception = None |
| 357 | + try: |
| 358 | + response = await request.app.async_client.request( |
| 359 | + method=request.method, |
| 360 | + url=s3_api_url, |
| 361 | + headers=headers, |
| 362 | + params=query_params, |
| 363 | + data=body, |
| 364 | + ) |
| 365 | + |
| 366 | + if response.status_code >= 300: |
| 367 | + # no need to log details (unless in debug mode) in the case of a 404
| 368 | + # error: 404s are expected when running workflows (e.g. for Nextflow workflows,
| 369 | + # stderr output files may not be present when there were no errors). NOTE(review): unlike 403s, 404s currently still fall through to the retry logic below — confirm whether 404 should also skip retries
| 370 | + if response.status_code != HTTP_404_NOT_FOUND: |
| 371 | + logger.error( |
| 372 | + f"Error from S3: {response.status_code} {response.text}" |
| 373 | + ) |
| 374 | + # do not retry in the case of a 403 error: authentication is done internally by |
| 375 | + # this function, so 403 errors are internal service errors |
| 376 | + if response.status_code != HTTP_403_FORBIDDEN: |
| 377 | + proceed = False |
| 378 | + else: |
| 379 | + logger.debug(f"Error from S3: {response.status_code}") |
| 380 | + except Exception as e: |
| 381 | + logger.error(f"Exception while attempting to make a call to S3: {e}") |
| 382 | + proceed = False |
| 383 | + exception = e |
| 384 | + |
| 385 | + # exit if the call succeeded or should not be retried, or we reached the max number of |
| 386 | + # retries |
| 387 | + if proceed: |
| 388 | + break |
| 389 | + if attempt == S3_MAX_RETRIES: |
| 390 | + logger.error( |
| 391 | + f"Outgoing S3 request failed (attempt {attempt}/{S3_MAX_RETRIES}). Giving up" |
| 392 | + ) |
| 393 | + if exception: |
| 394 | + raise exception |
| 395 | + break |
| 396 | + |
| 397 | + # retry with exponential backoff |
| 398 | + delay = S3_RETRY_BASE_DELAY * (S3_RETRY_BACKOFF_FACTOR**attempt) |
| 399 | + delay += delay * 0.1 * random.uniform(-1, 1) # add jitter |
| 400 | + logger.warning( |
| 401 | + f"Outgoing S3 request failed (attempt {attempt}/{S3_MAX_RETRIES}). Retrying in {delay:.2f} seconds" |
| 402 | + ) |
| 403 | + await asyncio.sleep(delay) |
361 | 404 |
|
362 | 405 | # return the response from AWS S3. |
363 | 406 | # - mask the details of 403 errors from the end user: authentication is done internally by this |
|
0 commit comments