Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@
"filename": "docs/s3.md",
"hashed_secret": "08d2e98e6754af941484848930ccbaddfefe13d6",
"is_verified": false,
"line_number": 56
"line_number": 55
}
],
"migrations/versions/e1886270d9d2_create_system_key_table.py": [
Expand Down Expand Up @@ -205,9 +205,9 @@
"filename": "tests/test_s3_endpoint.py",
"hashed_secret": "08d2e98e6754af941484848930ccbaddfefe13d6",
"is_verified": false,
"line_number": 75
"line_number": 77
}
]
},
"generated_at": "2026-02-03T17:56:09Z"
"generated_at": "2026-02-27T22:12:14Z"
}
1 change: 0 additions & 1 deletion docs/s3.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Note: This discussion can apply to many use cases, but it is written with a spec
Contents:
- [Using IAM keys](#using-iam-keys)
- [Using a custom S3 endpoint](#using-a-custom-s3-endpoint)
- [Diagram](#diagram)

## Using IAM keys

Expand Down
2 changes: 1 addition & 1 deletion gen3workflow/routes/ga4gh_tes.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_non_allowed_images(images: set, username: str) -> set:
"""

def _image_to_regex(image: str) -> str:
"""
r"""
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix for SyntaxWarning: invalid escape sequence '\Z'

Update a whitelisted image to a regex: replace {username} with the actual username,
replace `*` with `.*` to match any sequence of characters, and replace `:.*` with
`(:.+|\Z)` so that "myimage" is accepted if "myimage:*" is in the whitelist.
Expand Down
74 changes: 48 additions & 26 deletions gen3workflow/routes/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,14 @@ async def s3_endpoint(path: str, request: Request):
not support S3 endpoints with a path, such as the Minio-go S3 client.
"""

# because this endpoint is exposed at root, if the GET path is empty, assume the user is not
# trying to reach the S3 endpoint and redirect to the status endpoint instead
if request.method == "GET" and not path:
return await get_status(request)
# Because this endpoint is exposed at root, if the GET path is empty, the user may not be
# trying to reach the S3 endpoint: suggest using the status endpoint.
# "All buckets" listing requests also land here and are not supported, since users can only
# access their own bucket.
if request.method == "GET" and path in ("", "s3"):
err_msg = f"If you are using the S3 endpoint: 's3 ls' not supported, use 's3 ls s3://<your bucket>' instead. If you are trying to reach the Gen3-Workflow API, try '/_status'."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay! I see we are explicitly asking them to try /_status if they need it. Can't we still support get_status when path is empty? Do you think that would be complicated?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It gets confusing because if you use "aws s3 ls" against the S3 endpoint exposed at root, the AWS CLI can't parse the response from "get status"

logger.error(err_msg)
raise HTTPException(HTTP_400_BAD_REQUEST, err_msg)

# extract the caller's access token from the request headers, and ensure the caller (user, or
# client acting on behalf of the user) has access to run workflows
Expand All @@ -180,6 +184,10 @@ async def s3_endpoint(path: str, request: Request):
# get the name of the user's bucket and ensure the user is making a call to their own bucket
logger.info(f"Incoming S3 request from user '{user_id}': '{request.method} {path}'")
user_bucket = aws_utils.get_safe_name_from_hostname(user_id)
if request.method == "GET" and path == "s3":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh! Can someone make a request to get the names of all the buckets in an account when they interact via gen3-workflow? Would it be possible to add a test around it? And will the path variable be exactly equal to s3.

Also, what would happen when someone tries ls s3://{someone else's bucket}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes the path is exactly "s3". Right now listing the buckets doesn't work, i tried to get it working and only return the user's bucket, but it wasn't easy and i thought it's not worth the trouble, since "ls s3://{user_bucket}" works.

"ls s3://{someone else's bucket}" would return a 403 through the same check as other calls (L188).

I'll have a look at adding a unit test for ls

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turns out the path is only "s3" when hitting "/s3", and it's otherwise empty. I added a commit to reject all "list all buckets" requests + a unit test.

"ls s3://{user_bucket}" and "ls s3://{someone else's bucket}" are already tested by existing unit tests, for example this one

err_msg = f"'ls' not supported, use 'ls s3://{user_bucket}' instead"
logger.error(err_msg)
raise HTTPException(HTTP_400_BAD_REQUEST, err_msg)
request_bucket = path.split("?")[0].split("/")[0]
if request_bucket != user_bucket:
err_msg = f"'{path}' (bucket '{request_bucket}') not allowed. You can make calls to your personal bucket, '{user_bucket}'"
Expand Down Expand Up @@ -217,27 +225,42 @@ async def s3_endpoint(path: str, request: Request):
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
date = timestamp[:8] # the date portion (YYYYMMDD) of the timestamp

# Generate the body hash and the request headers.
# Overwrite the original `x-amz-content-sha256` header value with the body hash. When this
# header is set to "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" in the original request (payload sent
# over multiple chunks), we still replace it with the body hash and we strip the body of the
# chunk signatures => protocol translation from a chunk-signed streaming request (SigV4
# streaming HTTP PUT) into a single-payload request (Normal SigV4 HTTP PUT). We could also
# implement chunked signing but it's not straightforward and likely unnecessary.
# NOTE: Chunked uploads and multipart uploads are NOT the same thing. Python boto3 does not
# generate chunked uploads, but the Minio-go S3 client used by Funnel does.
# Generate the request headers. Chunked payload support:
# - The AWS CLI uploads files with the STREAMING-UNSIGNED-PAYLOAD-TRAILER method.
# The body includes chunks and checksums. It can be forwarded to AWS without changes as long
# as the necessary headers are forwarded as well.
# - The Minio-go S3 client uploads files with the STREAMING-AWS4-HMAC-SHA256-PAYLOAD method.
# Funnel uses this client.
# We overwrite the original `x-amz-content-sha256` header value with the body hash and we
# strip the body of the chunk signatures => protocol translation from a chunk-signed streaming
# request (SigV4 streaming HTTP PUT) into a single-payload request (Normal SigV4 HTTP PUT).
# We could also implement chunked signing but it's not straightforward and likely unnecessary.
# Note: Chunked uploads != multipart uploads.
body = await request.body()
headers = {
"host": f"{user_bucket}.s3.{region}.amazonaws.com",
"x-amz-date": timestamp,
}
for h in [
"content-encoding",
"content-length",
"x-amz-content-sha256",
"x-amz-decoded-content-length",
"x-amz-trailer",
]:
if request.headers.get(h):
headers[h] = request.headers[h]
if (
request.headers.get("x-amz-content-sha256")
== "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"
):
# parse the body and update the corresponding headers
body = chunked_to_non_chunked_body(body)
body_hash = hashlib.sha256(body).hexdigest()
headers = {
"host": f"{user_bucket}.s3.{region}.amazonaws.com",
"x-amz-content-sha256": body_hash,
"x-amz-date": timestamp,
}
content_len = str(len(body))
headers["x-amz-content-sha256"] = hashlib.sha256(body).hexdigest()
for h in ["content-length", "x-amz-decoded-content-length"]:
if request.headers.get(h):
headers[h] = content_len

# get AWS credentials from the configuration or the current assumed role session
if config["S3_ENDPOINTS_AWS_ACCESS_KEY_ID"]:
Expand All @@ -263,11 +286,12 @@ async def s3_endpoint(path: str, request: Request):
headers["x-amz-server-side-encryption"] = "aws:kms"
headers["x-amz-server-side-encryption-aws-kms-key-id"] = kms_key_arn

# construct the canonical request
# construct the canonical request. All header keys must be lowercase
sorted_headers = sorted(list(headers.keys()), key=str.casefold)
canonical_headers = "".join(
f"{key}:{headers[key]}\n" for key in sorted(list(headers.keys()))
f"{key.lower()}:{headers[key]}\n" for key in sorted_headers
)
signed_headers = ";".join(sorted([k.lower() for k in headers.keys()]))
signed_headers = ";".join([k.lower() for k in sorted_headers])
query_params = dict(request.query_params)
# the query params in the canonical request have to be sorted:
query_params_names = sorted(list(query_params.keys()))
Expand All @@ -282,7 +306,7 @@ async def s3_endpoint(path: str, request: Request):
f"{canonical_headers}"
f"\n"
f"{signed_headers}\n"
f"{body_hash}"
f"{headers['x-amz-content-sha256']}"
)

# construct the string to sign based on the canonical request
Expand Down Expand Up @@ -319,9 +343,7 @@ async def s3_endpoint(path: str, request: Request):
)

if response.status_code >= 300:
logger.debug(
f"Received a failure status code from AWS: {response.status_code}. {response.text=}"
)
logger.debug(f"Received a failure status code from AWS: {response.status_code}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do we get in the {response.text=} field? is not useful?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's also logged on L347 so it's duplicated, except in the case of a 404, in which case we don't need extra details

# no need to log 404 errors except in debug mode: they are expected when running
# workflows (e.g. for Nextflow workflows, error output files may not be present when there
# were no errors)
Expand Down
Loading
Loading