-
Notifications
You must be signed in to change notification settings - Fork 1
MIDRC-1228 Multipart upload support #104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,10 +192,6 @@ async def s3_endpoint(path: str, request: Request): | |
| # get the name of the user's bucket and ensure the user is making a call to their own bucket | ||
| logger.info(f"Incoming S3 request from user '{user_id}': '{request.method} {path}'") | ||
| user_bucket = aws_utils.get_safe_name_from_hostname(user_id) | ||
| if request.method == "GET" and path == "s3": | ||
| err_msg = f"'ls' not supported, use 'ls s3://{user_bucket}' instead" | ||
| logger.error(err_msg) | ||
| raise HTTPException(HTTP_400_BAD_REQUEST, err_msg) | ||
|
Comment on lines
-195
to
-198
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| request_bucket = path.split("?")[0].split("/")[0] | ||
| if request_bucket != user_bucket: | ||
| err_msg = f"'{path}' (bucket '{request_bucket}') not allowed. You can make calls to your personal bucket, '{user_bucket}'" | ||
|
|
@@ -282,8 +278,17 @@ async def s3_endpoint(path: str, request: Request): | |
| assert credentials, "No AWS credentials found" | ||
| headers["x-amz-security-token"] = credentials.token | ||
|
|
||
| # if this is a PUT request, we need the KMS key ID to use for encryption | ||
| if config["KMS_ENCRYPTION_ENABLED"] and request.method == "PUT": | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do Multi-part uploads use POST? I'm wondering why didn't we check for
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, multipart uploads use POST. I guess we just didn't test with large files until now |
||
| # If this is a PUT or POST request, specify the KMS key to use for encryption. | ||
| # For multipart uploads, the initial CreateMultipartUpload request includes the KMS | ||
| # configuration, and the following UploadPart and CompleteMultipartUpload requests do not. | ||
| # We know this is an UploadPart or CompleteMultipartUpload request if it includes the | ||
| # uploadId query parameter. | ||
| query_params = dict(request.query_params) | ||
| if ( | ||
| config["KMS_ENCRYPTION_ENABLED"] | ||
| and request.method in ["PUT", "POST"] | ||
| and "uploadId" not in query_params | ||
| ): | ||
| _, kms_key_arn = aws_utils.get_existing_kms_key_for_bucket(user_bucket) | ||
| if not kms_key_arn: | ||
| err_msg = "Bucket misconfigured. Hit the `GET /storage/setup` endpoint and try again." | ||
|
|
@@ -300,7 +305,6 @@ async def s3_endpoint(path: str, request: Request): | |
| f"{key.lower()}:{headers[key]}\n" for key in sorted_headers | ||
| ) | ||
| signed_headers = ";".join([k.lower() for k in sorted_headers]) | ||
| query_params = dict(request.query_params) | ||
| # the query params in the canonical request have to be sorted: | ||
| query_params_names = sorted(list(query_params.keys())) | ||
| canonical_query_params = "&".join( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,12 @@ | |
| from fastapi import HTTPException | ||
| import pytest | ||
|
|
||
| from conftest import MOCKED_S3_RESPONSE_DICT, TEST_USER_ID, TEST_USER_TOKEN | ||
| from conftest import ( | ||
| MOCKED_S3_RESPONSE_DICT, | ||
| TEST_USER_ID, | ||
| TEST_USER_TOKEN, | ||
| mock_aws_s3_request, | ||
| ) | ||
| from gen3workflow.config import config | ||
| from gen3workflow.routes.s3 import ( | ||
| set_access_token_and_get_user_id, | ||
|
|
@@ -350,23 +355,33 @@ def test_s3_upload_file(s3_client, access_token_patcher, multipart): | |
| Test that the boto3 `upload_file` function works with the `/s3` endpoint, both for a small | ||
| file uploaded in 1 part and for a large file uploaded in multiple parts. | ||
| """ | ||
| bucket_name = f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}" | ||
| object_key = f"test_s3_upload_file{'_multipart' if multipart else ''}.txt" | ||
|
|
||
| with patch( | ||
| "gen3workflow.aws_utils.get_existing_kms_key_for_bucket", | ||
| lambda _: ("test_kms_key_alias", "test_kms_key_arn"), | ||
| ): | ||
| with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as file_to_upload: | ||
| file_to_upload.write("Test file contents\n") | ||
| with tempfile.NamedTemporaryFile(delete=True) as file_to_upload: | ||
| file_to_upload.write(b"A" * (6 * 1024 * 1024)) # create a 6MB file | ||
| file_to_upload.flush() | ||
| s3_client.upload_file( | ||
| file_to_upload.name, | ||
| f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}", | ||
| f"test_s3_upload_file{'_multipart' if multipart else ''}.txt", | ||
| # to test a multipart upload, set the chunk size to 1 to force splitting the file | ||
| # into multiple chunks: | ||
| Config=boto3.s3.transfer.TransferConfig( | ||
| multipart_threshold=1 if multipart else 9999 | ||
| bucket_name, | ||
| object_key, | ||
| # to test a multipart upload, set the part size to 1 to force splitting the file | ||
| # into multiple parts: | ||
| Config=( | ||
| boto3.s3.transfer.TransferConfig(multipart_threshold=1) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to the boto docs, the default part size for a multipart upload seems to be 8MB (here). So, if the file size is 6MB, it will still be uploaded as a single part, even if the multipart threshold is set to 1 MB. With the current setup, we can test whether multipart uploads are triggered. However, to properly test multipart uploads, you could either:
Alternatively, we could do both. For example:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right now we get 3 calls: init upload, upload part, and complete upload.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As long as we cover init, upload part, and complete upload in the test, I think we are good. |
||
| if multipart | ||
| else None | ||
| ), | ||
| ) | ||
|
|
||
| mock_aws_s3_request.assert_called_with( | ||
| f"https://{bucket_name}.s3.us-east-1.amazonaws.com/{object_key}{'?uploadId=test-upload-id' if multipart else ''}" | ||
| ) | ||
|
|
||
|
|
||
| def test_chunked_to_non_chunked_body(): | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we can fine-tune this during the testing phase
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
S3 Connection errors being handled here is nice! Thanks! :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hopefully it's enough!