Skip to content
67 changes: 50 additions & 17 deletions gen3workflow/aws_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,18 +191,14 @@ def create_user_bucket(user_id: str) -> Tuple[str, str, str]:
return user_bucket_name, "ga4gh-tes", config["USER_BUCKETS_REGION"]


def delete_all_bucket_objects(user_id, user_bucket_name):
    """
    Deletes all objects from the specified S3 bucket.

    Args:
        user_id (str): The user's unique Gen3 ID.
        user_bucket_name (str): The name of the S3 bucket.

    Raises:
        Exception: if S3 reports per-object deletion errors in the response.
        botocore.exceptions.ClientError: if listing or deleting objects fails.
    """
    try:
        # `list_objects_v2` returns at most 1000 keys per call, so keep listing
        # (and deleting) until the bucket reports no remaining contents;
        # otherwise buckets with more than 1000 objects would never be emptied.
        while True:
            object_list = s3_client.list_objects_v2(Bucket=user_bucket_name)
            contents = object_list.get("Contents")
            if not contents:
                break

            logger.info(
                f"Deleting all contents from '{user_bucket_name}' for user '{user_id}' before deleting the bucket"
            )
            keys = [{"Key": obj.get("Key")} for obj in contents]

            # According to the docs, up to 1000 objects can be deleted in a single request:
            # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.delete_objects
            limit = 1000
            for offset in range(0, len(keys), limit):
                response = s3_client.delete_objects(
                    Bucket=user_bucket_name,
                    Delete={"Objects": keys[offset : offset + limit]},
                )
                # `delete_objects` reports partial failures in the response body
                # rather than raising, so check explicitly
                if response.get("Errors"):
                    logger.error(
                        f"Failed to delete objects from bucket '{user_bucket_name}' for user '{user_id}': {response}"
                    )
                    raise Exception(response)

            # a non-truncated listing means everything visible has been deleted
            if not object_list.get("IsTruncated"):
                break
    except ClientError as e:
        logger.error(
            f"Failed to delete objects from bucket '{user_bucket_name}' for user '{user_id}': {e}"
        )
        raise


def delete_user_bucket(user_id: str) -> Union[str, None]:
    """
    Deletes all objects from a user's S3 bucket before deleting the bucket itself.

    Args:
        user_id (str): The user's unique Gen3 ID

    Returns:
        str: the name of the deleted bucket, or None if the bucket does not exist.

    Raises:
        Exception: If there is an error during the deletion process.
    """
    user_bucket_name = get_safe_name_from_hostname(user_id)

    # check that the bucket exists before attempting any deletion
    try:
        s3_client.head_bucket(Bucket=user_bucket_name)
    except ClientError as e:
        error_code = e.response["Error"]["Code"]
        # `head_bucket` reports a missing bucket as an HTTP "404" code,
        # not "NoSuchBucket"
        if error_code == "404":
            logger.warning(
                f"Bucket '{user_bucket_name}' not found for user '{user_id}'."
            )
            return None
        logger.error(
            f"Failed to delete bucket '{user_bucket_name}' for user '{user_id}': {e}"
        )
        # any other error (e.g. 403) is unexpected: surface it to the caller
        # instead of attempting to delete a bucket we could not even inspect
        raise

    try:
        delete_all_bucket_objects(user_id, user_bucket_name)

        logger.info(f"Deleting bucket '{user_bucket_name}' for user '{user_id}'")
        s3_client.delete_bucket(Bucket=user_bucket_name)
        return user_bucket_name

    except Exception as e:
        logger.error(
            f"Failed to delete bucket '{user_bucket_name}' for user '{user_id}': {e}"
        )
        raise
2 changes: 2 additions & 0 deletions gen3workflow/routes/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ async def get_storage_info(request: Request, auth=Depends(Auth)) -> dict:

@router.delete("/user-bucket", status_code=HTTP_204_NO_CONTENT)
async def delete_user_bucket(request: Request, auth=Depends(Auth)) -> None:
await auth.authorize("delete", ["/services/workflow/user-bucket"])

token_claims = await auth.get_token_claims()
user_id = token_claims.get("sub")
logger.info(f"User '{user_id}' deleting their storage bucket")
Expand Down
148 changes: 137 additions & 11 deletions tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
from moto import mock_aws
import pytest
from unittest.mock import patch, MagicMock

from conftest import TEST_USER_ID
from gen3workflow import aws_utils
Expand All @@ -29,6 +30,14 @@ def mock_aws_services():
yield


def get_dummy_object_list(bucket_name, size):
    """Build a stand-in for `s3_client.list_objects_v2` whose listing always
    contains `size` dummy keys (file0, file1, ...) for the given bucket."""

    def fake_list_objects(Bucket=bucket_name):
        contents = [{"Key": f"file{index}"} for index in range(size)]
        return {"Contents": contents}

    return fake_list_objects


def test_get_safe_name_from_hostname(reset_config_hostname):
user_id = "asdfgh"

Expand All @@ -53,6 +62,14 @@ def test_get_safe_name_from_hostname(reset_config_hostname):
async def test_storage_info(client, access_token_patcher, mock_aws_services):
# check that the user's storage information is as expected
expected_bucket_name = f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}"

# Bucket must not exist before this test
with pytest.raises(ClientError) as e:
aws_utils.s3_client.head_bucket(Bucket=expected_bucket_name)
assert (
e.value.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404
), f"Bucket exists: {e.value}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the ClientError is raised before this assert statement runs?


res = await client.get("/storage/info", headers={"Authorization": "bearer 123"})
assert res.status_code == 200, res.text
storage_info = res.json()
Expand All @@ -62,6 +79,10 @@ async def test_storage_info(client, access_token_patcher, mock_aws_services):
"region": config["USER_BUCKETS_REGION"],
}

# check that the bucket was created after the call to `/storage/info`
bucket_exists = aws_utils.s3_client.head_bucket(Bucket=expected_bucket_name)
assert bucket_exists, "Bucket does not exist"

# check that the bucket is setup with KMS encryption
kms_key = aws_utils.kms_client.describe_key(KeyId=f"alias/{expected_bucket_name}")
kms_key_arn = kms_key["KeyMetadata"]["Arn"]
Expand Down Expand Up @@ -177,21 +198,43 @@ async def test_delete_user_bucket(client, access_token_patcher, mock_aws_service
The user should be able to delete their own bucket.
"""

# Get the user's bucket name from the user ID
expected_bucket_name = f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}"
# Create the bucket if it doesn't exist
res = await client.get("/storage/info", headers={"Authorization": "bearer 123"})
bucket_name = res.json().get("bucket")

# Verify the bucket exists
bucket_exists = aws_utils.s3_client.head_bucket(Bucket=bucket_name)
assert bucket_exists, "Bucket does not exist"

# Delete the bucket
res = await client.delete(
"/storage/user-bucket", headers={"Authorization": "bearer 123"}
)
assert res.status_code == 204, res.text

# Verify the bucket is deleted
with pytest.raises(ClientError) as e:
aws_utils.s3_client.head_bucket(Bucket=bucket_name)
assert (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment here about ClientError vs assert statement

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this was indented appropriately. No?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah sorry, i think i got this wrong, the assert statement can/should be inside of the with block here. I confused it with a try block. Could you revert that? my bad

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand. I think the assert statement must be outside the with block. (reference here)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whichever is fine.

e.value.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404
), f"Bucket still exists: {e.value}"


# delete bucket that doesn't exist
@pytest.mark.asyncio
async def test_delete_user_bucket_which_does_not_exist(
client, access_token_patcher, mock_aws_services
):
"""
Attempt to delete a bucket that does not exist
"""

# Create the bucket if it doesn't exist
res = await client.get("/storage/info", headers={"Authorization": "bearer 123"})
assert res.status_code == 200, res.text
storage_info = res.json()
assert storage_info == {
"bucket": expected_bucket_name,
"workdir": f"s3://{expected_bucket_name}/ga4gh-tes",
"region": config["USER_BUCKETS_REGION"],
}
bucket_name = res.json().get("bucket")

# Verify the bucket exists
bucket_exists = aws_utils.s3_client.head_bucket(Bucket=expected_bucket_name)
bucket_exists = aws_utils.s3_client.head_bucket(Bucket=bucket_name)
assert bucket_exists, "Bucket does not exist"

# Delete the bucket
Expand All @@ -202,7 +245,90 @@ async def test_delete_user_bucket(client, access_token_patcher, mock_aws_service

# Verify the bucket is deleted
with pytest.raises(ClientError) as e:
aws_utils.s3_client.head_bucket(Bucket=expected_bucket_name)
aws_utils.s3_client.head_bucket(Bucket=bucket_name)
assert (
e.value.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404
), f"Bucket still exists: {e.value}"

# Attempt to Delete the bucket again, must receive a 404, since bucket not found.
res = await client.delete(
"/storage/user-bucket", headers={"Authorization": "bearer 123"}
)
assert res.status_code == 404, res.text


@pytest.mark.asyncio
async def test_delete_user_bucket_with_files(
    client, access_token_patcher, mock_aws_services
):
    """
    Attempt to delete a bucket that is not empty.
    Endpoint must be able to delete all the files and then delete the bucket.
    """

    # Create the bucket if it doesn't exist
    res = await client.get("/storage/info", headers={"Authorization": "bearer 123"})
    bucket_name = res.json().get("bucket")

    # Since we are unable to use the `put_object` API due to a probable bug in
    # moto, monkeypatch the `list_objects_v2` API instead so the bucket appears
    # to contain 1500 objects, which exercises the deletion batching logic
    fake_listing = get_dummy_object_list(bucket_name, 1500)
    with patch.object(aws_utils.s3_client, "list_objects_v2", fake_listing):
        # Delete the bucket
        res = await client.delete(
            "/storage/user-bucket", headers={"Authorization": "bearer 123"}
        )
    assert res.status_code == 204, res.text

    # Verify the bucket is deleted
    with pytest.raises(ClientError) as e:
        aws_utils.s3_client.head_bucket(Bucket=bucket_name)
    assert (
        e.value.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404
    ), f"Bucket still exists: {e.value}"

    # Attempt to delete the bucket again; must receive a 404, since the bucket
    # no longer exists
    res = await client.delete(
        "/storage/user-bucket", headers={"Authorization": "bearer 123"}
    )
    assert res.status_code == 404, res.text


@pytest.mark.asyncio
async def test_delete_user_bucket_no_token(client, mock_aws_services):
    """
    Attempt to delete a bucket when the user is not logged in. Must receive a 401 error.
    """
    # the deletion logic must never be reached without an access token
    bucket_deletion_spy = MagicMock()
    with patch("gen3workflow.aws_utils.delete_user_bucket", bucket_deletion_spy):
        # no `Authorization` header is provided
        res = await client.delete("/storage/user-bucket")

    assert res.status_code == 401, res.text
    assert res.json() == {"detail": "Must provide an access token"}
    bucket_deletion_spy.assert_not_called()


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "client",
    [pytest.param({"authorized": False, "tes_resp_code": 200}, id="unauthorized")],
    indirect=True,
)
async def test_delete_user_bucket_unauthorized(
    client, access_token_patcher, mock_aws_services
):
    """
    Attempt to delete a bucket when the user is logged in but does not have the appropriate authorization.
    Must receive a 403 error.
    """
    # the deletion logic must never be reached when authorization is denied
    bucket_deletion_spy = MagicMock()
    with patch("gen3workflow.aws_utils.delete_user_bucket", bucket_deletion_spy):
        res = await client.delete(
            "/storage/user-bucket", headers={"Authorization": "bearer 123"}
        )

    assert res.status_code == 403, res.text
    assert res.json() == {"detail": "Permission denied"}
    bucket_deletion_spy.assert_not_called()
Loading