Skip to content

Commit bc02a94

Browse files
feat: delete files for datasets removed from the registry
1 parent b3e6637 commit bc02a94

File tree

4 files changed

+50
-3
lines changed

4 files changed

+50
-3
lines changed

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
fetch_registered_datasets,
2020
get_license_name_from_url,
2121
)
22-
from oc4ids_datastore_pipeline.storage import upload_files
22+
from oc4ids_datastore_pipeline.storage import delete_files_for_dataset, upload_files
2323

2424
logger = logging.getLogger(__name__)
2525

@@ -137,7 +137,7 @@ def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
137137
for dataset_id in deleted_datasets:
138138
logger.info(f"Dataset {dataset_id} is no longer in the registry, deleting")
139139
delete_dataset(dataset_id)
140-
# TODO: Delete stored files
140+
delete_files_for_dataset(dataset_id)
141141

142142

143143
def process_registry() -> None:

oc4ids_datastore_pipeline/storage.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,17 @@ def upload_files(
9696
csv_public_url = _upload_csv(dataset_id, csv_path) if csv_path else None
9797
xlsx_public_url = _upload_xlsx(dataset_id, xlsx_path) if xlsx_path else None
9898
return json_public_url, csv_public_url, xlsx_public_url
99+
100+
101+
def delete_files_for_dataset(dataset_id: str) -> None:
    """Delete every stored object that belongs to ``dataset_id``.

    Deletion is best-effort: any exception raised while listing or deleting
    is logged as a warning and swallowed so that the caller can carry on.

    Args:
        dataset_id: Identifier of the dataset whose files should be removed.
            An empty value is rejected, because an empty prefix would match
            every object in the bucket.
    """
    logger.info(f"Deleting files for dataset {dataset_id}")
    if not dataset_id:
        logger.warning("Refusing to delete files for an empty dataset id")
        return
    try:
        client = _get_client()
        # Object keys appear to be laid out as "<dataset_id>/<file>", so the
        # trailing slash keeps a dataset such as "abc" from also matching the
        # files of "abc_2".
        list_kwargs = {"Bucket": BUCKET_NAME, "Prefix": f"{dataset_id}/"}
        while True:
            response = client.list_objects_v2(**list_kwargs)
            contents = response.get("Contents")
            if contents:
                result = client.delete_objects(
                    Bucket=BUCKET_NAME,
                    Delete={"Objects": [{"Key": obj["Key"]} for obj in contents]},
                )
                # delete_objects reports per-key failures in the response
                # body instead of raising, so surface them explicitly.
                errors = result.get("Errors")
                if errors:
                    logger.warning(
                        f"Failed to delete some files for dataset {dataset_id}: "
                        f"{errors}"
                    )
            # Listings are paginated; keep going until the last page.
            if not response.get("IsTruncated"):
                break
            list_kwargs["ContinuationToken"] = response["NextContinuationToken"]
    except Exception as e:
        logger.warning(f"Failed to delete files with error {e}")

tests/test_pipeline.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,11 +111,15 @@ def test_process_deleted_datasets(mocker: MockerFixture) -> None:
111111
patch_delete_dataset = mocker.patch(
112112
"oc4ids_datastore_pipeline.pipeline.delete_dataset"
113113
)
114+
patch_delete_files_for_dataset = mocker.patch(
115+
"oc4ids_datastore_pipeline.pipeline.delete_files_for_dataset"
116+
)
114117

115118
registered_datasets = {"test_dataset": "https://test_dataset.json"}
116119
process_deleted_datasets(registered_datasets)
117120

118121
patch_delete_dataset.assert_called_once_with("old_dataset")
122+
patch_delete_files_for_dataset.assert_called_once_with("old_dataset")
119123

120124

121125
def test_process_dataset_catches_exception(mocker: MockerFixture) -> None:

tests/test_storage.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pytest
77
from pytest_mock import MockerFixture
88

9-
from oc4ids_datastore_pipeline.storage import upload_files
9+
from oc4ids_datastore_pipeline.storage import delete_files_for_dataset, upload_files
1010

1111

1212
@pytest.fixture(autouse=True)
@@ -185,3 +185,32 @@ def test_upload_files_xlsx_catches_upload_exception(mock_client: MagicMock) -> N
185185
== "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset_csv.zip" # noqa: E501
186186
)
187187
assert xlsx_public_url is None
188+
189+
190+
def test_delete_files_for_dataset(mock_client: MagicMock) -> None:
    """All keys listed for the dataset are sent in one delete_objects call."""
    stored_keys = (
        "test_dataset/test_dataset.json",
        "test_dataset/test_dataset_csv.zip",
        "test_dataset/test_dataset.xlsx",
    )
    listing = {"Contents": [{"Key": key} for key in stored_keys]}
    mock_client.list_objects_v2.return_value = listing

    delete_files_for_dataset("test_dataset")

    # Build the expectation independently of the listing handed to the mock.
    expected_delete = {"Objects": [{"Key": key} for key in stored_keys]}
    mock_client.delete_objects.assert_called_once_with(
        Bucket="test-bucket",
        Delete=expected_delete,
    )
211+
212+
213+
def test_delete_files_for_dataset_catches_exception(mock_client: MagicMock) -> None:
    """A failure while listing objects must not propagate to the caller."""
    listing_failure = Exception("Mock exception")
    mock_client.list_objects_v2.side_effect = listing_failure

    # The test passes as long as this call returns without raising.
    delete_files_for_dataset("test_dataset")

0 commit comments

Comments
 (0)