Commit 6b237b9

feat: exception handling for file uploads
1 parent 31e7f4f commit 6b237b9

6 files changed, +230 −43 lines

.env.test

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+ENABLE_UPLOAD=1
+BUCKET_REGION="test-region"
+BUCKET_NAME="test-bucket"
+BUCKET_ACCESS_KEY_ID="test-id"
+BUCKET_ACCESS_KEY_SECRET="test-secret"
Lines changed: 0 additions & 4 deletions
@@ -1,13 +1,9 @@
 import logging
 import time
 
-from dotenv import load_dotenv
-
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s:%(levelname)s:%(name)s:%(message)s",
     datefmt="%Y-%m-%dT%H:%M:%S",
 )
 logging.Formatter.converter = time.gmtime
-
-load_dotenv()

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 3 additions & 1 deletion
@@ -7,6 +7,7 @@
 
 import flattentool
 import requests
+from dotenv import load_dotenv
 from libcoveoc4ids.api import oc4ids_json_output
 
 from oc4ids_datastore_pipeline.database import (
@@ -116,7 +117,7 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
     )
     csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
     json_public_url, csv_public_url, xlsx_public_url = upload_files(
-        json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+        dataset_name, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
     )
     save_dataset_metadata(
         dataset_name=dataset_name,
@@ -148,4 +149,5 @@ def process_registry() -> None:
 
 
 def run() -> None:
+    load_dotenv()
     process_registry()
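
The net effect of the pipeline.py change is that .env is read when the pipeline actually runs rather than at import time, and (as the storage.py diff below shows) the upload flag is read per call. A minimal standalone sketch of that pattern (illustration only, not code from this commit; upload_enabled is a hypothetical helper):

import os

from dotenv import load_dotenv


def upload_enabled() -> bool:
    # Read the flag at call time, so tests can flip ENABLE_UPLOAD per test
    # without re-importing the module.
    return bool(int(os.environ.get("ENABLE_UPLOAD", "0")))


def run() -> None:
    # Mirrors the diff: load the .env file inside the entrypoint instead of
    # at module import, so importing the package has no side effects.
    load_dotenv()
    print("uploads enabled" if upload_enabled() else "uploads disabled")


if __name__ == "__main__":
    run()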

oc4ids_datastore_pipeline/storage.py

Lines changed: 50 additions & 37 deletions
@@ -10,8 +10,6 @@
 logger = logging.getLogger(__name__)
 
 
-# TODO:
-ENABLE_UPLOAD = os.environ.get("ENABLE_UPLOAD", "0")
 BUCKET_REGION = os.environ.get("BUCKET_REGION")
 BUCKET_NAME = os.environ.get("BUCKET_NAME")
 BUCKET_ACCESS_KEY_ID = os.environ.get("BUCKET_ACCESS_KEY_ID")
@@ -30,8 +28,7 @@ def _get_client() -> Any:
     )
 
 
-def upload_file(local_path: str, content_type: str) -> str:
-    bucket_path = os.path.relpath(local_path, "data")
+def _upload_file(local_path: str, bucket_path: str, content_type: str) -> str:
     logger.info(f"Uploading file {local_path}")
     client = _get_client()
     client.upload_file(
@@ -40,49 +37,65 @@ def upload_file(local_path: str, content_type: str) -> str:
         bucket_path,
         ExtraArgs={"ACL": "public-read", "ContentType": content_type},
     )
-    return (
+    public_url = (
         f"https://{BUCKET_NAME}.{BUCKET_REGION}.digitaloceanspaces.com/" + bucket_path
     )
-
-
-def upload_json(json_path: str) -> str:
-    return upload_file(json_path, content_type="application/json")
-
-
-def upload_csv(csv_path: str) -> str:
-    directory = Path(csv_path)
-    zip_file_path = f"{csv_path}_csv.zip"
-    with zipfile.ZipFile(zip_file_path, mode="w") as archive:
-        for file_path in directory.rglob("*"):
-            archive.write(file_path, arcname=file_path.relative_to(directory))
-    return upload_file(zip_file_path, content_type="application/zip")
-
-
-def upload_xlsx(xlsx_path: str) -> str:
-    return upload_file(
-        xlsx_path,
-        content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa: E501
-    )
+    logger.info(f"Uploaded to {public_url}")
+    return public_url
+
+
+def _upload_json(dataset_id: str, json_path: str) -> Optional[str]:
+    try:
+        return _upload_file(
+            local_path=json_path,
+            bucket_path=f"{dataset_id}/{dataset_id}.json",
+            content_type="application/json",
+        )
+    except Exception as e:
+        logger.warning(f"Failed to upload {json_path} with error {e}")
+        return None
+
+
+def _upload_csv(dataset_id: str, csv_path: str) -> Optional[str]:
+    try:
+        directory = Path(csv_path)
+        zip_file_path = f"{csv_path}_csv.zip"
+        with zipfile.ZipFile(zip_file_path, mode="w") as archive:
+            for file_path in directory.rglob("*"):
+                archive.write(file_path, arcname=file_path.relative_to(directory))
+        return _upload_file(
+            local_path=zip_file_path,
+            bucket_path=f"{dataset_id}/{dataset_id}_csv.zip",
+            content_type="application/zip",
+        )
+    except Exception as e:
+        logger.warning(f"Failed to upload {csv_path} with error {e}")
+        return None
+
+
+def _upload_xlsx(dataset_id: str, xlsx_path: str) -> Optional[str]:
+    try:
+        return _upload_file(
+            local_path=xlsx_path,
+            bucket_path=f"{dataset_id}/{dataset_id}.xlsx",
+            content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa: E501
+        )
+    except Exception as e:
+        logger.warning(f"Failed to upload {xlsx_path} with error {e}")
+        return None
 
 
 def upload_files(
+    dataset_id: str,
     json_path: Optional[str] = None,
     csv_path: Optional[str] = None,
     xlsx_path: Optional[str] = None,
 ) -> tuple[Optional[str], Optional[str], Optional[str]]:
     # TODO: Option to delete local files once uploaded?
-    # TODO: Exception handling
-    if not bool(int(ENABLE_UPLOAD)):
+    if not bool(int(os.environ.get("ENABLE_UPLOAD", "0"))):
         logger.info("Upload is disabled, skipping")
         return None, None, None
-    logger.info("Uploading files")
-    if json_path:
-        json_public_url = upload_json(json_path)
-        logger.info(f"Uploaded JSON file to {json_public_url}")
-    if csv_path:
-        csv_public_url = upload_csv(csv_path)
-        logger.info(f"Uploaded CSV zip file to {csv_public_url}")
-    if xlsx_path:
-        xlsx_public_url = upload_xlsx(xlsx_path)
-        logger.info(f"Uploaded XLSX file to {xlsx_public_url}")
+    json_public_url = _upload_json(dataset_id, json_path) if json_path else None
+    csv_public_url = _upload_csv(dataset_id, csv_path) if csv_path else None
+    xlsx_public_url = _upload_xlsx(dataset_id, xlsx_path) if xlsx_path else None
     return json_public_url, csv_public_url, xlsx_public_url
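
With each _upload_* wrapper catching its own exception, upload_files now returns one public URL per format, with None standing in for any format whose upload failed (or when upload is disabled). A minimal sketch of how a caller might consume that result (illustration only, assuming the upload_files signature shown above; upload_and_report is a hypothetical helper, not part of this commit):

import logging
from typing import Optional

from oc4ids_datastore_pipeline.storage import upload_files

logger = logging.getLogger(__name__)


def upload_and_report(
    dataset_name: str, json_path: str, csv_path: str, xlsx_path: str
) -> dict[str, Optional[str]]:
    # upload_files returns one Optional URL per format; a failed upload is
    # logged inside storage.py and simply surfaces here as None.
    json_url, csv_url, xlsx_url = upload_files(
        dataset_name, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
    )
    urls = {"json": json_url, "csv": csv_url, "xlsx": xlsx_url}
    for fmt, url in urls.items():
        if url is None:
            logger.warning(f"No public {fmt} URL for {dataset_name}")
    return urls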

tests/conftest.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from dotenv import find_dotenv, load_dotenv
+
+load_dotenv(find_dotenv(".env.test"))
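
pytest imports tests/conftest.py before collecting tests, so load_dotenv(find_dotenv(".env.test")) makes the values from .env.test available through os.environ in every test. A minimal sketch of a test relying on that (hypothetical example, not part of this commit; note that load_dotenv does not override variables already set in the environment):

import os


def test_env_test_values_are_loaded() -> None:
    # These values come from .env.test, loaded by tests/conftest.py.
    assert os.environ["BUCKET_NAME"] == "test-bucket"
    assert os.environ["BUCKET_REGION"] == "test-region"
    assert os.environ["BUCKET_ACCESS_KEY_ID"] == "test-id"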

tests/test_storage.py

Lines changed: 169 additions & 1 deletion
@@ -1 +1,169 @@
-# TODO: Implement tests
+import os
+import tempfile
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+from pytest_mock import MockerFixture
+
+from oc4ids_datastore_pipeline.storage import upload_files
+
+
+@pytest.fixture(autouse=True)
+def mock_client(mocker: MockerFixture) -> Any:
+    os.environ["ENABLE_UPLOAD"] = "1"
+    mock_boto3_client = MagicMock()
+    patch_boto3_client = mocker.patch(
+        "oc4ids_datastore_pipeline.storage.boto3.session.Session.client"
+    )
+    patch_boto3_client.return_value = mock_boto3_client
+    return mock_boto3_client
+
+
+def test_upload_files_upload_disabled(mock_client: MagicMock) -> None:
+    os.environ["ENABLE_UPLOAD"] = "0"
+
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        "test_dataset",
+        json_path="dataset.json",
+        csv_path="dataset_csv.zip",
+        xlsx_path="dataset.xlsx",
+    )
+
+    mock_client.assert_not_called()
+    assert json_public_url is None
+    assert csv_public_url is None
+    assert xlsx_public_url is None
+
+
+def test_upload_files_nothing_to_upload(mock_client: MagicMock) -> None:
+    json_public_url, csv_public_url, xlsx_public_url = upload_files("test_dataset")
+
+    mock_client.assert_not_called()
+    assert json_public_url is None
+    assert csv_public_url is None
+    assert xlsx_public_url is None
+
+
+def test_upload_files_json(mock_client: MagicMock) -> None:
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        "test_dataset", json_path="data/test_dataset/test_dataset.json"
+    )
+
+    mock_client.upload_file.assert_called_once_with(
+        "data/test_dataset/test_dataset.json",
+        "test-bucket",
+        "test_dataset/test_dataset.json",
+        ExtraArgs={"ACL": "public-read", "ContentType": "application/json"},
+    )
+    assert (
+        json_public_url
+        == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.json"  # noqa: E501
+    )
+    assert csv_public_url is None
+    assert xlsx_public_url is None
+
+
+def test_upload_files_json_catches_exception(mock_client: MagicMock) -> None:
+    mock_client.upload_file.side_effect = [Exception("Mock exception"), None, None]
+
+    with tempfile.TemporaryDirectory() as csv_dir:
+        json_public_url, csv_public_url, xlsx_public_url = upload_files(
+            "test_dataset",
+            json_path="data/test_dataset/test_dataset.json",
+            csv_path=csv_dir,
+            xlsx_path="data/test_dataset/test_dataset.xlsx",
+        )
+        assert json_public_url is None
+        assert (
+            csv_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset_csv.zip"  # noqa: E501
+        )
+        assert (
+            xlsx_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.xlsx"  # noqa: E501
+        )
+
+
+def test_upload_files_csv(mock_client: MagicMock) -> None:
+    with tempfile.TemporaryDirectory() as csv_dir:
+        json_public_url, csv_public_url, xlsx_public_url = upload_files(
+            "test_dataset", csv_path=csv_dir
+        )
+
+    mock_client.upload_file.assert_called_once_with(
+        f"{csv_dir}_csv.zip",
+        "test-bucket",
+        "test_dataset/test_dataset_csv.zip",
+        ExtraArgs={"ACL": "public-read", "ContentType": "application/zip"},
+    )
+    assert json_public_url is None
+    assert (
+        csv_public_url
+        == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset_csv.zip"  # noqa: E501
+    )
+    assert xlsx_public_url is None
+
+
+def test_upload_files_csv_catches_exception(mock_client: MagicMock) -> None:
+    mock_client.upload_file.side_effect = [None, Exception("Mock exception"), None]
+
+    with tempfile.TemporaryDirectory() as csv_dir:
+        json_public_url, csv_public_url, xlsx_public_url = upload_files(
+            "test_dataset",
+            json_path="data/test_dataset/test_dataset.json",
+            csv_path=csv_dir,
+            xlsx_path="data/test_dataset/test_dataset.xlsx",
+        )
+        assert (
+            json_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.json"  # noqa: E501
+        )
+        assert csv_public_url is None
+        assert (
+            xlsx_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.xlsx"  # noqa: E501
+        )
+
+
+def test_upload_files_xlsx(mock_client: MagicMock) -> None:
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        "test_dataset", xlsx_path="data/test_dataset/test_dataset.xlsx"
+    )
+
+    mock_client.upload_file.assert_called_once_with(
+        "data/test_dataset/test_dataset.xlsx",
+        "test-bucket",
+        "test_dataset/test_dataset.xlsx",
+        ExtraArgs={
+            "ACL": "public-read",
+            "ContentType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa: E501
+        },
+    )
+    assert json_public_url is None
+    assert csv_public_url is None
+    assert (
+        xlsx_public_url
+        == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.xlsx"  # noqa: E501
+    )
+
+
+def test_upload_files_xlsx_catches_exception(mock_client: MagicMock) -> None:
+    mock_client.upload_file.side_effect = [None, None, Exception("Mock exception")]
+
+    with tempfile.TemporaryDirectory() as csv_dir:
+        json_public_url, csv_public_url, xlsx_public_url = upload_files(
+            "test_dataset",
+            json_path="data/test_dataset/test_dataset.json",
+            csv_path=csv_dir,
+            xlsx_path="data/test_dataset/test_dataset.xlsx",
+        )
+        assert (
+            json_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset.json"  # noqa: E501
+        )
+        assert (
+            csv_public_url
+            == "https://test-bucket.test-region.digitaloceanspaces.com/test_dataset/test_dataset_csv.zip"  # noqa: E501
+        )
+        assert xlsx_public_url is None

0 commit comments
