Commit ba5dcef

feat: upload files to s3-compatible storage bucket
Parent: 318b683

File tree

8 files changed (+136, -4 lines)


.gitignore

Lines changed: 2 additions & 0 deletions

@@ -1,4 +1,6 @@
 .venv
 __pycache__
 
+.env
+
 data/

README.md

Lines changed: 4 additions & 0 deletions

@@ -29,6 +29,10 @@ export DATABASE_URL="postgresql://oc4ids_datastore@localhost/oc4ids_datastore"
 alembic upgrade head
 ```
 
+### S3 environment variables
+
+<!-- TODO: Document -->
+
 ### Run app
 
 ```
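The new "S3 environment variables" section is still marked TODO. Going by the variables that oc4ids_datastore_pipeline/storage.py (below) reads at import time, a minimal .env could look like the following sketch; every value here is a placeholder:

ENABLE_UPLOAD=1
BUCKET_REGION=fra1
BUCKET_NAME=example-bucket
BUCKET_ACCESS_KEY_ID=your-access-key-id
BUCKET_ACCESS_KEY_SECRET=your-access-key-secret

ENABLE_UPLOAD is parsed with bool(int(...)), so it should be 0 or 1.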
Lines changed: 4 additions & 0 deletions

@@ -1,9 +1,13 @@
 import logging
 import time
 
+from dotenv import load_dotenv
+
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s:%(levelname)s:%(name)s:%(message)s",
     datefmt="%Y-%m-%dT%H:%M:%S",
 )
 logging.Formatter.converter = time.gmtime
+
+load_dotenv()
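Note that the storage module reads its BUCKET_* settings from os.environ at import time, so this load_dotenv() call must run before oc4ids_datastore_pipeline.storage is imported. A minimal sketch of the behaviour, assuming a .env file containing BUCKET_NAME=example-bucket:

import os

from dotenv import load_dotenv

load_dotenv()  # copies key=value pairs from .env into os.environ; already-set variables are not overridden
print(os.environ["BUCKET_NAME"])  # -> example-bucket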

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 9 additions & 4 deletions

@@ -19,6 +19,7 @@
     fetch_registered_datasets,
     get_license_name_from_url,
 )
+from oc4ids_datastore_pipeline.storage import upload_files
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ def save_dataset_metadata(
     dataset_name: str,
     source_url: str,
     json_data: dict[str, Any],
-    json_url: str,
+    json_url: Optional[str],
     csv_url: Optional[str],
     xlsx_url: Optional[str],
 ) -> None:
@@ -114,13 +115,16 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
             f"data/{dataset_name}/{dataset_name}.json", json_data
         )
         csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
+        json_public_url, csv_public_url, xlsx_public_url = upload_files(
+            json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+        )
         save_dataset_metadata(
             dataset_name=dataset_name,
             source_url=dataset_url,
             json_data=json_data,
-            json_url=json_path,
-            csv_url=csv_path,
-            xlsx_url=xlsx_path,
+            json_url=json_public_url,
+            csv_url=csv_public_url,
+            xlsx_url=xlsx_public_url,
         )
         logger.info(f"Processed dataset {dataset_name}")
     except Exception as e:
@@ -133,6 +137,7 @@ def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
     for dataset_id in deleted_datasets:
         logger.info(f"Dataset {dataset_id} is no longer in the registry, deleting")
         delete_dataset(dataset_id)
+        # TODO: Delete stored files
 
 
 def process_registry() -> None:
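The json_url parameter becomes Optional[str] because the stored URLs now come from the upload step rather than the local filesystem: with uploading disabled, upload_files() returns (None, None, None), so the metadata row holds no URLs instead of local paths. A sketch of the call-site behaviour (the dataset paths are hypothetical):

json_public_url, csv_public_url, xlsx_public_url = upload_files(
    json_path="data/mydataset/mydataset.json",
    csv_path="data/mydataset/mydataset_csv",
    xlsx_path="data/mydataset/mydataset.xlsx",
)
# ENABLE_UPLOAD=0 -> (None, None, None)
# ENABLE_UPLOAD=1 -> three public https:// URLs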
oc4ids_datastore_pipeline/storage.py

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+import logging
+import os
+import zipfile
+from pathlib import Path
+from typing import Any, Optional
+
+import boto3
+import botocore
+
+logger = logging.getLogger(__name__)
+
+
+ENABLE_UPLOAD = bool(int(os.environ.get("ENABLE_UPLOAD", 0)))
+BUCKET_REGION = os.environ["BUCKET_REGION"]
+BUCKET_NAME = os.environ["BUCKET_NAME"]
+BUCKET_ACCESS_KEY_ID = os.environ["BUCKET_ACCESS_KEY_ID"]
+BUCKET_ACCESS_KEY_SECRET = os.environ["BUCKET_ACCESS_KEY_SECRET"]
+
+
+def _get_client() -> Any:
+    session = boto3.session.Session()
+    return session.client(
+        "s3",
+        endpoint_url=f"https://{BUCKET_REGION}.digitaloceanspaces.com/",
+        config=botocore.config.Config(s3={"addressing_style": "virtual"}),
+        region_name=BUCKET_REGION,
+        aws_access_key_id=BUCKET_ACCESS_KEY_ID,
+        aws_secret_access_key=BUCKET_ACCESS_KEY_SECRET,
+    )
+
+
+def upload_file(local_path: str, content_type: str) -> str:
+    bucket_path = os.path.relpath(local_path, "data")
+    logger.info(f"Uploading file {local_path}")
+    client = _get_client()
+    client.upload_file(
+        local_path,
+        BUCKET_NAME,
+        bucket_path,
+        ExtraArgs={"ACL": "public-read", "ContentType": content_type},
+    )
+    return (
+        f"https://{BUCKET_NAME}.{BUCKET_REGION}.digitaloceanspaces.com/" + bucket_path
+    )
+
+
+def upload_json(json_path: str) -> str:
+    return upload_file(json_path, content_type="application/json")
+
+
+def upload_csv(csv_path: str) -> str:
+    directory = Path(csv_path)
+    zip_file_path = f"{csv_path}_csv.zip"
+    with zipfile.ZipFile(zip_file_path, mode="w") as archive:
+        for file_path in directory.rglob("*"):
+            archive.write(file_path, arcname=file_path.relative_to(directory))
+    return upload_file(zip_file_path, content_type="application/zip")
+
+
+def upload_xlsx(xlsx_path: str) -> str:
+    return upload_file(
+        xlsx_path,
+        content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa: E501
+    )
+
+
+def upload_files(
+    json_path: Optional[str] = None,
+    csv_path: Optional[str] = None,
+    xlsx_path: Optional[str] = None,
+) -> tuple[Optional[str], Optional[str], Optional[str]]:
+    # TODO: Option to delete local files once uploaded?
+    # TODO: Exception handling
+    if not ENABLE_UPLOAD:
+        logger.info("Upload is disabled, skipping")
+        return None, None, None
+    logger.info("Uploading files")
+    json_public_url = upload_json(json_path) if json_path else None
+    if json_public_url:
+        logger.info(f"Uploaded JSON file to {json_public_url}")
+    csv_public_url = upload_csv(csv_path) if csv_path else None
+    if csv_public_url:
+        logger.info(f"Uploaded CSV zip file to {csv_public_url}")
+    xlsx_public_url = upload_xlsx(xlsx_path) if xlsx_path else None
+    if xlsx_public_url:
+        logger.info(f"Uploaded XLSX file to {xlsx_public_url}")
+    return json_public_url, csv_public_url, xlsx_public_url
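To make the key and URL layout concrete (a sketch; the bucket name, region, and file path are hypothetical): upload_file() strips the leading data/ directory to build the object key, and because the client is configured for virtual-hosted addressing the bucket name appears as a subdomain:

import os

local_path = "data/mydataset/mydataset.json"       # hypothetical local file
bucket_path = os.path.relpath(local_path, "data")  # -> "mydataset/mydataset.json"
print(f"https://example-bucket.fra1.digitaloceanspaces.com/{bucket_path}")
# -> https://example-bucket.fra1.digitaloceanspaces.com/mydataset/mydataset.json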

pyproject.toml

Lines changed: 3 additions & 0 deletions

@@ -9,6 +9,7 @@ version = "0.1.0"
 readme = "README.md"
 dependencies = [
     "alembic",
+    "boto3",
     "flattentool",
     "libcoveoc4ids",
     "psycopg2",
@@ -25,6 +26,8 @@ dev = [
     "mypy",
     "pytest",
     "pytest-mock",
+    "python-dotenv",
+    "types-boto3",
     "types-requests",
 ]

requirements_dev.txt

Lines changed: 26 additions & 0 deletions

@@ -16,6 +16,14 @@ backports-datetime-fromisoformat==2.0.3
     # via flattentool
 black==25.1.0
     # via oc4ids-datastore-pipeline (pyproject.toml)
+boto3==1.36.13
+    # via oc4ids-datastore-pipeline (pyproject.toml)
+botocore==1.36.13
+    # via
+    #   boto3
+    #   s3transfer
+botocore-stubs==1.36.12
+    # via types-boto3
 btrees==6.1
     # via zodb
 cattrs==24.1.2
@@ -53,6 +61,10 @@ iniconfig==2.0.0
     # via pytest
 isort==6.0.0
     # via oc4ids-datastore-pipeline (pyproject.toml)
+jmespath==1.0.1
+    # via
+    #   boto3
+    #   botocore
 json-merge-patch==0.2
     # via ocdsextensionregistry
 jsonref==1.1.0
@@ -125,6 +137,10 @@ pytest==8.3.4
     #   pytest-mock
 pytest-mock==3.14.0
     # via oc4ids-datastore-pipeline (pyproject.toml)
+python-dateutil==2.9.0.post0
+    # via botocore
+python-dotenv==1.0.1
+    # via oc4ids-datastore-pipeline (pyproject.toml)
 pytz==2025.1
     # via flattentool
 referencing==0.36.2
@@ -150,10 +166,13 @@ rpds-py==0.22.3
     # via
     #   jsonschema
     #   referencing
+s3transfer==0.11.2
+    # via boto3
 schema==0.7.7
     # via flattentool
 six==1.17.0
     # via
+    #   python-dateutil
     #   rfc3339-validator
     #   url-normalize
 sqlalchemy==2.0.37
@@ -162,8 +181,14 @@ sqlalchemy==2.0.37
     #   oc4ids-datastore-pipeline (pyproject.toml)
 transaction==5.0
     # via zodb
+types-awscrt==0.23.9
+    # via botocore-stubs
+types-boto3==1.36.13
+    # via oc4ids-datastore-pipeline (pyproject.toml)
 types-requests==2.32.0.20241016
     # via oc4ids-datastore-pipeline (pyproject.toml)
+types-s3transfer==0.11.2
+    # via types-boto3
 typing-extensions==4.12.2
     # via
     #   alembic
@@ -174,6 +199,7 @@ url-normalize==1.4.3
     # via requests-cache
 urllib3==2.3.0
     # via
+    #   botocore
     #   requests
     #   requests-cache
     #   types-requests

tests/test_storage.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+# TODO: Implement tests
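One possible starting point for these tests (a sketch, not part of the commit) is to stub out the S3 client with the pytest-mock dependency the project already declares. It assumes the BUCKET_* environment variables are set before the storage module is imported, since they are read at module level:

# Sketch for tests/test_storage.py; assumes BUCKET_* env vars are set
# (e.g. via a .env file or the CI environment) before this import runs.
from oc4ids_datastore_pipeline.storage import upload_file


def test_upload_file_returns_public_url(mocker):
    mock_client = mocker.patch(
        "oc4ids_datastore_pipeline.storage._get_client"
    ).return_value

    url = upload_file("data/mydataset/mydataset.json", content_type="application/json")

    mock_client.upload_file.assert_called_once()
    assert url.endswith("mydataset/mydataset.json")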
