Skip to content

Commit af5db08

Browse files
feat: save valid datasets to database
1 parent aa347fc commit af5db08

File tree

5 files changed

+89
-1
lines changed

5 files changed

+89
-1
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ A Python application to validate and store published OC4IDS datasets.
77
### Prerequisites
88

99
- Python 3.12
10+
- Postgres
1011

1112
### Install Python requirements
1213

@@ -16,6 +17,12 @@ source .venv/bin/activate
1617
pip install -r requirements_dev.txt
1718
```
1819

20+
### Set database environment variable
21+
22+
```
23+
export DATABASE_URL="postgresql://oc4ids_datastore@localhost/oc4ids_datastore"
24+
```
25+
1926
### Run app
2027

2128
```
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import datetime
2+
import logging
3+
import os
4+
5+
from sqlalchemy import (
6+
DateTime,
7+
Engine,
8+
String,
9+
create_engine,
10+
)
11+
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
12+
13+
logger = logging.getLogger(__name__)
14+
15+
_engine = None
16+
17+
18+
class Base(DeclarativeBase):
    """Declarative base shared by all ORM models in this module."""
20+
21+
22+
class Dataset(Base):
    """ORM model recording the stored copy of one published OC4IDS dataset.

    One row per dataset; rows are upserted via ``Session.merge`` in
    ``save_dataset``, keyed on ``dataset_id``.
    """

    __tablename__ = "dataset"

    # Registry identifier of the dataset; primary key, so saving again updates in place.
    dataset_id: Mapped[str] = mapped_column(String, primary_key=True)
    # URL the dataset was originally downloaded from.
    source_url: Mapped[str] = mapped_column(String)
    # Publisher name taken from the dataset's own metadata (may be "" when absent).
    publisher_name: Mapped[str] = mapped_column(String)
    # Location of the stored JSON copy (currently a local "data/<name>.json" path).
    json_url: Mapped[str] = mapped_column(String)
    # Timezone-aware timestamp of the last successful processing run.
    updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
30+
31+
32+
def get_engine() -> Engine:
    """Return the process-wide SQLAlchemy engine, creating it on first use.

    The connection string is read from the ``DATABASE_URL`` environment
    variable; a ``KeyError`` is raised if it is not set.
    """
    global _engine
    if _engine is not None:
        return _engine
    _engine = create_engine(os.environ["DATABASE_URL"])
    return _engine
37+
38+
39+
def create_tables() -> None:
    """Create every table declared on ``Base`` in the configured database."""
    logger.info("Creating database tables")
    engine = get_engine()
    Base.metadata.create_all(engine)
    logger.info("Created tables successfully")
43+
44+
45+
def save_dataset(dataset: Dataset) -> None:
    """Insert or update *dataset* in the database (upsert keyed on its primary key)."""
    session = Session(get_engine())
    try:
        # merge() inserts a new row or updates the existing one with the same id.
        session.merge(dataset)
        session.commit()
    finally:
        # Equivalent to the Session context manager: always release the connection.
        session.close()

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime
12
import json
23
import logging
34
import os
@@ -6,6 +7,8 @@
67
import requests
78
from libcoveoc4ids.api import oc4ids_json_output
89

10+
from oc4ids_datastore_pipeline.database import Dataset, create_tables, save_dataset
11+
912
logger = logging.getLogger(__name__)
1013

1114

@@ -62,12 +65,34 @@ def write_json_to_file(file_name: str, json_data: Any) -> None:
6265
raise Exception("Error while writing to JSON file", e)
6366

6467

68+
def save_dataset_metadata(
    dataset_name: str, source_url: str, publisher_name: str, file_name: str
) -> None:
    """Record (upsert) the metadata row for a freshly processed dataset.

    Args:
        dataset_name: Registry identifier; used as the primary key.
        source_url: URL the dataset was downloaded from.
        publisher_name: Publisher name from the dataset's own metadata.
        file_name: Location of the stored JSON copy.
    """
    logger.info(f"Saving metadata for dataset {dataset_name}")
    now = datetime.datetime.now(datetime.UTC)
    save_dataset(
        Dataset(
            dataset_id=dataset_name,
            source_url=source_url,
            publisher_name=publisher_name,
            json_url=file_name,
            updated_at=now,
        )
    )
80+
81+
6582
def process_dataset(dataset_name: str, dataset_url: str) -> None:
    """Download, validate, store, and register one dataset.

    Failures are logged and swallowed so that one bad dataset does not
    abort the rest of the pipeline run.
    """
    logger.info(f"Processing dataset {dataset_name}")
    try:
        data = download_json(dataset_url)
        validate_json(dataset_name, data)
        file_name = f"data/{dataset_name}.json"
        write_json_to_file(file_name, data)
        # Publisher name comes from the dataset itself; fall back to "" if absent.
        publisher = data.get("publisher", {}).get("name", "")
        save_dataset_metadata(
            dataset_name=dataset_name,
            source_url=dataset_url,
            publisher_name=publisher,
            file_name=file_name,
        )
        logger.info(f"Processed dataset {dataset_name}")
    except Exception as e:
        # Deliberate best-effort: record the failure and move on.
        logger.warning(f"Failed to process dataset {dataset_name} with error {e}")
@@ -80,4 +105,5 @@ def process_datasets() -> None:
80105

81106

82107
def run() -> None:
    """Pipeline entry point: ensure the database schema exists, then process all datasets."""
    create_tables()
    process_datasets()

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ version = "0.1.0"
99
readme = "README.md"
1010
dependencies = [
1111
"libcoveoc4ids",
12+
"psycopg2",
1213
"requests",
14+
"sqlalchemy",
1315
]
1416

1517
[project.optional-dependencies]

requirements_dev.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ platformdirs==4.3.6
103103
# requests-cache
104104
pluggy==1.5.0
105105
# via pytest
106+
psycopg2==2.9.10
107+
# via oc4ids-datastore-pipeline (pyproject.toml)
106108
pycodestyle==2.12.1
107109
# via flake8
108110
pycparser==2.22
@@ -146,6 +148,8 @@ six==1.17.0
146148
# via
147149
# rfc3339-validator
148150
# url-normalize
151+
sqlalchemy==2.0.37
152+
# via oc4ids-datastore-pipeline (pyproject.toml)
149153
transaction==5.0
150154
# via zodb
151155
types-requests==2.32.0.20241016
@@ -154,6 +158,7 @@ typing-extensions==4.12.2
154158
# via
155159
# mypy
156160
# referencing
161+
# sqlalchemy
157162
url-normalize==1.4.3
158163
# via requests-cache
159164
urllib3==2.3.0

0 commit comments

Comments
 (0)