Skip to content

Commit 733a5b4

Browse files
feat: transform JSON to CSV and XLSX formats
1 parent 192a9e7 commit 733a5b4

File tree

6 files changed

+100
-6
lines changed

6 files changed

+100
-6
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""add csv and xlsx columns to dataset table
2+
3+
Revision ID: 084c39bf418e
4+
Revises: 85905d23accc
5+
Create Date: 2025-02-05 11:10:03.114086
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
from alembic import op
12+
import sqlalchemy as sa
13+
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = "084c39bf418e"
17+
down_revision: Union[str, None] = "85905d23accc"
18+
branch_labels: Union[str, Sequence[str], None] = None
19+
depends_on: Union[str, Sequence[str], None] = None
20+
21+
22+
def upgrade() -> None:
23+
# ### commands auto generated by Alembic - please adjust! ###
24+
op.add_column("dataset", sa.Column("csv_url", sa.String(), nullable=True))
25+
op.add_column("dataset", sa.Column("xlsx_url", sa.String(), nullable=True))
26+
# ### end Alembic commands ###
27+
28+
29+
def downgrade() -> None:
30+
# ### commands auto generated by Alembic - please adjust! ###
31+
op.drop_column("dataset", "xlsx_url")
32+
op.drop_column("dataset", "csv_url")
33+
# ### end Alembic commands ###

oc4ids_datastore_pipeline/database.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ class Dataset(Base):
2929
license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
3030
license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True)
3131
json_url: Mapped[str] = mapped_column(String)
32+
csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
33+
xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
3234
updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
3335

3436

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import json
33
import logging
44
import os
5-
from typing import Any
5+
from pathlib import Path
6+
from typing import Any, Optional
67

8+
import flattentool
79
import requests
810
from libcoveoc4ids.api import oc4ids_json_output
911

@@ -52,8 +54,33 @@ def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
5254
raise Exception("Error while writing to JSON file", e)
5355

5456

57+
def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
    """Flatten an OC4IDS JSON file into CSV and XLSX outputs next to it.

    Args:
        json_path: Path to the JSON file to transform.

    Returns:
        A ``(csv_path, xlsx_path)`` tuple: ``csv_path`` is the base output
        path flattentool wrote CSVs under, ``xlsx_path`` the generated
        ``.xlsx`` file. Both are ``None`` if the transformation failed.
    """
    logger.info(f"Transforming {json_path}")
    try:
        path = Path(json_path)
        # flattentool derives both outputs from a single base path
        # (CSVs under `<base>`, workbook at `<base>.xlsx`), so compute it
        # once instead of re-deriving it for every use.
        output_base = path.parent / path.stem
        flattentool.flatten(
            json_path,
            output_name=str(output_base),
            root_list_path="projects",
            main_sheet_name="projects",
        )  # type: ignore[no-untyped-call]
        csv_path = str(output_base)
        xlsx_path = f"{output_base}.xlsx"
        logger.info(f"Transformed to CSV at {csv_path}")
        logger.info(f"Transformed to XLSX at {xlsx_path}")
        return csv_path, xlsx_path
    except Exception as e:
        # Deliberate best-effort: a failed transform must not abort the
        # pipeline run; the dataset simply gets no CSV/XLSX URLs.
        logger.warning(f"Failed to transform JSON to CSV and XLSX with error {e}")
        return None, None
75+
76+
5577
def save_dataset_metadata(
56-
dataset_name: str, source_url: str, json_data: dict[str, Any], json_url: str
78+
dataset_name: str,
79+
source_url: str,
80+
json_data: dict[str, Any],
81+
json_url: str,
82+
csv_url: Optional[str],
83+
xlsx_url: Optional[str],
5784
) -> None:
5885
logger.info(f"Saving metadata for dataset {dataset_name}")
5986
publisher_name = json_data.get("publisher", {}).get("name", "")
@@ -66,6 +93,8 @@ def save_dataset_metadata(
6693
license_url=license_url,
6794
license_name=license_name,
6895
json_url=json_url,
96+
csv_url=csv_url,
97+
xlsx_url=xlsx_url,
6998
updated_at=datetime.datetime.now(datetime.UTC),
7099
)
71100
save_dataset(dataset)
@@ -76,12 +105,17 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
76105
try:
77106
json_data = download_json(dataset_url)
78107
validate_json(dataset_name, json_data)
79-
json_url = write_json_to_file(f"data/{dataset_name}.json", json_data)
108+
json_path = write_json_to_file(
109+
f"data/{dataset_name}/{dataset_name}.json", json_data
110+
)
111+
csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
80112
save_dataset_metadata(
81113
dataset_name=dataset_name,
82114
source_url=dataset_url,
83115
json_data=json_data,
84-
json_url=json_url,
116+
json_url=json_path,
117+
csv_url=csv_path,
118+
xlsx_url=xlsx_path,
85119
)
86120
logger.info(f"Processed dataset {dataset_name}")
87121
except Exception as e:

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ version = "0.1.0"
99
readme = "README.md"
1010
dependencies = [
1111
"alembic",
12+
"flattentool",
1213
"libcoveoc4ids",
1314
"psycopg2",
1415
"requests",
@@ -40,7 +41,7 @@ max-line-length = 88
4041
strict = true
4142

4243
[[tool.mypy.overrides]]
43-
module = ["libcoveoc4ids.*"]
44+
module = ["libcoveoc4ids.*", "flattentool.*"]
4445
follow_untyped_imports = true
4546

4647
[tool.pytest.ini_options]

requirements_dev.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ flake8==7.1.1
4242
flake8-pyproject==1.2.3
4343
# via oc4ids-datastore-pipeline (pyproject.toml)
4444
flattentool==0.27.0
45-
# via libcove
45+
# via
46+
# libcove
47+
# oc4ids-datastore-pipeline (pyproject.toml)
4648
idna==3.10
4749
# via requests
4850
ijson==3.3.0

tests/test_pipeline.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from oc4ids_datastore_pipeline.pipeline import (
99
download_json,
1010
process_dataset,
11+
transform_to_csv_and_xlsx,
1112
validate_json,
1213
write_json_to_file,
1314
)
@@ -80,6 +81,27 @@ def test_write_json_to_file_raises_failure_exception(mocker: MockerFixture) -> N
8081
assert "Mocked exception" in str(exc_info.value)
8182

8283

84+
def test_transform_to_csv_and_xlsx_returns_correct_paths(mocker: MockerFixture) -> None:
    """Output paths are derived from the input JSON path's directory and stem."""
    mocker.patch("oc4ids_datastore_pipeline.pipeline.flattentool.flatten")

    result = transform_to_csv_and_xlsx("dir/dataset/dataset.json")

    assert result == ("dir/dataset/dataset", "dir/dataset/dataset.xlsx")
91+
92+
93+
def test_transform_to_csv_and_xlsx_catches_exception(mocker: MockerFixture) -> None:
    """A flattentool failure is swallowed and reported as (None, None)."""
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.flattentool.flatten",
        side_effect=Exception("Mocked exception"),
    )

    csv_path, xlsx_path = transform_to_csv_and_xlsx("dir/dataset/dataset.json")

    assert csv_path is None
    assert xlsx_path is None
103+
104+
83105
def test_process_dataset_catches_exception(mocker: MockerFixture) -> None:
84106
patch_download_json = mocker.patch(
85107
"oc4ids_datastore_pipeline.pipeline.download_json"

0 commit comments

Comments
 (0)