2
2
import json
3
3
import logging
4
4
import os
5
- from typing import Any
5
+ from pathlib import Path
6
+ from typing import Any , Optional
6
7
8
+ import flattentool
7
9
import requests
8
10
from libcoveoc4ids .api import oc4ids_json_output
9
11
@@ -52,8 +54,34 @@ def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
52
54
raise Exception ("Error while writing to JSON file" , e )
53
55
54
56
57
def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
    """Flatten a JSON file into CSV and XLSX outputs placed next to it.

    The output base name is the JSON path with its final suffix removed
    (``data/foo/foo.json`` -> ``data/foo/foo``). That single base is handed
    to ``flattentool.flatten`` and is also used to build both returned
    paths, so the reported locations always agree with the name given to
    flattentool.

    This is a best-effort step: any failure is logged as a warning and
    reported to the caller as ``(None, None)`` instead of being raised.

    Args:
        json_path: Path of the JSON file to transform.

    Returns:
        ``(csv_path, xlsx_path)`` on success, ``(None, None)`` on failure.
        ``csv_path`` is the bare output base and ``xlsx_path`` is the base
        with ``.xlsx`` appended.
    """
    logger.info(f"Transforming {json_path}")
    try:
        path = Path(json_path)
        # Compute the base once so the name passed to flattentool and the
        # paths returned to the caller cannot drift apart.
        output_base = str(path.parent / path.stem)
        # TODO: Files already exist? Delete before starting?
        flattentool.flatten(
            json_path,
            output_name=output_base,
            root_list_path="projects",
            main_sheet_name="projects",
        )  # type: ignore[no-untyped-call]
        # NOTE(review): presumably flattentool writes the CSV sheets into a
        # directory named exactly ``output_base`` and the workbook to
        # ``output_base + ".xlsx"`` -- confirm against the flattentool docs.
        csv_path = output_base
        xlsx_path = f"{output_base}.xlsx"
        logger.info(f"Transformed to CSV at {csv_path}")
        logger.info(f"Transformed to XLSX at {xlsx_path}")
        return csv_path, xlsx_path
    except Exception as e:
        # Deliberately broad: a failed transformation must not abort the
        # caller, which treats missing CSV/XLSX outputs as optional.
        logger.warning(f"Failed to transform JSON to CSV and XLSX with error {e}")
        return None, None
76
+
77
+
55
78
def save_dataset_metadata (
56
- dataset_name : str , source_url : str , json_data : dict [str , Any ], json_url : str
79
+ dataset_name : str ,
80
+ source_url : str ,
81
+ json_data : dict [str , Any ],
82
+ json_url : str ,
83
+ csv_url : Optional [str ],
84
+ xlsx_url : Optional [str ],
57
85
) -> None :
58
86
logger .info (f"Saving metadata for dataset { dataset_name } " )
59
87
publisher_name = json_data .get ("publisher" , {}).get ("name" , "" )
@@ -66,6 +94,8 @@ def save_dataset_metadata(
66
94
license_url = license_url ,
67
95
license_name = license_name ,
68
96
json_url = json_url ,
97
+ csv_url = csv_url ,
98
+ xlsx_url = xlsx_url ,
69
99
updated_at = datetime .datetime .now (datetime .UTC ),
70
100
)
71
101
save_dataset (dataset )
@@ -76,12 +106,17 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
76
106
try :
77
107
json_data = download_json (dataset_url )
78
108
validate_json (dataset_name , json_data )
79
- json_url = write_json_to_file (f"data/{ dataset_name } .json" , json_data )
109
+ json_path = write_json_to_file (
110
+ f"data/{ dataset_name } /{ dataset_name } .json" , json_data
111
+ )
112
+ csv_path , xlsx_path = transform_to_csv_and_xlsx (json_path )
80
113
save_dataset_metadata (
81
114
dataset_name = dataset_name ,
82
115
source_url = dataset_url ,
83
116
json_data = json_data ,
84
- json_url = json_url ,
117
+ json_url = json_path ,
118
+ csv_url = csv_path ,
119
+ xlsx_url = xlsx_path ,
85
120
)
86
121
logger .info (f"Processed dataset { dataset_name } " )
87
122
except Exception as e :
0 commit comments