     get_dataset_ids,
     save_dataset,
 )
+from oc4ids_datastore_pipeline.notifications import send_notification
 from oc4ids_datastore_pipeline.registry import (
     fetch_registered_datasets,
     get_license_name_from_url,
@@ -24,6 +25,17 @@
 logger = logging.getLogger(__name__)
 
 
+class ProcessDatasetError(Exception):
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class ValidationError(ProcessDatasetError):
+    def __init__(self, errors_count: int, errors: list[str]):
+        message = f"Dataset has {errors_count} validation errors: {str(errors)}"
+        super().__init__(message)
+
+
 def download_json(url: str) -> Any:
     logger.info(f"Downloading json from {url}")
     try:
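For reference, a minimal sketch of how the new exception hierarchy behaves; the values below are illustrative rather than taken from the PR, and the import path is an assumption about where the edited module lives.

```python
# Illustrative sketch only: ValidationError builds its message from the error
# count and list, and subclasses ProcessDatasetError, so callers can catch either.
from oc4ids_datastore_pipeline.pipeline import (  # assumed module path
    ProcessDatasetError,
    ValidationError,
)

try:
    raise ValidationError(errors_count=2, errors=["missing id", "bad date"])
except ProcessDatasetError as e:
    print(e)  # Dataset has 2 validation errors: ['missing id', 'bad date']
```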
@@ -33,19 +45,23 @@ def download_json(url: str) -> Any:
         logger.info(f"Downloaded {url} ({response_size} bytes)")
         return r.json()
     except Exception as e:
-        raise Exception("Download failed", e)
+        raise ProcessDatasetError(f"Download failed: {str(e)}")
 
 
-def validate_json(dataset_name: str, json_data: dict[str, Any]) -> None:
-    logger.info(f"Validating dataset {dataset_name}")
+def validate_json(dataset_id: str, json_data: dict[str, Any]) -> None:
+    logger.info(f"Validating dataset {dataset_id}")
     try:
         validation_result = oc4ids_json_output(json_data=json_data)
         validation_errors_count = validation_result["validation_errors_count"]
+        validation_errors = validation_result["validation_errors"]
         if validation_errors_count > 0:
-            raise Exception(f"Dataset has {validation_errors_count} validation errors")
-        logger.info(f"Dataset {dataset_name} is valid")
+            raise ValidationError(
+                errors_count=validation_errors_count,
+                errors=validation_errors,
+            )
+        logger.info(f"Dataset {dataset_id} is valid")
     except Exception as e:
-        raise Exception("Validation failed", e)
+        raise ProcessDatasetError(f"Validation failed: {str(e)}")
 
 
 def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
@@ -57,7 +73,7 @@ def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
         logger.info(f"Finished writing to {file_name}")
         return file_name
     except Exception as e:
-        raise Exception("Error while writing to JSON file", e)
+        raise ProcessDatasetError(f"Error writing dataset to file: {e}")
 
 
 def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
@@ -76,59 +92,60 @@ def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
         logger.info(f"Transformed to XLSX at {xlsx_path}")
         return csv_path, xlsx_path
     except Exception as e:
-        logger.warning(f"Failed to transform JSON to CSV and XLSX with error {e}")
+        logger.warning(f"Failed to transform JSON to CSV and XLSX: {e}")
         return None, None
 
 
 def save_dataset_metadata(
-    dataset_name: str,
+    dataset_id: str,
     source_url: str,
     json_data: dict[str, Any],
     json_url: Optional[str],
     csv_url: Optional[str],
     xlsx_url: Optional[str],
 ) -> None:
-    logger.info(f"Saving metadata for dataset {dataset_name}")
-    publisher_name = json_data.get("publisher", {}).get("name", "")
-    license_url = json_data.get("license", None)
-    license_name = get_license_name_from_url(license_url) if license_url else None
-    dataset = Dataset(
-        dataset_id=dataset_name,
-        source_url=source_url,
-        publisher_name=publisher_name,
-        license_url=license_url,
-        license_name=license_name,
-        json_url=json_url,
-        csv_url=csv_url,
-        xlsx_url=xlsx_url,
-        updated_at=datetime.datetime.now(datetime.UTC),
-    )
-    save_dataset(dataset)
-
-
-def process_dataset(dataset_name: str, dataset_url: str) -> None:
-    logger.info(f"Processing dataset {dataset_name}")
+    logger.info(f"Saving metadata for dataset {dataset_id}")
     try:
-        json_data = download_json(dataset_url)
-        validate_json(dataset_name, json_data)
-        json_path = write_json_to_file(
-            f"data/{dataset_name}/{dataset_name}.json", json_data
-        )
-        csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
-        json_public_url, csv_public_url, xlsx_public_url = upload_files(
-            dataset_name, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+        publisher_name = json_data.get("publisher", {}).get("name", "")
+        license_url = json_data.get("license", None)
+        license_name = get_license_name_from_url(license_url) if license_url else None
+        dataset = Dataset(
+            dataset_id=dataset_id,
+            source_url=source_url,
+            publisher_name=publisher_name,
+            license_url=license_url,
+            license_name=license_name,
+            json_url=json_url,
+            csv_url=csv_url,
+            xlsx_url=xlsx_url,
+            updated_at=datetime.datetime.now(datetime.UTC),
         )
-        save_dataset_metadata(
-            dataset_name=dataset_name,
-            source_url=dataset_url,
-            json_data=json_data,
-            json_url=json_public_url,
-            csv_url=csv_public_url,
-            xlsx_url=xlsx_public_url,
-        )
-        logger.info(f"Processed dataset {dataset_name}")
+        save_dataset(dataset)
     except Exception as e:
-        logger.warning(f"Failed to process dataset {dataset_name} with error {e}")
+        raise ProcessDatasetError(f"Failed to update metadata for dataset: {e}")
+
+
+def process_dataset(dataset_id: str, source_url: str) -> None:
+    logger.info(f"Processing dataset {dataset_id}")
+    json_data = download_json(source_url)
+    validate_json(dataset_id, json_data)
+    json_path = write_json_to_file(
+        file_name=f"data/{dataset_id}/{dataset_id}.json",
+        json_data=json_data,
+    )
+    csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        dataset_id, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+    )
+    save_dataset_metadata(
+        dataset_id=dataset_id,
+        source_url=source_url,
+        json_data=json_data,
+        json_url=json_public_url,
+        csv_url=csv_public_url,
+        xlsx_url=xlsx_public_url,
+    )
+    logger.info(f"Processed dataset {dataset_id}")
 
 
 def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
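Note that the rewritten process_dataset no longer wraps its body in try/except: any stage failure now surfaces as a ProcessDatasetError for the caller to handle, as process_registry does in the next hunk. A minimal caller sketch, with a hypothetical dataset id and URL:

```python
# Hypothetical caller sketch: the dataset id and URL are placeholders, not from
# the PR. Any stage failure inside process_dataset now surfaces here.
from oc4ids_datastore_pipeline.pipeline import (  # assumed module path
    ProcessDatasetError,
    process_dataset,
)

try:
    process_dataset("example_dataset", "https://example.com/oc4ids.json")
except ProcessDatasetError as e:
    print(f"Failed to process dataset: {e}")
```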
@@ -143,8 +160,20 @@ def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
 def process_registry() -> None:
     registered_datasets = fetch_registered_datasets()
     process_deleted_datasets(registered_datasets)
-    for name, url in registered_datasets.items():
-        process_dataset(name, url)
+    errors: list[dict[str, Any]] = []
+    for dataset_id, url in registered_datasets.items():
+        try:
+            process_dataset(dataset_id, url)
+        except Exception as e:
+            logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
+            errors.append(
+                {"dataset_id": dataset_id, "source_url": url, "message": str(e)}
+            )
+    if errors:
+        logger.error(
+            f"Errors while processing registry: {json.dumps(errors, indent=4)}"
+        )
+        send_notification(errors)
     logger.info("Finished processing all datasets")
 
 
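The notifications module itself is not shown in this diff; process_registry only requires that send_notification accept the list of error dicts it builds (each carrying dataset_id, source_url and message). A hypothetical stand-in consistent with that call site:

```python
# Hypothetical sketch of oc4ids_datastore_pipeline/notifications.py — the real
# implementation is not part of this diff; it only needs to accept the error
# dicts built in process_registry.
import logging
from typing import Any

logger = logging.getLogger(__name__)


def send_notification(errors: list[dict[str, Any]]) -> None:
    for error in errors:
        logger.error(
            "Dataset %s (%s) failed: %s",
            error["dataset_id"],
            error["source_url"],
            error["message"],
        )
```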