8
8
from libcoveoc4ids .api import oc4ids_json_output
9
9
10
10
from oc4ids_datastore_pipeline .database import Dataset , save_dataset
11
+ from oc4ids_datastore_pipeline .registry import (
12
+ fetch_registered_datasets ,
13
+ get_license_name_from_url ,
14
+ )
11
15
12
16
logger = logging .getLogger (__name__ )
13
17
14
18
15
- def fetch_registered_datasets () -> dict [str , str ]:
16
- logger .info ("Fetching registered datasets list from registry" )
17
- try :
18
- url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json" # noqa: E501
19
- r = requests .get (url )
20
- r .raise_for_status ()
21
- json_data = r .json ()
22
- registered_datasets = {
23
- key : value ["fields" ]["url" ]["value" ]
24
- for (key , value ) in json_data ["records" ].items ()
25
- }
26
- registered_datasets_count = len (registered_datasets )
27
- logger .info (f"Fetched URLs for { registered_datasets_count } datasets" )
28
- return registered_datasets
29
- except Exception as e :
30
- raise Exception ("Failed to fetch datasets list from registry" , e )
31
-
32
-
33
19
def download_json (url : str ) -> Any :
34
20
logger .info (f"Downloading json from { url } " )
35
21
try :
@@ -42,7 +28,7 @@ def download_json(url: str) -> Any:
42
28
raise Exception ("Download failed" , e )
43
29
44
30
45
- def validate_json (dataset_name : str , json_data : Any ) -> None :
31
+ def validate_json (dataset_name : str , json_data : dict [ str , Any ] ) -> None :
46
32
logger .info (f"Validating dataset { dataset_name } " )
47
33
try :
48
34
validation_result = oc4ids_json_output (json_data = json_data )
@@ -54,26 +40,32 @@ def validate_json(dataset_name: str, json_data: Any) -> None:
54
40
raise Exception ("Validation failed" , e )
55
41
56
42
57
def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
    """Serialize ``json_data`` to ``file_name`` as pretty-printed JSON.

    Any missing parent directories are created first. Returns ``file_name``
    so callers can record where the dataset was written.

    Raises:
        Exception: wrapping the original error if directory creation or the
            write itself fails.
    """
    logger.info(f"Writing dataset to file {file_name}")
    try:
        # os.path.dirname is "" for a bare filename, and makedirs("") raises
        # FileNotFoundError — only create directories when a parent exists.
        parent_dir = os.path.dirname(file_name)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_name, "w") as file:
            json.dump(json_data, file, indent=4)
        logger.info(f"Finished writing to {file_name}")
        return file_name
    except Exception as e:
        raise Exception("Error while writing to JSON file", e)
66
53
67
54
68
55
def save_dataset_metadata(
    dataset_name: str, source_url: str, json_data: dict[str, Any], json_url: str
) -> None:
    """Persist metadata for a processed dataset.

    Extracts the publisher name and licence details from the dataset's own
    JSON document and stores a ``Dataset`` row via ``save_dataset``.

    Args:
        dataset_name: Registry identifier of the dataset (used as its id).
        source_url: URL the dataset was originally downloaded from.
        json_data: The downloaded OC4IDS JSON document.
        json_url: Location the JSON file was written to by the pipeline.
    """
    logger.info(f"Saving metadata for dataset {dataset_name}")
    # `or {}` also guards an explicit `"publisher": null` in the document,
    # which .get("publisher", {}) would pass through as None and crash on.
    publisher_name = (json_data.get("publisher") or {}).get("name", "")
    license_url = json_data.get("license")
    # Only resolve a human-readable licence name when a URL is present.
    license_name = get_license_name_from_url(license_url) if license_url else None
    dataset = Dataset(
        dataset_id=dataset_name,
        source_url=source_url,
        publisher_name=publisher_name,
        license_url=license_url,
        license_name=license_name,
        json_url=json_url,
        updated_at=datetime.datetime.now(datetime.UTC),
    )
    save_dataset(dataset)
@@ -84,14 +76,12 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
84
76
try :
85
77
json_data = download_json (dataset_url )
86
78
validate_json (dataset_name , json_data )
87
- file_name = f"data/{ dataset_name } .json"
88
- write_json_to_file (file_name , json_data )
89
- publisher_name = json_data .get ("publisher" , {}).get ("name" , "" )
79
+ json_url = write_json_to_file (f"data/{ dataset_name } .json" , json_data )
90
80
save_dataset_metadata (
91
81
dataset_name = dataset_name ,
92
82
source_url = dataset_url ,
93
- publisher_name = publisher_name ,
94
- file_name = file_name ,
83
+ json_data = json_data ,
84
+ json_url = json_url ,
95
85
)
96
86
logger .info (f"Processed dataset { dataset_name } " )
97
87
except Exception as e :
0 commit comments