-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_bco_dmo_erddap_geojson.py
More file actions
116 lines (77 loc) · 3.4 KB
/
get_bco_dmo_erddap_geojson.py
File metadata and controls
116 lines (77 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import requests
from bs4 import BeautifulSoup
import utilities.erddap_get_dataset_ids as ds
import utilities.erddap_check_if_ctd as ctd
import utilities.erddap_check_if_ctd_jsonld as ctd_jsonld
import utilities.modify_geojson as geo
def get_dataset_url(dataset_id):
    """Return the BCO-DMO landing-page URL for the given dataset id."""
    return f'http://www.bco-dmo.org/dataset/{dataset_id}'
def get_dataset_soup(dataset_id):
    """Fetch the BCO-DMO landing page for *dataset_id* and return its parsed soup.

    Returns None when the page cannot be reached, returns an HTTP error
    status, or does not advertise a "Data URL:" label (i.e. the dataset
    has no downloadable data).
    """
    url = get_dataset_url(dataset_id)
    try:
        # Bound the request so a dead server cannot hang the crawl forever.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages (404, 500, ...) like network failures;
        # HTTPError is a RequestException subclass, so the handler catches it.
        response.raise_for_status()
        dataset_soup = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException:
        # Can't reach the data page; caller logs/skips on None.
        dataset_soup = None
    # A landing page without a "Data URL:" label has no data set available.
    # (bs4 renamed find_all's `text=` keyword to `string=`.)
    if dataset_soup and not dataset_soup(string='Data URL:'):
        dataset_soup = None
    return dataset_soup
def main():
    """Crawl ERDDAP for CTD dataset ids and write one geoJSON file per dataset.

    For each id: confirm it is a CTD dataset (via JSON-LD metadata), fetch and
    save its geoJSON, pull temporal coverage from the BCO-DMO landing page,
    normalize the geoJSON attributes, and save the modified result. Non-CTD
    ids are appended to the processing log.
    """
    # Log file paths for this run.
    # (The previous delete-log-if-it-exists step is disabled.)
    processing_log = "./geojson_processing_log_tmp.txt"
    geojson_files_log = "./geojson_files_log_tmp.txt"
    # if os.path.exists(processing_log):
    #     os.remove(processing_log)

    # Get list of CTD dataset ids from the ERDDAP page
    dataset_ids = ds.get_ctd_dataset_ids()
    for dataset_id in dataset_ids:
        # dataset_id = '3937'  # handy single-dataset override for debugging
        print(f"Processing dataset id {dataset_id}")
        # Classify via JSON-LD metadata; the older HTML-scrape check and the
        # assume-everything-is-CTD shortcut are kept below for reference.
        # is_ctd = ctd.check_if_ctd(dataset_id, processing_log)
        is_ctd = ctd_jsonld.check_if_ctd(dataset_id, processing_log)
        # is_ctd = True
        if is_ctd:
            # Dataset is CTD: fetch its geoJSON data.
            geojson = geo.get_geojson(dataset_id, processing_log)
            if geojson:
                geo.save_geojson(dataset_id, geojson)
                geo.write_to_geojson_files_log(dataset_id, geojson_files_log)
            else:
                # No geoJSON could be produced; skip to the next dataset id.
                continue
            # Get html soup of the dataset's landing page.
            dataset_soup = get_dataset_soup(dataset_id)
            # Get temporal coverage from the page's JSON-LD
            # (HTML-based extraction kept below for reference).
            if dataset_soup:
                # start_date, end_date = geo.get_start_end_dates(dataset_soup)
                start_date, end_date = geo.get_start_end_dates_from_json_ld(dataset_soup)
            else:
                # Page unreachable or no data URL: leave coverage unknown.
                start_date = None
                end_date = None
            print('start_date', start_date)
            print('end_date', end_date)
            # Modify geoJSON attributes: drop some attributes and attach the
            # dataset id and start/end dates, then de-duplicate features.
            geojson = geo.modify_geojson_attributes(dataset_id, start_date, end_date, geojson)
            geojson = geo.remove_duplicate_features(geojson)
            # Save the cleaned geoJSON to file.
            geo.save_modified_geojson(dataset_id, geojson)
        else:
            # Record non-CTD dataset ids so the skip is auditable later.
            with open(processing_log, 'a+') as f:
                text = f"File is not CTD at dataset id: {dataset_id}\n"
                f.write(text)
# Standard script entry-point guard: run the crawl only when executed directly.
if __name__ == '__main__':
    main()