-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_bco_dmo_erddap_geojson.py
More file actions
116 lines (77 loc) · 3.4 KB
/
get_bco_dmo_erddap_geojson.py
File metadata and controls
116 lines (77 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import requests
from bs4 import BeautifulSoup
import utilities.erddap_get_dataset_ids as ds
import utilities.erddap_check_if_ctd as ctd
import utilities.erddap_check_if_ctd_jsonld as ctd_jsonld
import utilities.modify_geojson as geo
def get_dataset_url(dataset_id):
    """Return the BCO-DMO landing-page URL for the given dataset id."""
    return f'http://www.bco-dmo.org/dataset/{dataset_id}'
def get_dataset_soup(dataset_id):
    """Fetch the BCO-DMO landing page for *dataset_id* and return its parsed soup.

    Returns None when the page cannot be reached, returns an HTTP error
    status, or does not advertise a "Data URL:" label (i.e. the dataset
    has no downloadable data).
    """
    url = get_dataset_url(dataset_id)
    try:
        # Bound the request so a dead server cannot hang the crawl forever.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages (404, 500, ...) like network failures;
        # HTTPError is a RequestException subclass, so the handler catches it.
        response.raise_for_status()
        dataset_soup = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException:
        # Can't reach the data page; caller logs/skips on None.
        dataset_soup = None
    # A landing page without a "Data URL:" label has no data set available.
    # (bs4 renamed find_all's `text=` keyword to `string=`.)
    if dataset_soup and not dataset_soup(string='Data URL:'):
        dataset_soup = None
    return dataset_soup
def main():
    """Crawl ERDDAP for CTD dataset ids and write one geoJSON file per dataset.

    For each id: confirm it is a CTD dataset (via JSON-LD metadata), fetch and
    save its geoJSON, pull temporal coverage from the BCO-DMO landing page,
    normalize the geoJSON attributes, and save the modified result. Non-CTD
    ids are appended to the processing log.
    """
    # Log file paths for this run.
    # (The previous delete-log-if-it-exists step is disabled.)
    processing_log = "./geojson_processing_log_tmp.txt"
    geojson_files_log = "./geojson_files_log_tmp.txt"
    # if os.path.exists(processing_log):
    #     os.remove(processing_log)

    # Get list of CTD dataset ids from the ERDDAP page
    dataset_ids = ds.get_ctd_dataset_ids()
    for dataset_id in dataset_ids:
        # dataset_id = '3937'  # handy single-dataset override for debugging
        print(f"Processing dataset id {dataset_id}")
        # Classify via JSON-LD metadata; the older HTML-scrape check and the
        # assume-everything-is-CTD shortcut are kept below for reference.
        # is_ctd = ctd.check_if_ctd(dataset_id, processing_log)
        is_ctd = ctd_jsonld.check_if_ctd(dataset_id, processing_log)
        # is_ctd = True
        if is_ctd:
            # Dataset is CTD: fetch its geoJSON data.
            geojson = geo.get_geojson(dataset_id, processing_log)
            if geojson:
                geo.save_geojson(dataset_id, geojson)
                geo.write_to_geojson_files_log(dataset_id, geojson_files_log)
            else:
                # No geoJSON could be produced; skip to the next dataset id.
                continue
            # Get html soup of the dataset's landing page.
            dataset_soup = get_dataset_soup(dataset_id)
            # Get temporal coverage from the page's JSON-LD
            # (HTML-based extraction kept below for reference).
            if dataset_soup:
                # start_date, end_date = geo.get_start_end_dates(dataset_soup)
                start_date, end_date = geo.get_start_end_dates_from_json_ld(dataset_soup)
            else:
                # Page unreachable or no data URL: leave coverage unknown.
                start_date = None
                end_date = None
            print('start_date', start_date)
            print('end_date', end_date)
            # Modify geoJSON attributes: drop some attributes and attach the
            # dataset id and start/end dates, then de-duplicate features.
            geojson = geo.modify_geojson_attributes(dataset_id, start_date, end_date, geojson)
            geojson = geo.remove_duplicate_features(geojson)
            # Save the cleaned geoJSON to file.
            geo.save_modified_geojson(dataset_id, geojson)
        else:
            # Record non-CTD dataset ids so the skip is auditable later.
            with open(processing_log, 'a+') as f:
                text = f"File is not CTD at dataset id: {dataset_id}\n"
                f.write(text)
# Standard script entry-point guard: run the crawl only when executed directly.
if __name__ == '__main__':
    main()