generated from NOAA-GFDL/template-repository
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathcompval.py
More file actions
executable file
·199 lines (160 loc) · 9.01 KB
/
compval.py
File metadata and controls
executable file
·199 lines (160 loc) · 9.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
import click
import json
from jsondiff import diff
import pandas as pd
import sys
import os
import re
import math
import logging
from importlib.resources import files
import urllib.request
# Module-level logger routed to stdout so progress and validation errors
# are visible when the script is run as a CLI tool.
logger = logging.getLogger('local')
logger.setLevel(logging.INFO)
logging.basicConfig(stream=sys.stdout)
@click.command()
@click.argument('json_path', nargs=1, required=True)
@click.argument('json_template_path', nargs=1, required=False)
@click.option('--vocab', is_flag=True, default=False, help="Validates catalog vocabulary")
@click.option('-pg', '--proper_generation', is_flag=True, default=False, help="Validates that catalog has been 'properly generated' (No empty columns, reflects template)")
@click.option('-tf', '--test_failure', is_flag=True, default=False, help="Errors are only printed. Program will not exit.")
def main(json_path: str, json_template_path: str, vocab: bool, proper_generation: bool, test_failure: bool):
    ''' The Comprehensive Validator has two types of tests within it:
    Vocabulary (CV) Testing:
    This test validates catalogs against CMIP6 or GFDL controlled vocabularies (CVs) provided by specific JSON schemas for each vocabulary type. Each vocabulary type to be validated will have a raw GitHub URL to its corresponding CV set in the catalog schema's vocabulary section. CMIP6 CVs are found in the WCRP-CMIP/CMIP6_CVs GitHub repository. GFDL CVs are found in the NOAA-GFDL/CMIP6_CVs GitHub repository. The --vocab flag must be used to enable this test.
    Proper generation / Completeness Testing:
    This test ensures catalogs generated by the Catalog Builder tool are minimally valid. This means a few things: the generated catalog JSON file reflects the template it was generated with, the catalog CSV has at least one row of values (not including headers), and each required column exists without any empty values. If a test case is broken or expected to fail, the --test_failure or -tf flag can be used. This flag will print errors instead of raising an exception. This test must be enabled with the -pg or --proper_generation flag.
    USAGE:
    compval.py <json_path> --vocab
    (Validates catalog against CV's defined in vocabulary section of catalog schema)
    compval.py <json_path> --proper_generation
    (Checks that catalog is minimally valid. Uses default catalog template/schema if no template path is given. This default template is located at catalogbuilder/cats/gfdl_template.json)
    compval.py <json_path> <json_template_path> --proper_generation
    (Checks that catalog is minimally valid. Uses given template path to check for reflection.)
    * Vocab and proper generation tests can be run at the same time! * '''
    # Thin CLI wrapper: all validation logic lives in compval() so it can also
    # be called programmatically.
    return compval(json_path, json_template_path, vocab, proper_generation, test_failure)
def compval(json_path, json_template_path, vocab, proper_generation, test_failure):
    """ This method is a comprehensive validator for gfdl data catalogs.

    :param json_path: Path to generated catalog json
    :type json_path: str
    :param json_template_path: Path to catalog template schema
    :type json_template_path: str, optional
    :param vocab: Enables vocabulary validation, defaults to False
    :type vocab: bool
    :param proper_generation: Enables proper_generation/completeness checking, defaults to False
    :type proper_generation: bool
    :param test_failure: Errors are only logged and do not raise any exceptions
    :type test_failure: bool
    :raises IOError: A file is unable to be opened
    :raises ValueError: An error was found during validation (missing/improper values or lack of schema reflection)
    :raises Exception: Used to generally report error during validation
    """
    # Open catalog json (context manager so the handle is always closed;
    # narrow exception types so unrelated bugs are not swallowed).
    try:
        with open(json_path) as f:
            j = json.load(f)
    except (OSError, ValueError) as err:
        raise IOError("Unable to open file. Is this the generated catalog json file?") from err
    # VOCABULARY VALIDATION
    if vocab:
        _validate_vocab(j, test_failure)
    # COMPLETENESS/SCHEMA REFLECTION VALIDATION
    if proper_generation:
        _validate_proper_generation(j, json_template_path, test_failure)
    # Bug fix: this message used to be in an `else` attached to the
    # proper_generation check, so it printed even when --vocab had just run.
    if not vocab and not proper_generation:
        logger.info("No tests ran. Please use either --vocab or -pg/--proper_generation flags for testing")
    return


def _validate_vocab(j, test_failure):
    """Validate catalog CSV values against the controlled vocabularies (CVs)
    referenced by the 'vocabulary' URLs in the catalog json's attributes.

    :param j: parsed catalog json (must contain "catalog_file" and "attributes")
    :param test_failure: when True, errors are logged instead of raised
    :raises IOError: a CV json could not be fetched or parsed
    :raises ValueError: inconsistent values were found (suppressed by test_failure)
    """
    bad_vocab = {}  # bad value -> column name (value is the key so duplicates collapse)
    nan_list = []   # columns in which NaN values were encountered
    urls = {}       # column name -> CV url, reported so users can fix their values
    # Get CSV from JSON and open it
    csv_path = str(j["catalog_file"])
    catalog = pd.read_csv(csv_path)
    # Parse through the JSON and find which CV is needed for each attribute.
    for attribute in j["attributes"]:
        # Skip attributes with no vocabulary. chunk_freq is skipped because
        # its CV is currently broken.
        try:
            if 'chunk_freq' in attribute["column_name"] or not attribute["vocabulary"]:
                continue
        except KeyError:
            logger.warning("Missing vocabulary field in catalog schema")
            # Bug fix: previously execution fell through here and reused the
            # cv_url from the previous iteration (NameError on the first one).
            continue
        cv_url = attribute["vocabulary"]
        column_name = attribute["column_name"]
        logger.info("Validating " + column_name + " vocabulary")
        try:
            with urllib.request.urlopen(cv_url) as f:
                json_data = json.load(f)
        except (OSError, ValueError) as err:
            raise IOError("Unable to open json: " + str(list(attribute.values())[0])) from err
        # CV payloads are either a mapping (the terms are its keys) or a plain
        # list. This will probably break if formatting is different.
        # Bug fix: vocab_list is rebuilt per attribute; it used to accumulate
        # across attributes, so terms valid for one column were silently
        # accepted in every later column.
        cv_entries = json_data[column_name]
        try:
            vocab_list = list(cv_entries.keys())
        except AttributeError:
            vocab_list = list(cv_entries)
        # Look for "bad vocab" in the CSV. NOTE(review): list(attribute.values())[0]
        # is assumed to be the column name (relies on schema key order) -- kept
        # from the original implementation; confirm against the schema layout.
        for _vocab in catalog[list(attribute.values())[0]]:
            if _vocab in vocab_list or _vocab in bad_vocab:
                continue
            if not isinstance(_vocab, str) and math.isnan(_vocab):
                # Bug fix: NaNs were skipped but never recorded, so the
                # warning below could never fire.
                if column_name not in nan_list:
                    nan_list.append(column_name)
                continue
            # Column name has to be the value because we can't have duplicate keys.
            bad_vocab.update({_vocab: column_name})
            # Keep track of the urls users need to correct their bad vocabulary.
            if column_name not in urls:
                urls.update({column_name: cv_url})
    if nan_list:
        logger.warning("WARNING: NaN's found in: " + str(nan_list))
    if bad_vocab:
        last_entry = list(bad_vocab.keys())[-1]
        for entry in bad_vocab:
            # The last entry gets a trailing newline to visually close the report.
            if entry == last_entry:
                logger.error("Inconsistent " + bad_vocab[entry] + " value: " + '"' + entry + '"\n')
            else:
                logger.error("Inconsistent " + bad_vocab[entry] + " value: " + '"' + entry + '"')
        for entry in urls:
            logger.info("Compliant " + entry + " vocabulary can be found here: " + urls[entry])
        if not test_failure:
            raise ValueError("Found inconsistent value(s)")
        logger.warning("Found inconsistent value(s)\n")
    else:
        logger.info("Check passed.")


def _validate_proper_generation(j, json_template_path, test_failure):
    """Check that the catalog was 'properly generated': the json reflects its
    template, the CSV has at least one data row, and every required (groupby)
    column exists with no empty values.

    :param j: parsed catalog json
    :param json_template_path: optional template path; falls back to the
        packaged catalogbuilder/cats/gfdl_template.json
    :param test_failure: when True, errors are logged instead of raised
    :raises ValueError: template mismatch or empty catalog (suppressed by test_failure)
    :raises Exception: one or more column errors found (suppressed by test_failure)
    """
    if json_template_path:
        template_path = json_template_path
    else:
        template_path = files('catalogbuilder').joinpath('cats/gfdl_template.json')
    with open(template_path) as f:
        json_template = json.load(f)
    # Validate JSON against JSON template; catalog_file is expected to differ.
    comp = diff(j, json_template)
    for key in comp.keys():
        if key != 'catalog_file':
            if test_failure:
                logger.warning(key + ' section of JSON does not reflect template')
            else:
                raise ValueError(key + ' section of JSON does not reflect template')
    # Get CSV from JSON and open it
    catalog = pd.read_csv(j["catalog_file"])
    if len(catalog.index) < 1:
        if test_failure:
            logger.warning("Catalog has no values")
        else:
            raise ValueError("Catalog has no values")
    # Required columns come from the aggregation/groupby section of the json.
    req = j["aggregation_control"]["groupby_attrs"]
    # Check the csv headers for required columns/values.
    errors = 0
    for column in req:
        if column not in catalog.columns:
            logger.error("The required column '" + column + "' does not exist in the csv. In other words, there is some inconsistency between the json and the csv file. Please check out info listed under aggregation_control and groupby_attrs in your json file and verify if those columns show up in the csv as well.")
            errors += 1
        elif catalog[column].isnull().values.any():
            logger.error("'" + column + "' contains empty values.")
            errors += 1
    if errors > 0:
        if test_failure:
            logger.warning("Found " + str(errors) + " errors.")
        else:
            raise Exception("Found " + str(errors) + " errors.")
# Script entry point: hand off to the click-decorated CLI command.
if __name__ == '__main__':
    main()