import json
import logging
import os
import sys
import time
from pathlib import Path

import click
import pandas as pd

from catalogbuilder.tests.compval import compval as cv
1012logger = logging .getLogger ('local' )
1113logger .setLevel (logging .INFO )
1214logging .basicConfig (stream = sys .stdout )
1315
1416try :
15- from catalogbuilder .intakebuilder import gfdlcrawler , CSVwriter , configparser , getinfo
17+ from catalogbuilder .intakebuilder import gfdlcrawler , CSVwriter , configparser , getinfo
1618except ModuleNotFoundError :
17- print ("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
18- print ("Attempting again with adjusted sys.path " )
19+ logger . warning ("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
20+ logger . warning ("Attempting again with adjusted sys.path " )
1921 try :
20- sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
22+ sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
2123 except :
22- print ("Unable to adjust sys.path" )
24+ logger . error ("Unable to adjust sys.path" )
2325 #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
2426 try :
25- from intakebuilder import gfdlcrawler , CSVwriter , configparser ,getinfo
26- print (gfdlcrawler .__file__ )
27+
28+ from intakebuilder import gfdlcrawler , CSVwriter , builderconfig , configparser ,getinfo
29+ logger .info (gfdlcrawler .__file__ )
30+
2731 except ModuleNotFoundError :
28- sys .exit ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
32+ logger .error ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?" )
33+ raise ImportError ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?" )
2934
3035package_dir = os .path .dirname (os .path .abspath (__file__ ))
3136#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
3237
3338def create_catalog (input_path = None , output_path = None , config = None , filter_realm = None , filter_freq = None , filter_chunk = None ,
34- overwrite = False , append = False , slow = False , verbose = False ):
39+ overwrite = False , append = False , slow = False , strict = False , verbose = False ):
3540 if verbose :
36- logger .setLevel (logging .DEBUG )
37- logger .info ("Verbose log activated." )
41+ logger .setLevel (logging .DEBUG )
42+ logger .info ("Verbose log activated.\n " )
3843 else :
39- logger .setLevel (logging .INFO )
40- logger .info ("[Mostly] silent log activated" )
44+ logger .info ("[Mostly] silent log activated\n " )
45+ if strict :
46+ logger .warning ("!!!!! STRICT MODE IS ACTIVE. CATALOG GENERATION WILL FAIL IF ERRORS ARE FOUND !!!!!\n " )
47+ time .sleep (10 )
4148 configyaml = None
4249 if (config is not None ):
4350 configyaml = configparser .Config (config ,logger )
@@ -48,38 +55,41 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
4855 else :
4956 # If user does not pass a config, we will use the default config with the same format to avoid special cases
5057 #
51- try :
52- pkg = importlib_resources .files ("catalogbuilder.scripts" )
53- config = pkg / "configs" / "config.yaml"
54- logger .info ("Default config path activated from package resources configs/config.yaml" )
55- except :
56- try :
57- config = os .path .join (package_dir , 'configs/config_default.yaml' )
58- logger .info ("Default config path activated from path configs/config_default.yaml" )
59- except :
60- sys . exit ("Can't locate or read config, check --config " )
61- configyaml = configparser .Config (config ,logger )
62- if (input_path is None ):
58+ try :
59+ pkg = importlib_resources .files ("catalogbuilder.scripts" )
60+ config = pkg / "configs" / "config.yaml"
61+ logger .info ("Default config path activated from package resources configs/config.yaml" )
62+ except :
63+ try :
64+ config = os .path .join (package_dir , 'configs/config_default.yaml' )
65+ logger .info ("Default config path activated from path configs/config_default.yaml" )
66+ except :
67+ raise FileNotFoundError ("Can't locate or read config, check --config " )
68+ configyaml = configparser .Config (config ,logger )
69+ if (input_path is None ):
6370 input_path = configyaml .input_path
64- if (output_path is None ):
71+ if (output_path is None ):
6572 output_path = configyaml .output_path
6673 if ((input_path is None ) or (output_path is None )):
67- sys .exit ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
74+ logger .error ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
75+ raise TypeError ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
6876 if config is None or not configyaml .schema :
69- logger .info ("Default schema: catalogbuilder/cats/gfdl_template.json" )
70- template_path = os .path .join (package_dir , '../cats/gfdl_template.json' )
77+ logger .info ("Default schema: catalogbuilder/cats/gfdl_template.json" )
78+ template_path = os .path .join (package_dir , '../cats/gfdl_template.json' )
7179 else :
72- template_path = configyaml .schema
73- print ("Using schema from config file" , template_path )
80+ template_path = configyaml .schema
81+ logger . info ("Using schema from config file" , template_path )
7482 if not os .path .exists (input_path ):
75- sys .exit ("Input path does not exist. Adjust configuration." )
83+ logger .error ("Input path does not exist. Adjust configuration." )
84+ raise FileNotFoundError ("Input path does not exist. Adjust configuration." )
7685 if not os .path .exists (Path (output_path ).parent .absolute ()):
77- sys .exit ("Output path parent directory does not exist. Adjust configuration." )
86+ logger .error ("Output path parent directory does not exist. Adjust configuration." )
87+ raise ValueError ("Output path parent directory does not exist. Adjust configuration." )
7888 logger .info ("input path: " + input_path )
79- logger .info ( " output path: "+ output_path )
89+ logger .info (" output path: "+ output_path )
8090 project_dir = input_path
8191 csv_path = "{0}.csv" .format (output_path )
82- json_path = "{0}.json" .format (output_path )
92+ json_path = "{0}.json" .format (output_path )
8393
8494 ######### SEARCH FILTERS ###########################
8595
@@ -120,40 +130,49 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
120130 os .makedirs (os .path .dirname (csv_path ), exist_ok = True )
121131 CSVwriter .listdict_to_csv (list_files , headers , csv_path , overwrite , append ,slow )
122132 df = None
123- if (slow == False ) & ('standard_name' in headers ):
124- #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs
125- df = pd .read_csv (os .path .abspath (csv_path ), sep = "," , header = 0 ,index_col = False )
126- list_variable_id = []
127- try :
128- list_variable_id = df ["variable_id" ].unique ().tolist ()
129- except :
130- print ("Having trouble finding 'variable_id'... Be sure to add it to the output_path_template field of your configuration" )
131- try :
132- list_realm = df ["realm" ].unique ().tolist ()
133- except :
134- print ("Having trouble finding 'realm'... Be sure to add it to the output_path_template field of your configuration" )
135- dictVarCF = getinfo .getStandardName (list_variable_id ,list_realm )
136- #print("standard name from look-up table-", dictVarCF)
137- for k , v in dictVarCF .items ():
138- try :
139- var = k .split ("," )[0 ]
140- except ValueError :
141- continue
142- try :
143- realm = k .split ("," )[1 ]
144- except ValueError :
145- continue
146- if (var is not None ) & (realm is not None ):
147- df ['standard_name' ].loc [(df ['variable_id' ] == var ) & (df ['realm' ] == realm ) ] = v
148- #df['standard_name'].loc[(df['variable_id'] == k)] = v
149- if (slow == False ) & ('standard_name' in headers ):
150- if ((df is not None ) & (len (df ) != 0 ) ):
151- with open (csv_path , 'w' ) as csvfile :
152- df .to_csv (csvfile ,index = False )
153-
154- print ("JSON generated at:" , os .path .abspath (json_path ))
155- print ("CSV generated at:" , os .path .abspath (csv_path ))
156- logger .info ("CSV generated at" + os .path .abspath (csv_path ))
133+
134+ if not slow and 'standard_name' in headers :
135+ #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs
136+ df = pd .read_csv (os .path .abspath (csv_path ), sep = "," , header = 0 ,index_col = False )
137+ list_variable_id = []
138+ try :
139+ list_variable_id = df ["variable_id" ].unique ().tolist ()
140+ except :
141+ raise KeyError ("Having trouble finding 'variable_id'... Be sure to add it to the output_path_template field of your configuration" )
142+ try :
143+ list_realm = df ["realm" ].unique ().tolist ()
144+ except :
145+ raise KeyError ("Having trouble finding 'realm'... Be sure to add it to the output_path_template field of your configuration" )
146+ dictVarCF = getinfo .getStandardName (list_variable_id ,list_realm )
147+ #print("standard name from look-up table-", dictVarCF)
148+ for k , v in dictVarCF .items ():
149+ try :
150+ var = k .split ("," )[0 ]
151+ except ValueError :
152+ continue
153+ try :
154+ realm = k .split ("," )[1 ]
155+ except ValueError :
156+ continue
157+ if (var is not None ) & (realm is not None ):
158+ df ['standard_name' ].loc [(df ['variable_id' ] == var ) & (df ['realm' ] == realm ) ] = v
159+ #df['standard_name'].loc[(df['variable_id'] == k)] = v
160+
161+ if ((df is not None ) & (len (df ) != 0 ) ):
162+ with open (csv_path , 'w' ) as csvfile :
163+ df .to_csv (csvfile ,index = False )
164+
165+ # Strict Mode
166+ if strict :
167+ vocab = True
168+ proper_generation = False
169+ test_failure = False
170+
171+ #Validate
172+ cv (json_path ,'' ,vocab , proper_generation , test_failure )
173+
174+ logger .info ("JSON generated at: " + os .path .abspath (json_path ))
175+ logger .info ("CSV generated at: " + os .path .abspath (csv_path ))
157176 return (csv_path ,json_path )
158177
159178#Setting up argument parsing/flags
@@ -170,10 +189,11 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
170189@click .option ('--overwrite' , is_flag = True , default = False )
171190@click .option ('--append' , is_flag = True , default = False )
172191@click .option ('--slow' ,'-s' , is_flag = True , default = False , help = 'This option looks up standard names in netcdf file to fill up the standard name column if its present in the header specs. If standard_name is absent, long_name with space replaced by underscore is utilized' )
192+ @click .option ('--strict' , is_flag = True , default = False , help = 'Strict catalog generation ensures catalogs are compliant with CV standards (as defined in vocabulary section of catalog schema)' )
173193@click .option ('--verbose/--silent' , default = False , is_flag = True ) #default has silent option. Use --verbose for detailed logging
174194
175195def create_catalog_cli (** kwargs ):
176196 return create_catalog (** kwargs )
177-
197+
178198if __name__ == '__main__' :
179199 create_catalog_cli ()
0 commit comments