import json
import logging
import os
import sys
import time
from pathlib import Path

import click
import pandas as pd

from catalogbuilder.tests.compval import compval as cv
1012logger = logging .getLogger ('local' )
1113logger .setLevel (logging .INFO )
1214logging .basicConfig (stream = sys .stdout )
1315
1416try :
15- from catalogbuilder .intakebuilder import gfdlcrawler , CSVwriter , configparser , getinfo
17+ from catalogbuilder .intakebuilder import gfdlcrawler , CSVwriter , configparser , getinfo
1618except ModuleNotFoundError :
17- print ("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
18- print ("Attempting again with adjusted sys.path " )
19+ logger . warning ("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
20+ logger . warning ("Attempting again with adjusted sys.path " )
1921 try :
20- sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
22+ sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
2123 except :
22- print ("Unable to adjust sys.path" )
24+ logger . error ("Unable to adjust sys.path" )
2325 #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
2426 try :
25- from intakebuilder import gfdlcrawler , CSVwriter , configparser ,getinfo
26- print (gfdlcrawler .__file__ )
27+
28+ from intakebuilder import gfdlcrawler , CSVwriter , builderconfig , configparser ,getinfo
29+ logger .info (gfdlcrawler .__file__ )
30+
2731 except ModuleNotFoundError :
28- sys .exit ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? " )
32+ logger .error ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?" )
33+ raise ImportError ("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?" )
2934
3035package_dir = os .path .dirname (os .path .abspath (__file__ ))
3136#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
3237
3338def create_catalog (input_path = None , output_path = None , config = None , filter_realm = None , filter_freq = None , filter_chunk = None ,
34- overwrite = False , append = False , slow = False , verbose = False ):
39+ overwrite = False , append = False , slow = False , strict = False , verbose = False ):
3540 if verbose :
36- logger .setLevel (logging .DEBUG )
37- logger .info ("Verbose log activated." )
41+ logger .setLevel (logging .DEBUG )
42+ logger .info ("Verbose log activated.\n " )
3843 else :
39- logger .setLevel (logging .INFO )
40- logger .info ("[Mostly] silent log activated" )
44+ logger .info ("[Mostly] silent log activated\n " )
45+ if strict :
46+ logger .warning ("!!!!! STRICT MODE IS ACTIVE. CATALOG GENERATION WILL FAIL IF ERRORS ARE FOUND !!!!!\n " )
47+ time .sleep (10 )
4148 configyaml = None
4249 if (config is not None ):
4350 configyaml = configparser .Config (config ,logger )
@@ -48,38 +55,41 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
4855 else :
4956 # If user does not pass a config, we will use the default config with the same format to avoid special cases
5057 #
51- try :
52- pkg = importlib_resources .files ("catalogbuilder.scripts" )
53- config = pkg / "configs" / "config.yaml"
54- logger .info ("Default config path activated from package resources configs/config.yaml" )
55- except :
56- try :
57- config = os .path .join (package_dir , 'configs/config_default.yaml' )
58- logger .info ("Default config path activated from path configs/config_default.yaml" )
59- except :
60- sys . exit ("Can't locate or read config, check --config " )
61- configyaml = configparser .Config (config ,logger )
62- if (input_path is None ):
58+ try :
59+ pkg = importlib_resources .files ("catalogbuilder.scripts" )
60+ config = pkg / "configs" / "config.yaml"
61+ logger .info ("Default config path activated from package resources configs/config.yaml" )
62+ except :
63+ try :
64+ config = os .path .join (package_dir , 'configs/config_default.yaml' )
65+ logger .info ("Default config path activated from path configs/config_default.yaml" )
66+ except :
67+ raise FileNotFoundError ("Can't locate or read config, check --config " )
68+ configyaml = configparser .Config (config ,logger )
69+ if (input_path is None ):
6370 input_path = configyaml .input_path
64- if (output_path is None ):
71+ if (output_path is None ):
6572 output_path = configyaml .output_path
6673 if ((input_path is None ) or (output_path is None )):
67- sys .exit ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
74+ logger .error ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
75+ raise TypeError ("Missing: input_path or output_path. Pass it in the config yaml or as command-line option" )
6876 if config is None or not configyaml .schema :
69- logger .info ("Default schema: catalogbuilder/cats/gfdl_template.json" )
70- template_path = os .path .join (package_dir , '../cats/gfdl_template.json' )
77+ logger .info ("Default schema: catalogbuilder/cats/gfdl_template.json" )
78+ template_path = os .path .join (package_dir , '../cats/gfdl_template.json' )
7179 else :
72- template_path = configyaml .schema
73- print ("Using schema from config file" , template_path )
80+ template_path = configyaml .schema
81+ logger . info ("Using schema from config file" , template_path )
7482 if not os .path .exists (input_path ):
75- sys .exit ("Input path does not exist. Adjust configuration." )
83+ logger .error ("Input path does not exist. Adjust configuration." )
84+ raise FileNotFoundError ("Input path does not exist. Adjust configuration." )
7685 if not os .path .exists (Path (output_path ).parent .absolute ()):
77- sys .exit ("Output path parent directory does not exist. Adjust configuration." )
86+ logger .error ("Output path parent directory does not exist. Adjust configuration." )
87+ raise ValueError ("Output path parent directory does not exist. Adjust configuration." )
7888 logger .info ("input path: " + input_path )
79- logger .info ( " output path: "+ output_path )
89+ logger .info (" output path: "+ output_path )
8090 project_dir = input_path
8191 csv_path = "{0}.csv" .format (output_path )
82- json_path = "{0}.json" .format (output_path )
92+ json_path = "{0}.json" .format (output_path )
8393
8494 ######### SEARCH FILTERS ###########################
8595
@@ -120,40 +130,49 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
120130 os .makedirs (os .path .dirname (csv_path ), exist_ok = True )
121131 CSVwriter .listdict_to_csv (list_files , headers , csv_path , overwrite , append ,slow )
122132 df = None
123- if (slow == False ) & ('standard_name' in headers ):
124- #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs
125- df = pd .read_csv (os .path .abspath (csv_path ), sep = "," , header = 0 ,index_col = False )
126- list_variable_id = []
127- try :
128- list_variable_id = df ["variable_id" ].unique ().tolist ()
129- except :
130- print ("Having trouble finding 'variable_id'... Be sure to add it to the output_path_template field of your configuration" )
131- try :
132- list_realm = df ["realm" ].unique ().tolist ()
133- except :
134- print ("Having trouble finding 'realm'... Be sure to add it to the output_path_template field of your configuration" )
135- dictVarCF = getinfo .getStandardName (list_variable_id ,list_realm )
136- #print("standard name from look-up table-", dictVarCF)
137- for k , v in dictVarCF .items ():
138- try :
139- var = k .split ("," )[0 ]
140- except ValueError :
141- continue
142- try :
143- realm = k .split ("," )[1 ]
144- except ValueError :
145- continue
146- if (var is not None ) & (realm is not None ):
147- df ['standard_name' ].loc [(df ['variable_id' ] == var ) & (df ['realm' ] == realm ) ] = v
148- #df['standard_name'].loc[(df['variable_id'] == k)] = v
149- if (slow == False ) & ('standard_name' in headers ):
150- if ((df is not None ) & (len (df ) != 0 ) ):
151- with open (csv_path , 'w' ) as csvfile :
152- df .to_csv (csvfile ,index = False )
153-
154- print ("JSON generated at:" , os .path .abspath (json_path ))
155- print ("CSV generated at:" , os .path .abspath (csv_path ))
156- logger .info ("CSV generated at" + os .path .abspath (csv_path ))
133+
134+ if not slow and 'standard_name' in headers :
135+ #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs
136+ df = pd .read_csv (os .path .abspath (csv_path ), sep = "," , header = 0 ,index_col = False )
137+ list_variable_id = []
138+ try :
139+ list_variable_id = df ["variable_id" ].unique ().tolist ()
140+ except :
141+ raise KeyError ("Having trouble finding 'variable_id'... Be sure to add it to the output_path_template field of your configuration" )
142+ try :
143+ list_realm = df ["realm" ].unique ().tolist ()
144+ except :
145+ raise KeyError ("Having trouble finding 'realm'... Be sure to add it to the output_path_template field of your configuration" )
146+ dictVarCF = getinfo .getStandardName (list_variable_id ,list_realm )
147+ #print("standard name from look-up table-", dictVarCF)
148+ for k , v in dictVarCF .items ():
149+ try :
150+ var = k .split ("," )[0 ]
151+ except ValueError :
152+ continue
153+ try :
154+ realm = k .split ("," )[1 ]
155+ except ValueError :
156+ continue
157+ if (var is not None ) & (realm is not None ):
158+ df ['standard_name' ].loc [(df ['variable_id' ] == var ) & (df ['realm' ] == realm ) ] = v
159+ #df['standard_name'].loc[(df['variable_id'] == k)] = v
160+
161+ if ((df is not None ) & (len (df ) != 0 ) ):
162+ with open (csv_path , 'w' ) as csvfile :
163+ df .to_csv (csvfile ,index = False )
164+
165+ # Strict Mode
166+ if strict :
167+ vocab = True
168+ proper_generation = False
169+ test_failure = False
170+
171+ #Validate
172+ cv (json_path ,'' ,vocab , proper_generation , test_failure )
173+
174+ logger .info ("JSON generated at: " + os .path .abspath (json_path ))
175+ logger .info ("CSV generated at: " + os .path .abspath (csv_path ))
157176 return (csv_path ,json_path )
158177
159178#Setting up argument parsing/flags
@@ -170,10 +189,11 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
170189@click .option ('--overwrite' , is_flag = True , default = False )
171190@click .option ('--append' , is_flag = True , default = False )
172191@click .option ('--slow' ,'-s' , is_flag = True , default = False , help = 'This option looks up standard names in netcdf file to fill up the standard name column if its present in the header specs. If standard_name is absent, long_name with space replaced by underscore is utilized' )
192+ @click .option ('--strict' , is_flag = True , default = False , help = 'Strict catalog generation ensures catalogs are compliant with CV standards (as defined in vocabulary section of catalog schema)' )
173193@click .option ('--verbose/--silent' , default = False , is_flag = True ) #default has silent option. Use --verbose for detailed logging
174194
175195def create_catalog_cli (** kwargs ):
176196 return create_catalog (** kwargs )
177-
197+
178198if __name__ == '__main__' :
179199 create_catalog_cli ()
0 commit comments