Skip to content

Commit b1c6cd9

Browse files
Ciheim BrownCiheim Brown
authored and committed
Moving files
1 parent 3c9c653 commit b1c6cd9

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
This script generates a sample directory structure with a few PP components and empty netcdf files.
3+
There is a subdirs.py in the scripts directory that provides info as to how the structure can be, while the root structure is defined in the script below.
4+
In the near future, this will allow us to add more patterns and directory structure expectations, components,different chunks, time series/time average etc and expand
5+
the following script and subdirs.py.
6+
7+
Context for this script- We use this in our GitHub Actions workflow to create sample directories on the fly before running the GFDL catalog builder
8+
script dynamically.
9+
10+
Author: A.Radhakrishnan and workflow team
11+
Date: Nov 15, 2023
12+
13+
"""
14+
import os
15+
from pathlib import Path
16+
17+
# Root of the generated sample post-processing (PP) tree; mirrors a real
# GFDL archive path layout (experiment / platform / target / pp).
root_dir = 'archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp'
# Chunk-frequency directory name used for every generated leaf directory.
chunk_freq = '1yr'
19+
20+
def make_sample_data():
    """Create a sample GFDL post-processing (PP) directory tree with empty netcdf files.

    For every realm in ``subdirs.realm``, builds
    ``<root_dir>/<realm>/ts/<freq>/<chunk_freq>/`` and touches one empty
    ``<realm>.<time_range>.<variable>.nc`` file per (variable, time_range)
    pair defined in ``subdirs``.  Existing directories and files are left in
    place, so repeated runs (e.g. re-triggered CI jobs) are harmless.

    Raises:
        ImportError: if ``subdirs`` cannot be found next to this script or
            under the repository ``tests`` directory.
    """
    # subdirs sits next to this script in some layouts and under ../tests in
    # the repository layout used by the GitHub Actions workflow; try the
    # direct import first, then fall back to the tests directory.
    try:
        import subdirs
    except ImportError:
        import sys
        tests_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "tests")
        print(tests_dir)
        sys.path.append(tests_dir)
        import subdirs

    for realm_name in subdirs.realm:
        ts_dir = os.path.join(root_dir, realm_name, 'ts')
        # exist_ok replaces the original try/except FileExistsError dance.
        os.makedirs(ts_dir, exist_ok=True)
        print("Directory", ts_dir, "created (or already existed)")
        for freq_name in subdirs.freq:
            leaf_dir = os.path.join(ts_dir, freq_name, chunk_freq)
            os.makedirs(leaf_dir, exist_ok=True)
            print("Directory", leaf_dir, "created (or already existed)")
            # Touch one empty file per (variable, time-range) combination.
            for var_name in subdirs.vars:
                for time_range in subdirs.time:
                    filename = "{0}.{1}.{2}.nc".format(realm_name, time_range, var_name)
                    target = Path(leaf_dir) / filename
                    try:
                        target.touch()
                        print(str(target) + " created")
                    except OSError:
                        # Best-effort by design: report the failure and keep
                        # going rather than aborting the whole sample build.
                        print("touch failed on", str(target))
62+
63+
def main():
    """Entry point: build the sample PP directory tree."""
    make_sample_data()
65+
66+
# Script entry point: build the sample data tree when run directly
# (e.g. from the GitHub Actions workflow).
if __name__ == "__main__":
    main()

catalogbuilder/scripts/subdirs.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
utils for scripts/make_sample_data.py
3+
the sub-directories and other patterns in the directory strucure and file names needed for testing are defined below, assuming there is a root defined in the caller script
4+
"""
5+
6+
realm = [
7+
'atmos',
8+
'atmos_cmip'
9+
]
10+
freq = [
11+
'monthly',
12+
]
13+
vars = [
14+
'tas',
15+
'uas'
16+
]
17+
time = [
18+
'000101-000112',
19+
'000201-000212'
20+
]

configs/config-mdtf.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Catalog headers:
# The headerlist is the expected column names in your catalog/csv file. This
# is usually determined by the users in conjunction with the ESM collection
# specification standards and the appropriate workflows.
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
             "frequency", "realm", "table_id",
             "member_id", "grid_label", "variable_id",
             "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

# What kind of directory structure to expect?
# For a directory structure like
#   /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the input_path_template is set as follows.
# We put NA in those values that do not match up with any of the expected
# headerlist (CSV columns); otherwise we simply specify the associated header
# name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in
# input_path_template is set to 'source_id'. We make sure this is a valid
# value in headerlist as well. The fourth directory is am5f3b1r0, which does
# not map to an existing header value, so we simply put NA in
# input_path_template for the fourth value.
input_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

# Pattern mapping the dot-separated fields of each data file name to headers.
input_file_template: ['realm','time_range','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding csvfile, jsonfile and logfile settings to the
# builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here
input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)

examples/config-cfname.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Catalog headers:
# The headerlist is the expected column names in your catalog/csv file. This
# is usually determined by the users in conjunction with the ESM collection
# specification standards and the appropriate workflows.
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
             "frequency", "realm", "table_id",
             "member_id", "grid_label", "variable_id",
             "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

# What kind of directory structure to expect?
# For a directory structure like
#   /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the input_path_template is set as follows.
# We put NA in those values that do not match up with any of the expected
# headerlist (CSV columns); otherwise we simply specify the associated header
# name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in
# input_path_template is set to 'source_id'. We make sure this is a valid
# value in headerlist as well. The fourth directory is am5f3b1r0, which does
# not map to an existing header value, so we simply put NA in
# input_path_template for the fourth value.
input_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

# Pattern mapping the dot-separated fields of each data file name to headers.
input_file_template: ['realm','time_range','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding csvfile, jsonfile and logfile settings to the
# builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)

0 commit comments

Comments
 (0)