Skip to content

Commit b1c6cd9

Browse files
Ciheim BrownCiheim Brown
authored and committed
Moving files
1 parent 3c9c653 commit b1c6cd9

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
This script generates a sample directory structure with a few PP components and empty netcdf files.
3+
There is a subdirs.py in the scripts directory that provides info as to how the structure can be, while the root structure is defined in the script below.
4+
In the near future, this will allow us to add more patterns and directory structure expectations, components,different chunks, time series/time average etc and expand
5+
the following script and subdirs.py.
6+
7+
Context for this script- We use this in our GitHub Actions workflow to create sample directories on the fly before running the GFDL catalog builder
8+
script dynamically.
9+
10+
Author: A.Radhakrishnan and workflow team
11+
Date: Nov 15, 2023
12+
13+
"""
14+
import os
15+
from pathlib import Path
16+
17+
# Root of the generated sample post-processing (PP) tree; mirrors a real
# GFDL archive path layout (experiment / platform / target / pp).
root_dir = 'archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp'
# Chunk-frequency directory name used for every generated leaf directory.
chunk_freq = '1yr'
19+
20+
def make_sample_data():
    """Create a sample GFDL post-processing (PP) directory tree with empty netcdf files.

    For every realm in ``subdirs.realm``, builds
    ``<root_dir>/<realm>/ts/<freq>/<chunk_freq>/`` and touches one empty
    ``<realm>.<time_range>.<variable>.nc`` file per (variable, time_range)
    pair defined in ``subdirs``.  Existing directories and files are left in
    place, so repeated runs (e.g. re-triggered CI jobs) are harmless.

    Raises:
        ImportError: if ``subdirs`` cannot be found next to this script or
            under the repository ``tests`` directory.
    """
    # subdirs sits next to this script in some layouts and under ../tests in
    # the repository layout used by the GitHub Actions workflow; try the
    # direct import first, then fall back to the tests directory.
    try:
        import subdirs
    except ImportError:
        import sys
        tests_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "tests")
        print(tests_dir)
        sys.path.append(tests_dir)
        import subdirs

    for realm_name in subdirs.realm:
        ts_dir = os.path.join(root_dir, realm_name, 'ts')
        # exist_ok replaces the original try/except FileExistsError dance.
        os.makedirs(ts_dir, exist_ok=True)
        print("Directory", ts_dir, "created (or already existed)")
        for freq_name in subdirs.freq:
            leaf_dir = os.path.join(ts_dir, freq_name, chunk_freq)
            os.makedirs(leaf_dir, exist_ok=True)
            print("Directory", leaf_dir, "created (or already existed)")
            # Touch one empty file per (variable, time-range) combination.
            for var_name in subdirs.vars:
                for time_range in subdirs.time:
                    filename = "{0}.{1}.{2}.nc".format(realm_name, time_range, var_name)
                    target = Path(leaf_dir) / filename
                    try:
                        target.touch()
                        print(str(target) + " created")
                    except OSError:
                        # Best-effort by design: report the failure and keep
                        # going rather than aborting the whole sample build.
                        print("touch failed on", str(target))
62+
63+
def main():
    """Entry point: build the sample PP directory tree."""
    make_sample_data()
65+
66+
# Script entry point: build the sample data tree when run directly
# (e.g. from the GitHub Actions workflow).
if __name__ == "__main__":
    main()

catalogbuilder/scripts/subdirs.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
utils for scripts/make_sample_data.py
3+
the sub-directories and other patterns in the directory strucure and file names needed for testing are defined below, assuming there is a root defined in the caller script
4+
"""
5+
6+
realm = [
7+
'atmos',
8+
'atmos_cmip'
9+
]
10+
freq = [
11+
'monthly',
12+
]
13+
vars = [
14+
'tas',
15+
'uas'
16+
]
17+
time = [
18+
'000101-000112',
19+
'000201-000212'
20+
]

configs/config-mdtf.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Catalog headers:
# The headerlist is the expected column names in your catalog/csv file. This
# is usually determined by the users in conjunction with the ESM collection
# specification standards and the appropriate workflows.
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
             "frequency", "realm", "table_id",
             "member_id", "grid_label", "variable_id",
             "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

# What kind of directory structure to expect?
# For a directory structure like
#   /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the input_path_template is set as follows.
# We put NA in those values that do not match up with any of the expected
# headerlist (CSV columns); otherwise we simply specify the associated header
# name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in
# input_path_template is set to 'source_id'. We make sure this is a valid
# value in headerlist as well. The fourth directory is am5f3b1r0, which does
# not map to an existing header value, so we simply put NA in
# input_path_template for the fourth value.
input_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

# Pattern mapping the dot-separated fields of each data file name to headers.
input_file_template: ['realm','time_range','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding csvfile, jsonfile and logfile settings to the
# builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here
input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)

examples/config-cfname.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Catalog headers:
# The headerlist is the expected column names in your catalog/csv file. This
# is usually determined by the users in conjunction with the ESM collection
# specification standards and the appropriate workflows.
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
             "frequency", "realm", "table_id",
             "member_id", "grid_label", "variable_id",
             "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

# What kind of directory structure to expect?
# For a directory structure like
#   /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the input_path_template is set as follows.
# We put NA in those values that do not match up with any of the expected
# headerlist (CSV columns); otherwise we simply specify the associated header
# name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in
# input_path_template is set to 'source_id'. We make sure this is a valid
# value in headerlist as well. The fourth directory is am5f3b1r0, which does
# not map to an existing header value, so we simply put NA in
# input_path_template for the fourth value.
input_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

# Pattern mapping the dot-separated fields of each data file name to headers.
input_file_template: ['realm','time_range','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding csvfile, jsonfile and logfile settings to the
# builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)

0 commit comments

Comments
 (0)