Skip to content

Commit aab24d4

Browse files
Ciheim BrownCiheim Brown
authored andcommitted
Adding new pyproject file + code refresh + docs
1 parent 174c935 commit aab24d4

File tree

8 files changed

+198
-133
lines changed

8 files changed

+198
-133
lines changed

MANIFEST.in

Lines changed: 0 additions & 3 deletions
This file was deleted.

catalogbuilder/scripts/gen_intake_gfdl.py

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,13 @@
77
import os
88
from pathlib import Path
99
import logging
10-
from catalogbuilder.tests.compval import compval as cv
10+
from catalogbuilder.scripts.compval import compval as cv
11+
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, configparser, getinfo
1112

1213
logger = logging.getLogger('local')
1314
logger.setLevel(logging.INFO)
1415
logging.basicConfig(stream=sys.stdout)
1516

16-
try:
17-
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, configparser, getinfo
18-
except ModuleNotFoundError:
19-
logger.warning("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
20-
logger.warning("Attempting again with adjusted sys.path ")
21-
try:
22-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23-
except:
24-
logger.error("Unable to adjust sys.path")
25-
#print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
26-
try:
27-
28-
from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser,getinfo
29-
logger.info(gfdlcrawler.__file__)
30-
31-
except ModuleNotFoundError:
32-
logger.error("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?")
33-
raise ImportError("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it?")
34-
3517
package_dir = os.path.dirname(os.path.abspath(__file__))
3618
#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
3719

@@ -45,16 +27,16 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
4527
if strict:
4628
logger.warning("!!!!! STRICT MODE IS ACTIVE. CATALOG GENERATION WILL FAIL IF ERRORS ARE FOUND !!!!!\n")
4729
time.sleep(10)
30+
4831
configyaml = None
49-
if (config is not None):
32+
if config is not None:
5033
configyaml = configparser.Config(config,logger)
51-
if(input_path is None):
34+
if input_path is None:
5235
input_path = configyaml.input_path
53-
if(output_path is None):
36+
if output_path is None:
5437
output_path = configyaml.output_path
5538
else:
5639
# If user does not pass a config, we will use the default config with the same format to avoid special cases
57-
#
5840
try:
5941
pkg = importlib_resources.files("catalogbuilder.scripts")
6042
config = pkg / "configs" / "config.yaml"
@@ -66,25 +48,29 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
6648
except:
6749
raise FileNotFoundError("Can't locate or read config, check --config ")
6850
configyaml = configparser.Config(config,logger)
69-
if(input_path is None):
51+
52+
if input_path is None:
7053
input_path = configyaml.input_path
71-
if(output_path is None):
54+
if output_path is None:
7255
output_path = configyaml.output_path
73-
if((input_path is None) or (output_path is None)):
56+
if input_path is None or output_path is None:
7457
logger.error("Missing: input_path or output_path. Pass it in the config yaml or as command-line option")
7558
raise TypeError("Missing: input_path or output_path. Pass it in the config yaml or as command-line option")
59+
7660
if config is None or not configyaml.schema:
7761
logger.info("Default schema: catalogbuilder/cats/gfdl_template.json")
7862
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
7963
else:
8064
template_path = configyaml.schema
8165
logger.info("Using schema from config file", template_path)
66+
8267
if not os.path.exists(input_path):
8368
logger.error("Input path does not exist. Adjust configuration.")
8469
raise FileNotFoundError("Input path does not exist. Adjust configuration.")
8570
if not os.path.exists(Path(output_path).parent.absolute()):
8671
logger.error("Output path parent directory does not exist. Adjust configuration.")
8772
raise ValueError("Output path parent directory does not exist. Adjust configuration.")
73+
8874
logger.info("input path: "+ input_path)
8975
logger.info("output path: "+ output_path)
9076
project_dir = input_path

doc/developers.rst

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
==============
2+
For developers
3+
==============
4+
5+
The Catalog Builder team welcomes all contributions. If you would like to help develop the package, please follow the steps outlined below.
6+
7+
8+
How to contribute
9+
=================
10+
11+
Set up a clean environment
12+
--------------------------
13+
14+
First, create a new environment for your Catalog Builder development work. The recommended approach is to use a `Python virtual environment (venv) <https://docs.python.org/3/library/venv.html>`_. A conda environment will also work if you prefer.
15+
16+
.. code-block:: console
17+
18+
python3 -m venv /path/to/new/virtual/environment
19+
20+
Then, activate the environment by sourcing the activation script. The command varies by operating system and shell:
21+
22+
* **Linux/macOS (bash/zsh):**
23+
24+
.. code-block:: console
25+
26+
source /path/to/new/virtual/environment/bin/activate
27+
28+
* **Linux/macOS (csh):**
29+
30+
.. code-block:: console
31+
32+
source /path/to/new/virtual/environment/bin/activate.csh
33+
34+
* **Linux/macOS (fish):**
35+
36+
.. code-block:: console
37+
38+
source /path/to/new/virtual/environment/bin/activate.fish
39+
40+
* **Linux/macOS (pwsh):**
41+
42+
.. code-block:: console
43+
44+
/path/to/new/virtual/environment/bin/Activate.ps1
45+
46+
* **Windows (Command Prompt):**
47+
48+
.. code-block:: console
49+
50+
\path\to\new\virtual\environment\Scripts\activate.bat
51+
52+
* **Windows (PowerShell):**
53+
54+
.. code-block:: console
55+
56+
\path\to\new\virtual\environment\Scripts\Activate.ps1
57+
58+
Clone the Catalog Builder source code
59+
-------------------------------------
60+
61+
Clone the `GitHub repository <https://github.com/NOAA-GFDL/CatalogBuilder>`_ using ssh:
62+
63+
.. code-block:: console
64+
65+
git clone git@github.com:NOAA-GFDL/CatalogBuilder.git
66+
67+
or https:
68+
69+
.. code-block:: console
70+
71+
git clone https://github.com/NOAA-GFDL/CatalogBuilder.git
72+
73+
Install the package
74+
-------------------
75+
76+
It is recommended that developers install an `editable <https://setuptools.pypa.io/en/latest/userguide/development_mode.html>`_ Catalog Builder package. This makes development simple as any local changes will immediately be testable. From the root of the repository, run:
77+
78+
.. code-block:: console
79+
80+
pip install -e .

doc/generation.rst

Lines changed: 68 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,63 @@
1+
========================
12
Generating data catalogs
23
========================
34

4-
There are a few ways to use the catalog builder.
5+
There are a few ways to use the catalog builder. This page contains instructions to help you start using the tool.
56

67
Installation
7-
------------
8+
============
89

9-
Recommended approach: Install as a `conda package <https://anaconda.org/NOAA-GFDL/catalogbuilder>`_
10+
You will need to install the Catalog Builder package to begin.
1011

11-
.. code-block:: console
12+
Cloning the repository
13+
----------------------
1214

13-
conda install catalogbuilder -c noaa-gfdl
15+
The current recommended approach for installing the catalog builder is to install the tool as a pip package. You'll need to first clone the `GitHub repository <https://github.com/NOAA-GFDL/CatalogBuilder>`_:
1416

15-
Alternatively, you may clone the `git repository <https://github.com/NOAA-GFDL/CatalogBuilder>`_
16-
and create your conda environment using the `environment.yml <https://github.com/NOAA-GFDL/CatalogBuilder/blob/main/environment.yml>`_ in the git repository.
17+
**With ssh**
1718

1819
.. code-block:: console
1920
20-
git clone https://github.com/NOAA-GFDL/CatalogBuilder
21-
22-
conda env create -f environment_intake.yml
23-
24-
Expected output
25-
---------------
26-
27-
A JSON catalog specification file and a CSV catalog in the specfied output directory with the specified name.
28-
29-
Using conda package
30-
-------------------
21+
git clone git@github.com:NOAA-GFDL/CatalogBuilder.git
3122
32-
**1. Install the package using conda:**
23+
**With https**
3324

3425
.. code-block:: console
3526
36-
conda install catalogbuilder -c noaa-gfdl
37-
38-
If you're trying these steps from GFDL, likely that you may need to do additional things to get it to work. See below
39-
40-
Add these to your ~/.condarc file
41-
42-
whitelist_channels:
43-
- noaa-gfdl
44-
- conda-forge
45-
- anaconda
46-
channels:
47-
- noaa-gfdl
48-
- conda-forge
49-
- anaconda
50-
51-
(and try: conda config --add channels noaa-gfdl conda config --append channels conda-forge)
52-
53-
If you encounter issues "ChecksumMismatchError: Conda detected a mismatch between the expected.." , do the following:
27+
git clone https://github.com/NOAA-GFDL/CatalogBuilder.git
5428
55-
conda config --add pkgs_dirs /local2/home/conda/pkgs
56-
conda config --add envs_dirs /local2/home/conda/envs
29+
Installing the package
30+
----------------------
5731

58-
**2. Add conda environment's site packages to PATH**
59-
60-
See example below.
32+
Now that you have a local copy of the source code, you are able to install the package. From the root of the repository, run:
6133

6234
.. code-block:: console
6335
64-
setenv PATH ${PATH}:${CONDA_PREFIX}/lib/python3.1/site-packages/scripts/
65-
66-
**3. Call the builder**
36+
pip install .
6737
68-
Catalogs are generated by the following command: *gen_intake_gfdl.py <INPUT_PATH> <OUTPUT_PATH>*
38+
.. note::
39+
This installation method is expected to change slightly when the package is uploaded to the Python Package Index (PyPI).
6940

70-
Output path argumment should end with the desired output filename WITHOUT a file ending. See example below.
41+
Alternative approach: Install as a `conda package <https://anaconda.org/NOAA-GFDL/catalogbuilder>`_:
7142

7243
.. code-block:: console
7344
74-
gen_intake_gfdl.py /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp $HOME/catalog
45+
conda install catalogbuilder -c noaa-gfdl
7546
76-
This would create a catalog.csv and catalog.json in the user's home directory.
47+
Configuration
48+
=============
7749

78-
.. image:: _static/ezgif-4-786144c287.gif
79-
:width: 1000px
80-
:alt: Catalog generation demonstration
50+
A template/configuration file is used for all catalog generation.
8151

82-
See `Flags`_ here.
52+
What is a catalog template?
53+
---------------------------
54+
55+
A catalog template is a YAML file defining headerlist, output path template, output file template, and input/output paths.
8356

84-
Using a configuration file
85-
--------------------------
57+
Using a custom template
58+
-----------------------
8659

87-
We recommend the use of a configuration file to provide input to the catalog builder. This is necessary and useful if you want to work with datasets and directories that are *not quite* GFDL post-processed directory oriented.
60+
A default configuration is used for catalog generation unless a custom configuration is provided. We recommend the use of a custom configuration file if you want to work with datasets and directories that are *not quite* GFDL post-processed directory oriented. Configs must be passed to the builder using the ``--config`` flag. See `Flags`_ here.
8861

8962
`Here <https://github.com/NOAA-GFDL/CatalogBuilder/blob/main/catalogbuilder/tests/config-cfname.yaml>`_ is an example configuration file.
9063

@@ -106,9 +79,12 @@ with the ESM collection specification standards and the appropriate workflows.
10679
#Directory structure information
10780
output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
10881
109-
For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
110-
the output_path_template is set as above. We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
111-
simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure this is a valid value in headerlist as well. The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template for the fourth value. We have NA in values that do not match up with any of the expected headerlist (CSV columns), otherwise we simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure this is a valid value in headerlist as well. #The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template for the fourth value.
82+
For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp the output_path_template is set as above.
83+
84+
We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
85+
simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure this is a valid value in headerlist as well. The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply add NA in output_path_template for the fourth value.
86+
87+
We have NA in values that do not match up with any of the expected headerlist (CSV columns), otherwise we simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure this is a valid value in headerlist as well.
11288

11389
.. code-block:: yaml
11490
@@ -121,14 +97,32 @@ simply specify the associated header name in the appropriate place. E.g. The thi
12197
input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
12298
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. This can be an absolute or a relative path
12399
124-
Template
125-
--------
100+
Creating a data catalog
101+
=======================
102+
103+
Using the installed package
104+
---------------------------
105+
106+
Catalogs are generated by the following command: *gen_intake_gfdl.py <INPUT_PATH> <OUTPUT_PATH>*
107+
108+
Output path argument should end with the desired output filename WITHOUT a file ending. See example below.
109+
110+
.. code-block:: console
111+
112+
gen_intake_gfdl.py /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp $HOME/catalog
113+
114+
This would create a catalog.csv and catalog.json in the user's home directory.
115+
116+
.. image:: _static/ezgif-4-786144c287.gif
117+
:width: 1000px
118+
:alt: Catalog generation demonstration
126119

127-
All data catalogs are generated using a template file. This file defines headerlist, output path template, output file template, and input/output paths.
120+
See `Flags`_ here.
128121

129122
From a Python script
130123
---------------------
131124
Do you have a python script or a notebook where you could also include steps to generate a data catalog?
125+
132126
See example `here <https://github.com/NOAA-GFDL/CatalogBuilder/blob/main/catalogbuilder/scripts/gen_intake_gfdl_runner_config.py>`_
133127

134128
Here is another example *with a custom configuration*:
@@ -211,17 +205,10 @@ Refer to this `notebook <https://github.com/aradhakrishnanGFDL/canopy-cats/blob/
211205
.. image:: _static/catalog_generation.png
212206
:alt: Screenshot of a notebook showing catalog generation
213207

214-
215208
Using FRE-CLI (GFDL only)
216209
-------------------------
217210

218-
**1. Activate conda environment**
219-
220-
.. code-block:: console
221-
222-
conda activate /nbhome/fms/conda/envs/fre-cli
223-
224-
**2. Call the builder**
211+
Follow the `fre-cli setup documentation <https://noaa-gfdl.readthedocs.io/projects/fre-cli/en/latest/setup.html>`_ to gain access to fre-cli.
225212

226213
Catalogs are generated by the following command: *fre catalog buildcatalog <INPUT_PATH> <OUTPUT_PATH>*
227214

@@ -234,13 +221,18 @@ Catalogs are generated by the following command: *fre catalog buildcatalog <INPU
234221
235222
See `Flags`_ here.
236223

237-
See `Fre-CLI Documentation here <https://noaa-gfdl.github.io/fre-cli/>`_
224+
See `Fre-CLI Documentation here <https://noaa-gfdl.readthedocs.io/projects/fre-cli/en/latest/>`_
225+
226+
Expected output
227+
---------------
238228

229+
The catalog builder tool generates a JSON catalog specification file and a CSV catalog in the specified output directory with the specified name.
239230

240-
Arguments/Options
241-
_____
231+
Arguments and Options
232+
=====================
242233

243-
**Input/Output paths can be passed directly to catalog builder tool through calling command**
234+
Arguments
235+
---------
244236

245237
All methods of catalog builder generation support direct input/output path passing.
246238

@@ -249,6 +241,8 @@ Input path must be the 1st argument. Output path must be the 2nd.
249241
Ex. gen_intake_gfdl.py /archive/Some.User/input-path ./output_path
250242

251243

244+
Flags
245+
-----
252246
.. Reference `Flags`_.
253247
254248
- --config - Allows for catalogs to be generated with a custom configuration. Requires path to YAML configuration file. (Ex. "--config custom_config.yaml")

0 commit comments

Comments
 (0)