Skip to content

Commit 79f0399

Browse files
downloader script works with hydra and snakemake; needs more accurate API call
1 parent b38702c commit 79f0399

File tree

15 files changed

+306
-108
lines changed

15 files changed

+306
-108
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
logs/*
33
.snakemake/*
44
.DS_Store
5-
sandbox
5+
sandbox
6+
slurm*

conf/config.yaml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,25 @@
11
defaults:
22
- _self_
33
- datapaths: datapaths
4-
#- CDS_API_KEY: $HOME/.cdsapirc
4+
5+
development_mode: false
6+
7+
CDS_API_KEY:
8+
path: "$HOME/.cdsapirc"
9+
510
query:
611
product_type: reanalysis
12+
gadm_file: "https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg"
713
# check precipitation
8-
variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
14+
# variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
15+
variable: ["2m_dewpoint_temperature", "2m_temperature"]
916
year: [2010, 2011]
1017
month: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1118
day: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
1219
time: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
1320

21+
# this may have to be added for the
22+
#levtype: pl
1423
# in the current workflow we can test with a small number of healthsheds
1524
# this bounding box will need to be expanded by ~ 50km (in G's dataset it is 50) or even up to 70 or 80
1625
# we can also experiment with a buffer that follows the coastline precisely by 100KM

conf/datapaths/datapaths.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1-
# if the files are stored in local, then paths are null
21
input: null
3-
output: null
2+
3+
intermediate: null
4+
5+
output: null
6+
7+
testing: null

notes/00_core.ipynb

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,26 @@
5353
"Some utilities are provided to help you with the ERA5 dataset."
5454
]
5555
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": null,
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"#| export\n",
63+
"def describe(\n",
64+
" cfg: DictConfig=None, # Configuration file\n",
65+
" )-> None:\n",
66+
" \"Describe the configuration file used by Hydra for the pipeline\"\n",
67+
" \n",
68+
" if cfg is None:\n",
69+
" print(\"No configuration file provided. Generating default configuration file.\")\n",
70+
" cfg = OmegaConf.create()\n",
71+
" \n",
72+
" print(\"This package fetches ERA5 data. The following is the config file used by Hydra for the pipeline:\\n\")\n",
73+
" print(OmegaConf.to_yaml(cfg))"
74+
]
75+
},
5676
{
5777
"cell_type": "code",
5878
"execution_count": 7,
@@ -76,21 +96,31 @@
7696
},
7797
{
7898
"cell_type": "code",
79-
"execution_count": 8,
99+
"execution_count": null,
80100
"metadata": {},
81101
"outputs": [],
82102
"source": [
83-
"#| export\n",
84-
"def describe(\n",
85-
" cfg: DictConfig=None, # Configuration file\n",
86-
" )-> None:\n",
87-
" \"Describe the configuration file used by Hydra for the pipeline\"\n",
88-
" \n",
89-
" if cfg is None:\n",
90-
" cfg = OmegaConf.create()\n",
103+
"#| exporti\n",
104+
"\n",
105+
"def _create_directory_structure(\n",
106+
" base_path: str, # The base directory where the structure will be created\n",
107+
" structure: dict # A dictionary representing the directory structure\n",
108+
" )->None:\n",
109+
" \"\"\"\n",
110+
" Recursively creates a directory structure from a dictionary.\n",
111+
"\n",
112+
" Args:\n",
113+
" base_path (str): The base directory where the structure will be created.\n",
114+
" structure (dict): A dictionary representing the directory structure.\n",
115+
" \"\"\"\n",
116+
" for folder, substructure in structure.items():\n",
117+
" # Create the current directory\n",
118+
" current_path = os.path.join(base_path, folder)\n",
119+
" os.makedirs(current_path, exist_ok=True)\n",
91120
" \n",
92-
" print(\"This package fetches ERA5 data. The following is the config file used by Hydra for the pipeline:\\n\")\n",
93-
" print(OmegaConf.to_yaml(cfg))"
121+
" # Recursively create subdirectories if substructure is a dictionary\n",
122+
" if isinstance(substructure, dict):\n",
123+
" _create_directory_structure(current_path, substructure)"
94124
]
95125
},
96126
{
@@ -104,32 +134,25 @@
104134
},
105135
{
106136
"cell_type": "code",
107-
"execution_count": 9,
137+
"execution_count": null,
108138
"metadata": {},
109139
"outputs": [],
110140
"source": [
111141
"#| export\n",
112142
"def testAPI(\n",
113143
" cfg: DictConfig=None,\n",
114-
" output_path:str=None,\n",
115-
" dataset:str=\"reanalysis-era5-pressure-levels\",\n",
116-
" remove:bool=True\n",
144+
" dataset:str=\"reanalysis-era5-pressure-levels\"\n",
117145
" )-> bool: \n",
118146
" \n",
147+
" # parse config\n",
148+
" testing=cfg.development_mode\n",
149+
" output_path=here(\"data\") / \"testing\"\n",
150+
"\n",
119151
" print(OmegaConf.to_yaml(cfg))\n",
120152
"\n",
121153
" try:\n",
122154
" client = cdsapi.Client()\n",
123155
"\n",
124-
" # check the path\n",
125-
" if output_path is None:\n",
126-
" output_path = here() / \"data\"\n",
127-
" else:\n",
128-
" output_path = _expand_path(output_path)\n",
129-
"\n",
130-
" if not os.path.exists(output_path):\n",
131-
" os.makedirs(output_path)\n",
132-
"\n",
133156
" # build request\n",
134157
" request = {\n",
135158
" 'product_type': ['reanalysis'],\n",
@@ -142,13 +165,13 @@
142165
" 'data_format': 'grib',\n",
143166
" }\n",
144167
"\n",
145-
" target = output_path / 'download.grib'\n",
168+
" target = output_path / 'test_download.grib'\n",
146169
" \n",
147170
" print(\"Testing API connection by downloading a dummy dataset to {}...\".format(output_path))\n",
148171
"\n",
149172
" client.retrieve(dataset, request, target)\n",
150173
"\n",
151-
" if remove:\n",
174+
" if not testing:\n",
152175
" os.remove(target)\n",
153176
" \n",
154177
" print(\"API connection test successful.\")\n",
@@ -310,9 +333,13 @@
310333
"outputs": [],
311334
"source": [
312335
"#| export\n",
313-
"@hydra.main(version_base=None, config_path=\"../conf\", config_name=\"config\")\n",
336+
"@hydra.main(version_base=None, config_path=\"../../conf\", config_name=\"config\")\n",
314337
"def main(cfg: DictConfig) -> None:\n",
315-
" describe(cfg=cfg)\n",
338+
"\n",
339+
" # Create the directory structure\n",
340+
" _create_directory_structure(here() / \"data\", cfg.datapaths)\n",
341+
"\n",
342+
" # test the api\n",
316343
" testAPI(cfg=cfg)"
317344
]
318345
},

notes/01_download_raw_data.ipynb

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@
4646
"import os\n",
4747
"import hydra\n",
4848
"import cdsapi\n",
49+
"import geopandas as gpd\n",
4950
"from core import _expand_path\n",
50-
"from omegaconf import DictConfig, OmegaConf"
51+
"from pyprojroot import here\n",
52+
"from omegaconf import DictConfig, ListConfig, OmegaConf"
5153
]
5254
},
5355
{
@@ -69,9 +71,22 @@
6971
" print(f\"Missing required key in query. Required keys are {required_keys}\")\n",
7072
" print(\"Query validation failed\")\n",
7173
" raise ValueError(\"Invalid query\")\n",
72-
" \n",
74+
" \n",
75+
" if isinstance(query_body['year'], ListConfig):\n",
76+
" query_body['year'] = [str(x).zfill(2) for x in query_body['year']]\n",
77+
" else:\n",
78+
" query_body['year'] = str(query_body['year'])\n",
79+
" if isinstance(query_body['month'], ListConfig):\n",
80+
" query_body['month'] = [str(x).zfill(2) for x in query_body['month']]\n",
81+
" else:\n",
82+
" query_body['month'] = str(query_body['month']).zfill(2)\n",
83+
" \n",
84+
" if isinstance(query_body['day'], ListConfig):\n",
85+
" query_body['day'] = [str(x).zfill(2) for x in query_body['day']]\n",
7386
" else:\n",
74-
" return query_body"
87+
" query_body['day'] = str(query_body['day']).zfill(2)\n",
88+
"\n",
89+
" return OmegaConf.to_container(query_body, resolve=True)"
7590
]
7691
},
7792
{
@@ -81,7 +96,8 @@
8196
"outputs": [],
8297
"source": [
8398
"#| export\n",
84-
"def fetch_MDG_GADM(\n",
99+
"def fetch_GADM(\n",
100+
" url: str=\"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg\",\n",
85101
" output_file: str=\"gadm41_MDG.gpkg\" # file path to save the GADM data\n",
86102
" )-> str:\n",
87103
" '''\n",
@@ -94,8 +110,8 @@
94110
" print(\"GADM data already exists\")\n",
95111
" return output_file_path\n",
96112
" \n",
97-
" print(\"Fetching GADM bounding box data for Madagascar\")\n",
98-
" os.system(\"curl --output {} https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg\".format(output_file))\n",
113+
" print(\"Fetching GADM bounding box data for region\")\n",
114+
" os.system(\"curl --output {} {}\".format(output_file, url))\n",
99115
" print(\"GADM data fetched\")\n",
100116
" \n",
101117
" return output_file_path"
@@ -108,23 +124,60 @@
108124
"outputs": [],
109125
"source": [
110126
"#| export\n",
111-
"def download(\n",
127+
"\n",
128+
"def create_bounding_box(\n",
129+
" gadm_file: str, \n",
130+
" round_to: int = 1, \n",
131+
" buffer: float = 0.1)->list:\n",
132+
" '''\n",
133+
" Create a bounding box from the GADM data.\n",
134+
"\n",
135+
" This function reads the GADM data from URL and extracts the bounding box of the region.\n",
136+
" '''\n",
137+
"\n",
138+
" ground_shape = gpd.read_file(gadm_file, layer = \"ADM_ADM_0\")\n",
139+
"\n",
140+
" bbox = ground_shape.total_bounds\n",
141+
"\n",
142+
" bbox[0] = round(bbox[0], round_to) - buffer\n",
143+
" bbox[1] = round(bbox[1], round_to) - buffer\n",
144+
" bbox[2] = round(bbox[2], round_to) + buffer\n",
145+
" bbox[3] = round(bbox[3], round_to) + buffer\n",
146+
" \n",
147+
" bbox = [bbox[0], bbox[2], bbox[1], bbox[3]]\n",
148+
"\n",
149+
" return bbox\n"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"metadata": {},
156+
"outputs": [],
157+
"source": [
158+
"#| export\n",
159+
"def download_raw_era5(\n",
112160
" cfg: DictConfig, # hydra configuration file\n",
113-
" output_dir: str = \"data/input/\", # output directory\n",
114-
" dataset: str = \"reanalysis-era5-pressure-levels\", # dataset to download\n",
115-
" testing: bool = False # testing mode\n",
161+
" dataset: str = \"reanalysis-era5-land\", # dataset to download\n",
116162
" )->None:\n",
117163
" '''\n",
118164
" Send the query to the API and download the data\n",
119165
" '''\n",
166+
"\n",
167+
" # parse the cfg\n",
168+
" testing = cfg.development_mode # for testing\n",
169+
" output_dir = here(\"data/input\") # output directory\n",
120170
" \n",
121171
" client = cdsapi.Client()\n",
122172
" \n",
123173
" query = _validate_query(cfg.query)\n",
124174
" \n",
125175
" # Send the query to the client\n",
126176
" if not testing:\n",
127-
" client.retrieve(dataset, query).download(os.path.join(_expand_path(output_dir), \"{}_{}.nc\".format(query.year, query.month)))\n",
177+
" bounds = create_bounding_box(cfg.query['gadm_file'])\n",
178+
" query['area'] = bounds\n",
179+
" del query['gadm_file']\n",
180+
" client.retrieve(dataset, query).download(os.path.join(_expand_path(output_dir), \"{}_{}.nc\".format(query['year'], query['month'])))\n",
128181
" else:\n",
129182
" print(f\"Testing mode. Not downloading data. Query is {query}\")\n",
130183
"\n",
@@ -157,7 +210,7 @@
157210
"with initialize(version_base=None, config_path=\"../conf\"):\n",
158211
" cfg = compose(config_name='config.yaml')\n",
159212
"\n",
160-
"download(cfg, testing=True)"
213+
"download_raw_era5(cfg, testing=True)"
161214
]
162215
},
163216
{
@@ -167,9 +220,9 @@
167220
"outputs": [],
168221
"source": [
169222
"#| export\n",
170-
"@hydra.main(config_path=\"../conf\", config_name=\"config\", version_base=None)\n",
223+
"@hydra.main(config_path=\"../../conf\", config_name=\"config\", version_base=None)\n",
171224
"def main(cfg: DictConfig) -> None:\n",
172-
" download(cfg=cfg)"
225+
" download_raw_era5(cfg=cfg)"
173226
]
174227
},
175228
{
@@ -206,7 +259,15 @@
206259
"name": "python3"
207260
},
208261
"language_info": {
262+
"codemirror_mode": {
263+
"name": "ipython",
264+
"version": 3
265+
},
266+
"file_extension": ".py",
267+
"mimetype": "text/x-python",
209268
"name": "python",
269+
"nbconvert_exporter": "python",
270+
"pygments_lexer": "ipython3",
210271
"version": "3.11.11"
211272
}
212273
},

notes/index.ipynb

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,14 @@
8787
"# make sure era5_sandbox package is installed in development mode\n",
8888
"$ pip install -e .\n",
8989
"\n",
90-
"# make changes under nbs/ directory\n",
91-
"# ...\n",
90+
"# To make changes, go to the \"notes\" directory and edit the notebooks as necessary.\n",
91+
"# Each notebook refers to a module in the era5_sandbox package. Cells are exported to the module\n",
92+
"# when the notebook is saved and you run the following command:\n",
9293
"\n",
93-
"# compile to have changes apply to era5_sandbox\n",
94-
"$ nbdev_prepare\n",
95-
"```"
94+
"$ nbdev_export\n",
95+
"```\n",
96+
"\n",
97+
"For example, to change the functionality of the `testAPI()` function in the testAPI Hydra rule, you would edit the `testAPI` notebook in the `notes` directory (`notes/testAPI.ipynb`), then save that notebook and run `nbdev_export` to update the `core` module in the package."
9698
]
9799
},
98100
{

settings.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ status = 3
3838
user = TinasheMTapera
3939

4040
### Optional ###
41-
requirements = fastcore pyprojroot ipykernel nbdev cdsapi hydra-core ipykernel
41+
requirements = fastcore pyprojroot ipykernel nbdev cdsapi hydra-core ipykernel snakemake geopandas xarray matplotlib cartopy
4242
# dev_requirements =
4343
# console_scripts = src
4444
# conda_user =

0 commit comments

Comments
 (0)