Skip to content

Commit 79f0399

Browse files
downloader script works with hydra and snakemake; needs more accurate API call
1 parent b38702c commit 79f0399

File tree

15 files changed

+306
-108
lines changed

15 files changed

+306
-108
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
logs/*
33
.snakemake/*
44
.DS_Store
5-
sandbox
5+
sandbox
6+
slurm*

conf/config.yaml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,25 @@
11
defaults:
22
- _self_
33
- datapaths: datapaths
4-
#- CDS_API_KEY: $HOME/.cdsapirc
4+
5+
development_mode: false
6+
7+
CDS_API_KEY:
8+
path: "$HOME/.cdsapirc"
9+
510
query:
611
product_type: reanalysis
12+
gadm_file: "https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg"
713
# check precipitation
8-
variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
14+
# variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
15+
variable: ["2m_dewpoint_temperature", "2m_temperature"]
916
year: [2010, 2011]
1017
month: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1118
day: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
1219
time: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
1320

21+
# this may have to be added for the
22+
#levtype: pl
1423
# in the current workflow we can test with a small number of healthsheds
1524
# this bounding box will need to be expanded by ~ 50km (in G's dataset it is 50) or even up to 70 or 80
1625
# we can also experiment with a buffer that follows the coastline precisely by 100KM

conf/datapaths/datapaths.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1-
# if the files are stored in local, then paths are null
21
input: null
3-
output: null
2+
3+
intermediate: null
4+
5+
output: null
6+
7+
testing: null

notes/00_core.ipynb

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,26 @@
5353
"Some utilities are provided to help you with the ERA5 dataset."
5454
]
5555
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": null,
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"#| export\n",
63+
"def describe(\n",
64+
" cfg: DictConfig=None, # Configuration file\n",
65+
" )-> None:\n",
66+
" \"Describe the configuration file used by Hydra for the pipeline\"\n",
67+
" \n",
68+
" if cfg is None:\n",
69+
" print(\"No configuration file provided. Generating default configuration file.\")\n",
70+
" cfg = OmegaConf.create()\n",
71+
" \n",
72+
" print(\"This package fetches ERA5 data. The following is the config file used by Hydra for the pipeline:\\n\")\n",
73+
" print(OmegaConf.to_yaml(cfg))"
74+
]
75+
},
5676
{
5777
"cell_type": "code",
5878
"execution_count": 7,
@@ -76,21 +96,31 @@
7696
},
7797
{
7898
"cell_type": "code",
79-
"execution_count": 8,
99+
"execution_count": null,
80100
"metadata": {},
81101
"outputs": [],
82102
"source": [
83-
"#| export\n",
84-
"def describe(\n",
85-
" cfg: DictConfig=None, # Configuration file\n",
86-
" )-> None:\n",
87-
" \"Describe the configuration file used by Hydra for the pipeline\"\n",
88-
" \n",
89-
" if cfg is None:\n",
90-
" cfg = OmegaConf.create()\n",
103+
"#| exporti\n",
104+
"\n",
105+
"def _create_directory_structure(\n",
106+
" base_path: str, # The base directory where the structure will be created\n",
107+
" structure: dict # A dictionary representing the directory structure\n",
108+
" )->None:\n",
109+
" \"\"\"\n",
110+
" Recursively creates a directory structure from a dictionary.\n",
111+
"\n",
112+
" Args:\n",
113+
" base_path (str): The base directory where the structure will be created.\n",
114+
" structure (dict): A dictionary representing the directory structure.\n",
115+
" \"\"\"\n",
116+
" for folder, substructure in structure.items():\n",
117+
" # Create the current directory\n",
118+
" current_path = os.path.join(base_path, folder)\n",
119+
" os.makedirs(current_path, exist_ok=True)\n",
91120
" \n",
92-
" print(\"This package fetches ERA5 data. The following is the config file used by Hydra for the pipeline:\\n\")\n",
93-
" print(OmegaConf.to_yaml(cfg))"
121+
" # Recursively create subdirectories if substructure is a dictionary\n",
122+
" if isinstance(substructure, dict):\n",
123+
" _create_directory_structure(current_path, substructure)"
94124
]
95125
},
96126
{
@@ -104,32 +134,25 @@
104134
},
105135
{
106136
"cell_type": "code",
107-
"execution_count": 9,
137+
"execution_count": null,
108138
"metadata": {},
109139
"outputs": [],
110140
"source": [
111141
"#| export\n",
112142
"def testAPI(\n",
113143
" cfg: DictConfig=None,\n",
114-
" output_path:str=None,\n",
115-
" dataset:str=\"reanalysis-era5-pressure-levels\",\n",
116-
" remove:bool=True\n",
144+
" dataset:str=\"reanalysis-era5-pressure-levels\"\n",
117145
" )-> bool: \n",
118146
" \n",
147+
" # parse config\n",
148+
" testing=cfg.development_mode\n",
149+
" output_path=here(\"data\") / \"testing\"\n",
150+
"\n",
119151
" print(OmegaConf.to_yaml(cfg))\n",
120152
"\n",
121153
" try:\n",
122154
" client = cdsapi.Client()\n",
123155
"\n",
124-
" # check the path\n",
125-
" if output_path is None:\n",
126-
" output_path = here() / \"data\"\n",
127-
" else:\n",
128-
" output_path = _expand_path(output_path)\n",
129-
"\n",
130-
" if not os.path.exists(output_path):\n",
131-
" os.makedirs(output_path)\n",
132-
"\n",
133156
" # build request\n",
134157
" request = {\n",
135158
" 'product_type': ['reanalysis'],\n",
@@ -142,13 +165,13 @@
142165
" 'data_format': 'grib',\n",
143166
" }\n",
144167
"\n",
145-
" target = output_path / 'download.grib'\n",
168+
" target = output_path / 'test_download.grib'\n",
146169
" \n",
147170
" print(\"Testing API connection by downloading a dummy dataset to {}...\".format(output_path))\n",
148171
"\n",
149172
" client.retrieve(dataset, request, target)\n",
150173
"\n",
151-
" if remove:\n",
174+
" if not testing:\n",
152175
" os.remove(target)\n",
153176
" \n",
154177
" print(\"API connection test successful.\")\n",
@@ -310,9 +333,13 @@
310333
"outputs": [],
311334
"source": [
312335
"#| export\n",
313-
"@hydra.main(version_base=None, config_path=\"../conf\", config_name=\"config\")\n",
336+
"@hydra.main(version_base=None, config_path=\"../../conf\", config_name=\"config\")\n",
314337
"def main(cfg: DictConfig) -> None:\n",
315-
" describe(cfg=cfg)\n",
338+
"\n",
339+
" # Create the directory structure\n",
340+
" _create_directory_structure(here() / \"data\", cfg.datapaths)\n",
341+
"\n",
342+
" # test the api\n",
316343
" testAPI(cfg=cfg)"
317344
]
318345
},

notes/01_download_raw_data.ipynb

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@
4646
"import os\n",
4747
"import hydra\n",
4848
"import cdsapi\n",
49+
"import geopandas as gpd\n",
4950
"from core import _expand_path\n",
50-
"from omegaconf import DictConfig, OmegaConf"
51+
"from pyprojroot import here\n",
52+
"from omegaconf import DictConfig, ListConfig, OmegaConf"
5153
]
5254
},
5355
{
@@ -69,9 +71,22 @@
6971
" print(f\"Missing required key in query. Required keys are {required_keys}\")\n",
7072
" print(\"Query validation failed\")\n",
7173
" raise ValueError(\"Invalid query\")\n",
72-
" \n",
74+
" \n",
75+
" if isinstance(query_body['year'], ListConfig):\n",
76+
" query_body['year'] = [str(x).zfill(2) for x in query_body['year']]\n",
77+
" else:\n",
78+
" query_body['year'] = str(query_body['year'])\n",
79+
" if isinstance(query_body['month'], ListConfig):\n",
80+
" query_body['month'] = [str(x).zfill(2) for x in query_body['month']]\n",
81+
" else:\n",
82+
" query_body['month'] = str(query_body['month']).zfill(2)\n",
83+
" \n",
84+
" if isinstance(query_body['day'], ListConfig):\n",
85+
" query_body['day'] = [str(x).zfill(2) for x in query_body['day']]\n",
7386
" else:\n",
74-
" return query_body"
87+
" query_body['day'] = str(query_body['day']).zfill(2)\n",
88+
"\n",
89+
" return OmegaConf.to_container(query_body, resolve=True)"
7590
]
7691
},
7792
{
@@ -81,7 +96,8 @@
8196
"outputs": [],
8297
"source": [
8398
"#| export\n",
84-
"def fetch_MDG_GADM(\n",
99+
"def fetch_GADM(\n",
100+
" url: str=\"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg\",\n",
85101
" output_file: str=\"gadm41_MDG.gpkg\" # file path to save the GADM data\n",
86102
" )-> str:\n",
87103
" '''\n",
@@ -94,8 +110,8 @@
94110
" print(\"GADM data already exists\")\n",
95111
" return output_file_path\n",
96112
" \n",
97-
" print(\"Fetching GADM bounding box data for Madagascar\")\n",
98-
" os.system(\"curl --output {} https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_MDG.gpkg\".format(output_file))\n",
113+
" print(\"Fetching GADM bounding box data for region\")\n",
114+
" os.system(\"curl --output {} {}\".format(output_file, url))\n",
99115
" print(\"GADM data fetched\")\n",
100116
" \n",
101117
" return output_file_path"
@@ -108,23 +124,60 @@
108124
"outputs": [],
109125
"source": [
110126
"#| export\n",
111-
"def download(\n",
127+
"\n",
128+
"def create_bounding_box(\n",
129+
" gadm_file: str, \n",
130+
" round_to: int = 1, \n",
131+
" buffer: float = 0.1)->list:\n",
132+
" '''\n",
133+
" Create a bounding box from the GADM data.\n",
134+
"\n",
135+
" This function reads the GADM data from URL and extracts the bounding box of the region.\n",
136+
" '''\n",
137+
"\n",
138+
" ground_shape = gpd.read_file(gadm_file, layer = \"ADM_ADM_0\")\n",
139+
"\n",
140+
" bbox = ground_shape.total_bounds\n",
141+
"\n",
142+
" bbox[0] = round(bbox[0], round_to) - buffer\n",
143+
" bbox[1] = round(bbox[1], round_to) - buffer\n",
144+
" bbox[2] = round(bbox[2], round_to) + buffer\n",
145+
" bbox[3] = round(bbox[3], round_to) + buffer\n",
146+
" \n",
147+
" bbox = [bbox[0], bbox[2], bbox[1], bbox[3]]\n",
148+
"\n",
149+
" return bbox\n"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"metadata": {},
156+
"outputs": [],
157+
"source": [
158+
"#| export\n",
159+
"def download_raw_era5(\n",
112160
" cfg: DictConfig, # hydra configuration file\n",
113-
" output_dir: str = \"data/input/\", # output directory\n",
114-
" dataset: str = \"reanalysis-era5-pressure-levels\", # dataset to download\n",
115-
" testing: bool = False # testing mode\n",
161+
" dataset: str = \"reanalysis-era5-land\", # dataset to download\n",
116162
" )->None:\n",
117163
" '''\n",
118164
" Send the query to the API and download the data\n",
119165
" '''\n",
166+
"\n",
167+
" # parse the cfg\n",
168+
" testing = cfg.development_mode # for testing\n",
169+
" output_dir = here(\"data/input\") # output directory\n",
120170
" \n",
121171
" client = cdsapi.Client()\n",
122172
" \n",
123173
" query = _validate_query(cfg.query)\n",
124174
" \n",
125175
" # Send the query to the client\n",
126176
" if not testing:\n",
127-
" client.retrieve(dataset, query).download(os.path.join(_expand_path(output_dir), \"{}_{}.nc\".format(query.year, query.month)))\n",
177+
" bounds = create_bounding_box(cfg.query['gadm_file'])\n",
178+
" query['area'] = bounds\n",
179+
" del query['gadm_file']\n",
180+
" client.retrieve(dataset, query).download(os.path.join(_expand_path(output_dir), \"{}_{}.nc\".format(query['year'], query['month'])))\n",
128181
" else:\n",
129182
" print(f\"Testing mode. Not downloading data. Query is {query}\")\n",
130183
"\n",
@@ -157,7 +210,7 @@
157210
"with initialize(version_base=None, config_path=\"../conf\"):\n",
158211
" cfg = compose(config_name='config.yaml')\n",
159212
"\n",
160-
"download(cfg, testing=True)"
213+
"download_raw_era5(cfg, testing=True)"
161214
]
162215
},
163216
{
@@ -167,9 +220,9 @@
167220
"outputs": [],
168221
"source": [
169222
"#| export\n",
170-
"@hydra.main(config_path=\"../conf\", config_name=\"config\", version_base=None)\n",
223+
"@hydra.main(config_path=\"../../conf\", config_name=\"config\", version_base=None)\n",
171224
"def main(cfg: DictConfig) -> None:\n",
172-
" download(cfg=cfg)"
225+
" download_raw_era5(cfg=cfg)"
173226
]
174227
},
175228
{
@@ -206,7 +259,15 @@
206259
"name": "python3"
207260
},
208261
"language_info": {
262+
"codemirror_mode": {
263+
"name": "ipython",
264+
"version": 3
265+
},
266+
"file_extension": ".py",
267+
"mimetype": "text/x-python",
209268
"name": "python",
269+
"nbconvert_exporter": "python",
270+
"pygments_lexer": "ipython3",
210271
"version": "3.11.11"
211272
}
212273
},

notes/index.ipynb

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,14 @@
8787
"# make sure era5_sandbox package is installed in development mode\n",
8888
"$ pip install -e .\n",
8989
"\n",
90-
"# make changes under nbs/ directory\n",
91-
"# ...\n",
90+
"# To make changes, go to the \"notes\" directory and edit the notebooks as necessary.\n",
91+
"# Each notebook refers to a module in the era5_sandbox package. Cells are exported to the module\n",
92+
"# when the notebook is saved and you run the following command:\n",
9293
"\n",
93-
"# compile to have changes apply to era5_sandbox\n",
94-
"$ nbdev_prepare\n",
95-
"```"
94+
"$ nbdev_export\n",
95+
"```\n",
96+
"\n",
97+
"For example, to change the functionality of the `testAPI()` function in the testAPI Hydra rule, you would edit the `testAPI` notebook in the `notes` directory (`notes/testAPI.ipynb`), then save that notebook and run `nbdev_export` to update the `core` module in the package."
9698
]
9799
},
98100
{

settings.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ status = 3
3838
user = TinasheMTapera
3939

4040
### Optional ###
41-
requirements = fastcore pyprojroot ipykernel nbdev cdsapi hydra-core ipykernel
41+
requirements = fastcore pyprojroot ipykernel nbdev cdsapi hydra-core ipykernel snakemake geopandas xarray matplotlib cartopy
4242
# dev_requirements =
4343
# console_scripts = src
4444
# conda_user =

0 commit comments

Comments
 (0)