diff --git a/README.md b/README.md
index 77c417d..ff81e4a 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,66 @@
 # drb_gridmet_tools
-Repository with functions to aggregate [gridmet climate raster data](https://www.climatologylab.org/gridmet.html) from pixel grid to vector aoi (hru polygon grid, polylines). This repository heavily relies on [grd2shp_xagg](https://github.com/rmcd-mscb/grd2shp_xagg) library, by [rmcd-mscb](https://github.com/rmcd-mscb)
+Repository with functions to aggregate [gridmet climate raster data](https://www.climatologylab.org/gridmet.html) from a pixel grid to a vector AOI (HRU polygon grid, polylines). This repository relies heavily on the [grd2shp_xagg](https://github.com/rmcd-mscb/grd2shp_xagg) library by [rmcd-mscb](https://github.com/rmcd-mscb), which in turn relies on [xagg](https://github.com/ks905383/xagg).
+
+# Accessing the re-gridded files
+## On Caldera
+On Caldera, re-gridded file paths are structured like this: `/caldera/projects/usgs/water/impd/pump/gridmet/drb_gridmet_tools/drb-gridmet/{fabric}/{run_date}/drb_climate_{run_date}.nc`. For example: `/caldera/projects/usgs/water/impd/pump/gridmet/drb_gridmet_tools/drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.nc`.
-## Selected gridMET variables:
+## On S3
+As of 6/2022, the results of the workflow are, by default, also stored on S3 in the `drb-gridmet` bucket. For example, `s3://drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.nc`.
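+
+For a quick look at either copy, you can open the file with `xarray`. A minimal sketch (it assumes `xarray` is installed, plus `s3fs` and `h5netcdf` for the S3 route, and that you have read access to the bucket):
+
+```
+import xarray as xr
+import s3fs
+
+# local (Caldera) copy
+ds = xr.open_dataset(
+    "/caldera/projects/usgs/water/impd/pump/gridmet/drb_gridmet_tools/"
+    "drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.nc"
+)
+print(ds.data_vars)
+
+# S3 copy; .load() reads the data into memory before the file object closes
+fs = s3fs.S3FileSystem()
+with fs.open("drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.nc") as f:
+    ds_s3 = xr.open_dataset(f, engine="h5netcdf").load()
+```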
+
+# Running re-gridding for the Delaware River Basin via Snakemake
+The re-gridding process for the Delaware River Basin has been run on USGS's Tallgrass via Singularity. It has been run for the National Hydrologic Model (NHM) fabric and the National Hydrography Dataset (NHD) fabric. It should also be runnable via Docker, but that has not been tried yet. Instructions for running and modifying the pipeline are below.
+
+_Parallelization in Snakemake_
+The Snakemake workflow parallelizes the re-gridding of the 8 gridMET variables. As long as you provide at least 8 cores, these tasks will run in parallel.
+
+_S3_
+By default, the results of the workflow are stored locally and on S3 in the `drb-gridmet` bucket. If you would like to store the resulting files locally only, change the `use_S3` option in the config file to `False`.
+
+NOTE: Although this workflow was first run on DRB catchments, it should be able to be run on any set of polygons in the conterminous US. This, however, has not been tested.
+
+## Run via Singularity
+To run the workflow with Singularity, you can either use the image already on Caldera via Tallgrass, or pull the image into a new directory.
+
+### Option A. Use the existing image and cloned repo
+1. Move to the correct directory
+
+```
+cd /caldera/projects/usgs/water/impd/pump/gridmet/drb_gridmet_tools/
+```
+
+2. Decide which fabric you want to use. You will use a different config file depending on which fabric you choose: `config_nhm.yml` for the NHM fabric and `config_nhd.yml` for the NHD fabric.
+3. [Optional] Edit the options in the config file
+4. Run the workflow
+
+You can run the workflow on Tallgrass either in batch mode or interactively. Substitute your HPC account (which may not be `iidd`) and the desired config file (which may not be `config_nhd.yml`).
+
+You can run the workflow via `sbatch`:
+```
+sbatch -A iidd slurm/launch_snakemake.slurm config_nhd.yml
+```
+It may be helpful instead to run it interactively:
+
+```
+salloc -N 1 -n 8 -t 10:00:00 -p cpu -A iidd
+module load singularity
+singularity exec gridmet-agg_v0.3.sif /opt/conda/bin/snakemake -j --configfile config_nhd.yml
+```
+
+### Option B. Executing in a different directory
+1. Clone the repo and move to the `drb_gridmet_tools` directory
+```
+git clone git@github.com:USGS-R/drb_gridmet_tools.git
+cd drb_gridmet_tools
+```
+2. Pull the Docker image down into a Singularity image file
+```
+singularity pull docker://jsadler2/gridmet-agg:v0.3
+```
+3. Follow steps 2-4 from Option A.
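+
+If you want to try the Docker route (noted above as untested), the invocation would presumably look something like the following hypothetical sketch, which mounts the cloned repo as the working directory:
+
+```
+docker pull jsadler2/gridmet-agg:v0.3
+docker run --rm -v $(pwd):/work -w /work jsadler2/gridmet-agg:v0.3 \
+    /opt/conda/bin/snakemake -j 8 --configfile config_nhd.yml
+```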
+
+# Selected gridMET variables:
 tmmx:
 * Description: Daily Maximum Temperature (2m)\
@@ -38,8 +95,4 @@ sph:
 * Units: kg/kg
 
-## Running re-gridding for the Delaware River Basin
-
-`gridmet_split_script.py` processes the gridmet raster dataset values to the scale of the input multi-polygon shapefile.
-`gridmet_aggregation_PRMS.py` processes the output of `gridmet_splot_script.py` and aggregates the PRMS_segid scale calculating an area weighted average.
diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..7c34e69
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,167 @@
+import os
+import geopandas as gpd
+import xarray as xr
+from snakemake.remote.S3 import RemoteProvider as S3RemoteProvider
+from gridmet_split_script import get_gridmet_datasets, create_weightmap, g2shp_regridding
+from gridmet_aggregation_PRMS import ncdf_to_gdf, gridmet_prms_area_avg_agg
+import requests
+from datetime import datetime
+
+todays_date = datetime.today().strftime('%Y_%m_%d')
+
+final_files = []
+
+nc_file_path = "drb-gridmet/{fabric_id}/{todays_date}/{run_prefix}_climate_{todays_date}.nc"
+final_nc_file_path = nc_file_path.format(fabric_id=config['fabric_id'],
+                                         todays_date=todays_date,
+                                         run_prefix=config['run_prefix'])
+
+seg_file_path = "drb-gridmet/{fabric_id}/{todays_date}/{run_prefix}_climate_{todays_date}_segments.csv"
+final_seg_file_path = seg_file_path.format(fabric_id=config['fabric_id'],
+                                           todays_date=todays_date,
+                                           run_prefix=config['run_prefix'])
+
+zarr_path = "drb-gridmet/{fabric_id}/{todays_date}/{run_prefix}_climate_{todays_date}.zarr.ind"
+final_zarr_path = zarr_path.format(fabric_id=config['fabric_id'],
+                                   todays_date=todays_date,
+                                   run_prefix=config['run_prefix'])
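+
+# Worked example of the path templating above: with fabric_id 'nhm',
+# run_prefix 'drb', and todays_date '2022_06_14', final_nc_file_path is
+# 'drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.nc', final_seg_file_path is
+# 'drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14_segments.csv', and
+# final_zarr_path is 'drb-gridmet/nhm/2022_06_14/drb_climate_2022_06_14.zarr.ind'.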
"https://github.com/USGS-R/drb-network-prep/blob/main/1_fetch/out/NHDPlusv2_catchments.gpkg?raw=true" + +fabric_id: 'nhd' + + +id_col: "COMID" + +# feel free to change these +data_vars: ['tmmx', 'tmmn', 'pr', 'srad', 'vs','rmax','rmin','sph'] + +use_S3: True + +make_zarr: True + +run_prefix: "drb" + +# if not specified, data will be processed from 1979-01-01 +#start_date: "2022-05-09" + +# if not specified, data will be processed to today's date +#end_date: "2022-06-13" + diff --git a/config_nhm.yml b/config_nhm.yml new file mode 100644 index 0000000..2379f9b --- /dev/null +++ b/config_nhm.yml @@ -0,0 +1,21 @@ +# probably don't change these +catchment_file_path: "https://github.com/USGS-R/drb-network-prep/blob/940073e8d77c911b6fb9dc4e3657aeab1162a158/2_process/out/GFv1_catchments_edited.gpkg?raw=true" + +fabric_id: 'nhm' + +id_col: "PRMS_segid" + +# feel free to change these +data_vars: ['tmmx', 'tmmn', 'pr', 'srad', 'vs','rmax','rmin','sph'] + +use_S3: True + +make_zarr: True + +run_prefix: "drb" + +# leave blank if you want to run from 1979-01-01 +#start_date: "2022-05-09" + +# leave blank if you want to run through current date +#end_date: "2022-06-13" diff --git a/data/.DS_Store b/data/.DS_Store deleted file mode 100644 index 1a65eb4..0000000 Binary files a/data/.DS_Store and /dev/null differ diff --git a/data/GFv1_catchments_edited.gpkg.zip b/data/GFv1_catchments_edited.gpkg.zip deleted file mode 100644 index 15dbb64..0000000 Binary files a/data/GFv1_catchments_edited.gpkg.zip and /dev/null differ diff --git a/data/PRMS_catchments_4326.zip b/data/PRMS_catchments_4326.zip deleted file mode 100644 index 587a00e..0000000 Binary files a/data/PRMS_catchments_4326.zip and /dev/null differ diff --git a/data/nhru_01.zip b/data/nhru_01.zip deleted file mode 100644 index c81d7f2..0000000 Binary files a/data/nhru_01.zip and /dev/null differ diff --git a/data/nhru_02.zip b/data/nhru_02.zip deleted file mode 100644 index 914c8db..0000000 Binary files a/data/nhru_02.zip and /dev/null differ diff --git a/gridmet_aggregation_PRMS.py b/gridmet_aggregation_PRMS.py index 049c0a3..3ca59f8 100644 --- a/gridmet_aggregation_PRMS.py +++ b/gridmet_aggregation_PRMS.py @@ -5,7 +5,7 @@ import time # ncdf_to_gdf() function converts ncdf to dataset and merged with shpfile information (geometry + area) -def ncdf_to_gdf(ncdf_path, shp, left_on = 'geomid', right_on_index = True, gpkg_layer = None): +def ncdf_to_gdf(ncdf_path, shp, left_on, right_on, gpkg_layer = None): """ :param str ncdf_path: path to regridded ncdf file (output of g2shp_regridding()) @@ -27,7 +27,7 @@ def ncdf_to_gdf(ncdf_path, shp, left_on = 'geomid', right_on_index = True, gpkg_ print('shp must be path to geospatial file or a geodataframe') ## Merge ncdf w/ shapefile (the shpfile has area info) & convert to GeoDataFrame - gridmet_drb_df = xr_mapped_df.merge(gdf, how ='left', left_on = left_on, right_index = right_on_index) + gridmet_drb_df = xr_mapped_df.merge(gdf, how ='left', left_on = left_on, right_on = right_on) gridmet_drb_gdf = gpd.GeoDataFrame(gridmet_drb_df) return gridmet_drb_gdf @@ -76,27 +76,3 @@ def gridmet_prms_area_avg_agg(df, groupby_cols, val_colnames, wgt_col, output_pa return df_final -# Define variables and run -if __name__ =='__main__': - - ## Variable definitions - gdf_prms_path_edited = 'https://github.com/USGS-R/drb-network-prep/blob/940073e8d77c911b6fb9dc4e3657aeab1162a158/2_process/out/GFv1_catchments_edited.gpkg?raw=true' - gdf = gpd.read_file(gdf_prms_path_edited, layer='GFv1_catchments_edited') - gridmet_ncdf = 
+
+
+checkpoint write_zarr:
+    input:
+        "drb-gridmet/{filename}.nc"
+    output:
+        directory("/tmp/{filename}.zarr")
+    run:
+        ds = xr.open_dataset(input[0])
+        # one chunk along time, 100 polygons per chunk along the id dimension
+        ds = ds.chunk({"time": len(ds.time), config["id_col"]: 100})
+        ds.to_zarr(output[0])
+
+
+def get_zarr_files(wildcards):
+    zarr_files = []
+    for path, current_directory, files in os.walk(f"/tmp/{wildcards.filename}.zarr"):
+        for file in files:
+            scratch_path = os.path.join(path, file)
+            drb_path = scratch_path.replace("/tmp", "drb-gridmet")
+            zarr_files.append(drb_path)
+    return zarr_files
+
+
+rule write_zarr_ind:
+    input:
+        "/tmp/{filename}.zarr",
+        get_zarr_files
+    output:
+        "drb-gridmet/{filename}.zarr.ind"
+    shell:
+        "touch {output[0]}"
+
+if config['use_S3']:
+    rule copy_from_scratch_to_s3:
+        input:
+            "/tmp/{filename}.zarr/{zarr_file}"
+        output:
+            S3.remote("drb-gridmet/{filename}.zarr/{zarr_file}")
+        shell:
+            "cp {input[0]} {output[0]}"
+
+else:
+    rule copy_from_scratch_to_local:
+        input:
+            "/tmp/{filename}.zarr/{zarr_file}"
+        output:
+            "drb-gridmet/{filename}.zarr/{zarr_file}"
+        shell:
+            "cp {input[0]} {output[0]}"
diff --git a/config_nhd.yml b/config_nhd.yml
new file mode 100644
index 0000000..85767c2
--- /dev/null
+++ b/config_nhd.yml
@@ -0,0 +1,23 @@
+# probably don't change these
+catchment_file_path: "https://github.com/USGS-R/drb-network-prep/blob/main/1_fetch/out/NHDPlusv2_catchments.gpkg?raw=true"
+
+fabric_id: 'nhd'
+
+id_col: "COMID"
+
+# feel free to change these
+data_vars: ['tmmx', 'tmmn', 'pr', 'srad', 'vs', 'rmax', 'rmin', 'sph']
+
+use_S3: True
+
+make_zarr: True
+
+run_prefix: "drb"
+
+# if not specified, data will be processed from 1979-01-01
+#start_date: "2022-05-09"
+
+# if not specified, data will be processed to today's date
+#end_date: "2022-06-13"
diff --git a/config_nhm.yml b/config_nhm.yml
new file mode 100644
index 0000000..2379f9b
--- /dev/null
+++ b/config_nhm.yml
@@ -0,0 +1,21 @@
+# probably don't change these
+catchment_file_path: "https://github.com/USGS-R/drb-network-prep/blob/940073e8d77c911b6fb9dc4e3657aeab1162a158/2_process/out/GFv1_catchments_edited.gpkg?raw=true"
+
+fabric_id: 'nhm'
+
+id_col: "PRMS_segid"
+
+# feel free to change these
+data_vars: ['tmmx', 'tmmn', 'pr', 'srad', 'vs', 'rmax', 'rmin', 'sph']
+
+use_S3: True
+
+make_zarr: True
+
+run_prefix: "drb"
+
+# leave blank if you want to run from 1979-01-01
+#start_date: "2022-05-09"
+
+# leave blank if you want to run through the current date
+#end_date: "2022-06-13"
diff --git a/data/.DS_Store b/data/.DS_Store
deleted file mode 100644
index 1a65eb4..0000000
Binary files a/data/.DS_Store and /dev/null differ
diff --git a/data/GFv1_catchments_edited.gpkg.zip b/data/GFv1_catchments_edited.gpkg.zip
deleted file mode 100644
index 15dbb64..0000000
Binary files a/data/GFv1_catchments_edited.gpkg.zip and /dev/null differ
diff --git a/data/PRMS_catchments_4326.zip b/data/PRMS_catchments_4326.zip
deleted file mode 100644
index 587a00e..0000000
Binary files a/data/PRMS_catchments_4326.zip and /dev/null differ
diff --git a/data/nhru_01.zip b/data/nhru_01.zip
deleted file mode 100644
index c81d7f2..0000000
Binary files a/data/nhru_01.zip and /dev/null differ
diff --git a/data/nhru_02.zip b/data/nhru_02.zip
deleted file mode 100644
index 914c8db..0000000
Binary files a/data/nhru_02.zip and /dev/null differ
diff --git a/gridmet_aggregation_PRMS.py b/gridmet_aggregation_PRMS.py
index 049c0a3..3ca59f8 100644
--- a/gridmet_aggregation_PRMS.py
+++ b/gridmet_aggregation_PRMS.py
@@ -5,7 +5,7 @@ import time
 
 # ncdf_to_gdf() function converts ncdf to dataset and merged with shpfile information (geometry + area)
-def ncdf_to_gdf(ncdf_path, shp, left_on = 'geomid', right_on_index = True, gpkg_layer = None):
+def ncdf_to_gdf(ncdf_path, shp, left_on, right_on, gpkg_layer=None):
     """
 
     :param str ncdf_path: path to regridded ncdf file (output of g2shp_regridding())
@@ -27,7 +27,7 @@ def ncdf_to_gdf(ncdf_path, shp, left_on = 'geomid', right_on_index = True, gpkg_
         print('shp must be path to geospatial file or a geodataframe')
 
     ## Merge ncdf w/ shapefile (the shpfile has area info) & convert to GeoDataFrame
-    gridmet_drb_df = xr_mapped_df.merge(gdf, how ='left', left_on = left_on, right_index = right_on_index)
+    gridmet_drb_df = xr_mapped_df.merge(gdf, how='left', left_on=left_on, right_on=right_on)
     gridmet_drb_gdf = gpd.GeoDataFrame(gridmet_drb_df)
 
     return gridmet_drb_gdf
@@ -76,27 +76,3 @@
     return df_final
 
-
-# Define variables and run
-if __name__ =='__main__':
-
-    ## Variable definitions
-    gdf_prms_path_edited = 'https://github.com/USGS-R/drb-network-prep/blob/940073e8d77c911b6fb9dc4e3657aeab1162a158/2_process/out/GFv1_catchments_edited.gpkg?raw=true'
-    gdf = gpd.read_file(gdf_prms_path_edited, layer='GFv1_catchments_edited')
-    gridmet_ncdf = './data/t_climate_2022_03_31.nc'
-    data_vars_shrt_all = ['tmmx', 'tmmn', 'pr', 'srad', 'vs', 'rmax', 'rmin', 'sph']
-
-    ## Create dataframe and merge with shapefile information
-    gridmet_drb_gdf = ncdf_to_gdf(ncdf_path=gridmet_ncdf,
-                                  shp = gdf,
-                                  left_on = 'geomid',
-                                  right_on_index = True)
-
-    ## run aggregation on PRMS_segid and time
-    df_agg = gridmet_prms_area_avg_agg(gridmet_drb_gdf,
-                                       groupby_cols = ['PRMS_segid',"time"],
-                                       val_colnames = data_vars_shrt_all,
-                                       wgt_col='hru_area_m2',
-                                       output_path= None)
-
-    ## Uncomment to run
-    # df_agg.reset_index().to_csv('../drb-inland-salinity-ml/1_fetch/in/grdmet_drb_agg_032321.csv', index = False)
\ No newline at end of file
diff --git a/gridmet_split_script.py b/gridmet_split_script.py
index 42d80d5..73ae52f 100644
--- a/gridmet_split_script.py
+++ b/gridmet_split_script.py
@@ -88,6 +88,9 @@ def create_weightmap(xarray_dict, polygon, output_data_folder, weightmap_var = N
     else:
         raise TypeError('polygon should be str path to shapefile or geodataframe')
 
+    # reproject the polygons to lat/lon (EPSG:4326) to match the gridMET grid
+    polygon = polygon.to_crs('EPSG:4326')
+    print('geodataframe crs changed to EPSG:4326')
+
     ## Name output weightmap_file
     if weightmap_var is None:
         # If no var is specified, first var of xarray_dict will be used for generating weight file, and name of output will be generic: 'grd2shp_weights.pickle'
@@ -129,6 +132,9 @@ def g2shp_regridding(xarray_dict, polygon, weightmap_file, output_data_folder, g
     # List of xarray.datasets
     vars_grd_list = list(xarray_dict.values())
 
+    polygon = polygon.to_crs('EPSG:4326')
+    print('geodataframe crs changed to EPSG:4326')
+
     ## initialize Grd2ShpXagg()
     g2s = grd2shp_xagg.Grd2ShpXagg()
     g2s.initialize(
@@ -148,7 +154,6 @@ def g2shp_regridding(xarray_dict, polygon, weightmap_file, output_data_folder, g
     g2s.run_weights()
     end = time.perf_counter()
     print('finished agg in {} second(s)'.format(round(end - start, 2)))
-    print(g2s)
 
     ## Saving output
     g2s.write_gm_file(opath=output_data_folder, prefix=g2s_file_prefix)
@@ -160,42 +165,3 @@ def g2shp_regridding(xarray_dict, polygon, weightmap_file, output_data_folder, g
     return g2s
 
-
-# Variables and run functions
-if __name__ =='__main__':
-
-    ## Variable definitions
-    ### official list of variables needed for drb-inland-salinity model
-    data_vars_shrt_all = ['tmmx', 'tmmn', 'pr', 'srad', 'vs','rmax','rmin','sph']
-    ### Catchment polygons
-    gdf_prms_path_edited = 'https://github.com/USGS-R/drb-network-prep/blob/940073e8d77c911b6fb9dc4e3657aeab1162a158/2_process/out/GFv1_catchments_edited.gpkg?raw=true'
-    gdf = gpd.read_file(gdf_prms_path_edited, layer = 'GFv1_catchments_edited')
-    ### date range
-    start_date = '1979-01-01'
-    end_date = '2022-01-01'
-    ### lat lon
-    lon_min, lat_min, lon_max, lat_max = gdf.total_bounds
-    ### output folder
-    output_path = './data/'
-
-    xarray_dict = get_gridmet_datasets(variable = data_vars_shrt_all,
-                                       start_date= start_date, end_date = end_date,
-                                       polygon_for_bbox = gdf)
-    # lat_min=round(lat_min,2), lon_min=round(lon_min,2),
-    # lat_max=round(lat_max,2), lon_max=round(lon_max,2))
-
-    weight_map_path = create_weightmap(xarray_dict = xarray_dict,
-                                       polygon=gdf,
-                                       output_data_folder = output_path,
-                                       weightmap_var = 'tmmn')
-
-    ### Subset for streamlined testing
-    # subset = {key: xarray_dict[key] for key in ['tmmx']}
-
-    g2shp_regridding(xarray_dict= xarray_dict,
-                     polygon=gdf,
-                     weightmap_file= weight_map_path,
-                     g2s_file_prefix='t_',
-                     output_data_folder= output_path,
-                     g2s_time_var = 'day',
-                     g2s_lat_var = 'lat',
-                     g2s_lon_var = 'lon')
\ No newline at end of file
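+
+# Example of calling these functions directly for a single short run
+# (hypothetical sketch; mirrors what the Snakefile rules do, with
+# 'catchments.gpkg' standing in for your polygon file):
+#
+#     gdf = gpd.read_file('catchments.gpkg')
+#     xarray_dict = get_gridmet_datasets(variable='tmmx',
+#                                        start_date='2001-01-01',
+#                                        end_date='2001-01-07',
+#                                        polygon_for_bbox=gdf)
+#     weight_map_path = create_weightmap(xarray_dict=xarray_dict, polygon=gdf,
+#                                        output_data_folder='./data/')
+#     g2shp_regridding(xarray_dict=xarray_dict, polygon=gdf,
+#                      weightmap_file=weight_map_path, g2s_file_prefix='t_',
+#                      output_data_folder='./data/', g2s_time_var='day',
+#                      g2s_lat_var='lat', g2s_lon_var='lon')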