Commit 7222ff9

Merge pull request #414 from RTIInternational/410-grid-weights-file-being-generated-with-rowcolumns-as-float-instead-of-integer
Add schema validation to weights file for NWM grid fetching
2 parents: 34bff66 + abd7138

File tree

10 files changed: +112 -71 lines changed

src/teehr/evaluation/fetch.py

Lines changed: 5 additions & 4 deletions

@@ -478,11 +478,12 @@ def nwm_retrospective_grids(
 >>> ev = teehr.Evaluation()

 >>> ev.fetch.nwm_retrospective_grids(
->>>     nwm_configuration="forcing_short_range",
+>>>     nwm_version="nwm30",
 >>>     variable_name="RAINRATE",
 >>>     zonal_weights_filepath = Path(Path.home(), "nextgen_03S_weights.parquet"),
 >>>     start_date=datetime(2000, 1, 1),
->>>     end_date=datetime(2001, 1, 1)
+>>>     end_date=datetime(2001, 1, 1),
+>>>     location_id_prefix="huc10"
 >>> )

 .. note::

@@ -496,12 +497,12 @@ def nwm_retrospective_grids(

 >>> nwm_retro_grids_to_parquet(
 >>>     nwm_version="nwm30",
->>>     nwm_configuration="forcing_short_range",
 >>>     variable_name="RAINRATE",
 >>>     zonal_weights_filepath=Path(Path.home(), "nextgen_03S_weights.parquet"),
 >>>     start_date=2020-12-18,
 >>>     end_date=2022-12-18,
->>>     output_parquet_dir=Path(Path.home(), "temp/parquet")
+>>>     output_parquet_dir=Path(Path.home(), "temp/parquet"),
+>>>     location_id_prefix="huc10",
 >>> )

 See Also
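The docstring examples above pick up the new location_id_prefix argument, which namespaces the zone IDs in the fetched output. A minimal sketch of the "<prefix>-<id>" convention, in plain pandas with made-up data; the rule mirrors the location_id_prefix + "-" + df[LOCATION_ID] line visible in generate_weights.py below:

    import pandas as pd

    # Toy frame standing in for fetched output; only the prefixing rule is real.
    df = pd.DataFrame({"location_id": ["1016000606"], "value": [0.25]})
    df["location_id"] = "huc10" + "-" + df["location_id"]
    print(df["location_id"].tolist())  # ['huc10-1016000606']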

src/teehr/fetching/nwm/grid_utils.py

Lines changed: 13 additions & 3 deletions

@@ -8,6 +8,7 @@
 import pandas as pd
 import xarray as xr

+import teehr.models.pandera_dataframe_schemas as schemas
 from teehr.fetching.utils import (
     get_dataset,
     write_timeseries_parquet_file,

@@ -94,6 +95,17 @@ def compute_weighted_average(
     return df[[LOCATION_ID, VALUE]].copy()


+def read_and_validate_weights_file(
+    weights_filepath: str
+) -> pd.DataFrame:
+    """Read weights file from parquet, validating data types."""
+    schema = schemas.weights_file_schema()
+    weights_df = pd.read_parquet(
+        weights_filepath, columns=list(schema.columns.keys())
+    )
+    return schema.validate(weights_df)
+
+
 @dask.delayed
 def process_single_nwm_grid_file(
     row: Tuple,

@@ -121,9 +133,7 @@ def process_single_nwm_grid_file(
     value_time = ds.time.values[0]
     da = ds[variable_name][0]

-    weights_df = pd.read_parquet(
-        weights_filepath, columns=["row", "col", "weight", LOCATION_ID]
-    )
+    weights_df = read_and_validate_weights_file(weights_filepath)

     weights_bounds = get_weights_row_col_stats(weights_df)
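This helper is the crux of the fix for issue #410: even when a weights parquet was written with row/col as floats, the schema's coerce=True casts them back to integers on read, and pandera raises a validation error if a column cannot be coerced. A quick sketch exercising it (assumes write access for a local "weights.parquet"; the float dtypes simulate the bug):

    import pandas as pd
    from teehr.fetching.nwm.grid_utils import read_and_validate_weights_file

    # Simulate the issue #410 symptom: row/col stored as float in the file.
    pd.DataFrame({
        "row": [10.0, 11.0],
        "col": [20.0, 21.0],
        "weight": [0.25, 0.75],
        "location_id": ["1016000606", "1016000606"],
    }).to_parquet("weights.parquet")

    # Reads only the schema's columns, then validates and coerces dtypes.
    weights_df = read_and_validate_weights_file("weights.parquet")
    print(weights_df.dtypes)  # row/col come back as int32, weight as float32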

src/teehr/fetching/nwm/nwm_grids.py

Lines changed: 0 additions & 28 deletions

@@ -295,31 +295,3 @@ def nwm_grids_to_parquet(
         variable_mapper=variable_mapper,
         timeseries_type=timeseries_type
     )
-
-
-# if __name__ == "__main__":
-#     # Local testing
-#     weights_parquet = "/mnt/data/ciroh/onehuc10_weights.parquet"
-
-#     import time
-#     t1 = time.time()
-
-#     nwm_grids_to_parquet(
-#         configuration="forcing_analysis_assim",
-#         output_type="forcing",
-#         variable_name="RAINRATE",
-#         start_date="2023-11-28",
-#         ingest_days=1,
-#         zonal_weights_filepath=weights_parquet,
-#         json_dir="/mnt/data/ciroh/jsons",
-#         output_parquet_dir="/mnt/data/ciroh/parquet",
-#         nwm_version="nwm30",
-#         data_source="GCS",
-#         kerchunk_method="auto",
-#         t_minus_hours=[0],
-#         ignore_missing_file=False,
-#         overwrite_output=True,
-#         location_id_prefix="wbd10"
-#     )
-
-#     print(f"elapsed: {time.time() - t1:.2f} s")

src/teehr/fetching/nwm/retrospective_grids.py

Lines changed: 4 additions & 9 deletions

@@ -45,7 +45,6 @@
 from teehr.fetching.const import (
     VALUE_TIME,
     REFERENCE_TIME,
-    LOCATION_ID,
     UNIT_NAME,
     VARIABLE_NAME,
     CONFIGURATION_NAME

@@ -61,7 +60,8 @@
     update_location_id_prefix,
     compute_weighted_average,
     get_nwm_grid_data,
-    get_weights_row_col_stats
+    get_weights_row_col_stats,
+    read_and_validate_weights_file
 )
 from teehr.fetching.utils import (
     write_timeseries_parquet_file,

@@ -107,9 +107,7 @@ def process_nwm30_retro_group(
     and the output is saved to parquet files.
     """
     logger.debug("Processing NWM v3.0 retro grid data chunk.")
-    weights_df = pd.read_parquet(
-        weights_filepath, columns=["row", "col", "weight", LOCATION_ID]
-    )
+    weights_df = read_and_validate_weights_file(weights_filepath)

     weights_bounds = get_weights_row_col_stats(weights_df)

@@ -149,7 +147,6 @@ def process_nwm30_retro_group(
     if location_id_prefix:
         chunk_df = update_location_id_prefix(chunk_df, location_id_prefix)

-
     return chunk_df


@@ -203,9 +200,7 @@ def process_single_nwm21_retro_grid_file(
     value_time = row.datetime
     da = ds[variable_name].isel(Time=0)

-    weights_df = pd.read_parquet(
-        weights_filepath, columns=["row", "col", "weight", LOCATION_ID]
-    )
+    weights_df = read_and_validate_weights_file(weights_filepath)

     weights_bounds = get_weights_row_col_stats(weights_df)
src/teehr/models/pandera_dataframe_schemas.py

Lines changed: 30 additions & 0 deletions

@@ -313,6 +313,36 @@ def location_crosswalks_schema(
         coerce=True
     )

+
+def weights_file_schema() -> pa.DataFrameSchema:
+    """Return the schema for a weights file."""
+    return pa.DataFrameSchema(
+        columns={
+            "row": pa.Column(
+                pa.Int32,
+                nullable=False,
+                coerce=True
+            ),
+            "col": pa.Column(
+                pa.Int32,
+                nullable=False,
+                coerce=True
+            ),
+            "weight": pa.Column(
+                pa.Float32,
+                nullable=False,
+                coerce=True
+            ),
+            "location_id": pa.Column(
+                pa.String,
+                nullable=False,
+                coerce=True
+            )
+        },
+        strict="filter"
+    )
+
+
 # Timeseries
 pandas_value_type = pa.Float32()
 pyspark_value_type = T.FloatType()
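Two details of this schema do the work here: coerce=True means validation casts rather than rejects (so float row/col values become Int32), and strict="filter" silently drops any columns not declared in the schema instead of erroring. A small standalone sketch mirroring the schema above, with toy data:

    import pandas as pd
    import pandera as pa

    schema = pa.DataFrameSchema(
        columns={
            "row": pa.Column(pa.Int32, nullable=False, coerce=True),
            "col": pa.Column(pa.Int32, nullable=False, coerce=True),
            "weight": pa.Column(pa.Float32, nullable=False, coerce=True),
            "location_id": pa.Column(pa.String, nullable=False, coerce=True),
        },
        strict="filter",
    )

    df = pd.DataFrame({
        "row": [1.0],                 # float -> Int32 via coerce
        "col": [2.0],
        "weight": [0.5],
        "location_id": [1016000606],  # int -> string via coerce
        "extra": ["dropped"],         # removed by strict="filter"
    })
    validated = schema.validate(df)
    print(list(validated.columns))    # ['row', 'col', 'weight', 'location_id']
    print(validated.dtypes)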

src/teehr/utilities/generate_weights.py

Lines changed: 19 additions & 27 deletions

@@ -14,6 +14,7 @@

 from teehr.fetching.utils import load_gdf
 from teehr.fetching.const import LOCATION_ID
+import teehr.models.pandera_dataframe_schemas as schemas


 @dask.delayed

@@ -236,11 +237,19 @@ def generate_weights_file(
     grid_transform = src_da.rio.transform()
     nodata_val = src_da.rio.nodata

+    if not all([dim in src_da.dims for dim in ["x", "y"]]):
+        raise ValueError("Template dataset must have x and y dimensions.")
+
     # Get the subset of the grid that intersects the total zone bounds
     bbox = tuple(zone_gdf.total_bounds)
-    src_da = src_da.sel(x=slice(bbox[0], bbox[2]), y=slice(bbox[1], bbox[3]))[
-        0
-    ]
+    if len(ds.dims) == 2:
+        src_da = src_da.sel(
+            x=slice(bbox[0], bbox[2]), y=slice(bbox[1], bbox[3])
+        )
+    else:
+        src_da = src_da.sel(
+            x=slice(bbox[0], bbox[2]), y=slice(bbox[1], bbox[3])
+        )[0]
     src_da = src_da.astype("float32")
     src_da["x"] = np.float32(src_da.x.values)
     src_da["y"] = np.float32(src_da.y.values)

@@ -275,28 +284,11 @@ def generate_weights_file(
     if location_id_prefix:
         df.loc[:, LOCATION_ID] = location_id_prefix + "-" + df[LOCATION_ID]

+    schema = schemas.weights_file_schema()
+    validated_df = schema.validate(df)
+
     if output_weights_filepath:
-        df.to_parquet(output_weights_filepath)
-        df = None
-
-    return df
-
-
-# if __name__ == "__main__":
-#     # Local testing
-#     zone_polygon_filepath = "/mnt/data/wbd/one_alaska_huc10.parquet"
-#     template_dataset = "/mnt/data/ciroh/nwm_temp/nwm.20231101_forcing_analysis_assim_alaska_nwm.t00z.analysis_assim.forcing.tm01.alaska.nc"  # noqa
-#     variable_name = "RAINRATE"
-#     unique_zone_id = "huc10"
-#     output_weights_filepath = (
-#         "/mnt/sf_shared/data/ciroh/one_huc10_alaska_weights.parquet"
-#     )
-
-#     generate_weights_file(
-#         zone_polygon_filepath=zone_polygon_filepath,
-#         template_dataset=template_dataset,
-#         variable_name=variable_name,
-#         output_weights_filepath=output_weights_filepath,
-#         crs_wkt=AL_NWM_WKT,
-#         unique_zone_id=unique_zone_id
-#     )
+        validated_df.to_parquet(output_weights_filepath)
+        validated_df = None
+
+    return validated_df
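With the validate-before-write change, the weights schema is now enforced at generation time as well as read time, so new weights files can no longer be written with float row/col. The dimension guard and branch also make the template handling explicit: the grid must expose x and y dims, and a 2-D array is sliced directly while a higher-dimensional one (e.g. with a leading time dim) keeps the original [0] indexing. A toy xarray illustration of that shape handling (hypothetical arrays, not NWM data):

    import numpy as np
    import xarray as xr

    # 3-D template (time, y, x): slice to a bbox, then drop the leading dim.
    da = xr.DataArray(
        np.zeros((1, 4, 5), dtype="float32"),
        dims=("time", "y", "x"),
        coords={"y": np.arange(4.0), "x": np.arange(5.0)},
    )
    assert all(dim in da.dims for dim in ["x", "y"])  # the new guard's check

    subset = da.sel(x=slice(1.0, 3.0), y=slice(0.0, 2.0))[0]
    print(subset.dims)  # ('y', 'x')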
3 new test-data files (Git LFS pointers; file paths not captured in this view, but they appear to be the tests/data/nwm30 files referenced by tests/test_generate_weights.py below)

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:160df6bd1d64618fe23aca51903693b8aa5ce6e7fca53b56501282bfbda66436
+size 70864318

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cc0cebb8ffde0f474417d0d063c9dbb5125cdc853ca91c2e9b6d6fe4dfd4e87
+size 6014

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04cb0fb0df5f9c628394749cda31fc0bb78c5a56fe94dffde4a77bf3b6a08759
+size 3047

tests/test_generate_weights.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+"""Test the generation of weights."""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from teehr.utilities.generate_weights import generate_weights_file
+from teehr.fetching.const import CONUS_NWM_WKT
+
+
+TEST_DIR = Path("tests", "data", "nwm30")
+TEMPLATE_FILEPATH = Path(TEST_DIR, "nwm_retro_v3_template_grid.nc")
+ZONES_FILEPATH = Path(TEST_DIR, "one_huc10_conus_1016000606.parquet")
+WEIGHTS_FILEPATH = Path(TEST_DIR, "one_huc10_1016000606_teehr_weights.parquet")
+
+
+def test_weights():
+    """Test the generation of weights."""
+    df = generate_weights_file(
+        zone_polygon_filepath=ZONES_FILEPATH,
+        template_dataset=TEMPLATE_FILEPATH,
+        variable_name="RAINRATE",
+        crs_wkt=CONUS_NWM_WKT,
+        output_weights_filepath=None,
+        unique_zone_id="id",
+    )
+
+    df_test = pd.read_parquet(WEIGHTS_FILEPATH).astype({"weight": np.float32})
+
+    assert df.equals(df_test)
+
+
+if __name__ == "__main__":
+    test_weights()
