
Commit d56cb88

fix bug with drop overlapping assim flag (#597)
* fix bug with drop overlapping assim flag
* publish docs on merge or manual trigger not PR
1 parent 7a2b8da commit d56cb88
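
The flag's intended behavior, now handled in the fetching utilities rather than at load time, is to blank out reference_time on assimilation timeseries so that values overlapping in value_time can be collapsed later. A minimal pandas sketch of that effect (the sample frame, column names, and configuration string below are illustrative stand-ins, not the TEEHR API):

import pandas as pd

# Two assimilation values for the same location and value_time, issued from
# different reference_time cycles (illustrative data only).
df = pd.DataFrame({
    "location_id": ["loc-1", "loc-1"],
    "value_time": pd.to_datetime(["2024-01-01 01:00", "2024-01-01 01:00"]),
    "reference_time": pd.to_datetime(["2024-01-01 00:00", "2024-01-01 01:00"]),
    "value": [1.2, 1.3],
})

drop_overlapping_assimilation_values = True
configuration = "analysis_assim"  # assumed NWM configuration name

if drop_overlapping_assimilation_values and "assim" in configuration:
    # Blank out reference_time so a downstream duplicate-dropping step keyed on
    # location and value_time can keep a single row per overlapping timestamp.
    df.loc[:, "reference_time"] = pd.NaT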


7 files changed: +37 −24 lines changed


.github/workflows/documentation-publish.yml

Lines changed: 5 additions & 3 deletions
@@ -1,9 +1,11 @@
 name: build-publish-documentation

 on:
-  pull_request:
-    types: [opened, reopened, ready_for_review]
-    branches: [main]
+  workflow_dispatch:
+
+  # pull_request:
+  #   types: [opened, reopened, ready_for_review]
+  #   branches: [main]

   push:
     branches: [main]

src/teehr/evaluation/fetch.py

Lines changed: 2 additions & 4 deletions
@@ -911,8 +911,7 @@ def nwm_operational_points(
             ),
             timeseries_type=timeseries_type,
             write_mode=write_mode,
-            drop_duplicates=drop_duplicates,
-            drop_overlapping_assimilation_values=drop_overlapping_assimilation_values  # noqa
+            drop_duplicates=drop_duplicates
         )

     def nwm_operational_grids(
@@ -1215,6 +1214,5 @@ def nwm_operational_grids(
             in_path=Path(self.nwm_cache_dir),
             timeseries_type=timeseries_type,
             write_mode=write_mode,
-            drop_duplicates=drop_duplicates,
-            drop_overlapping_assimilation_values=drop_overlapping_assimilation_values  # noqa
+            drop_duplicates=drop_duplicates
         )

src/teehr/fetching/nwm/grid_utils.py

Lines changed: 7 additions & 1 deletion
@@ -183,7 +183,8 @@ def fetch_and_format_nwm_grids(
     overwrite_output: bool,
     location_id_prefix: Union[str, None],
     variable_mapper: Dict[str, Dict[str, str]],
-    timeseries_type: TimeseriesTypeEnum
+    timeseries_type: TimeseriesTypeEnum,
+    drop_overlapping_assimilation_values: bool
 ):
     """Compute weighted average, grouping by reference time.

@@ -243,6 +244,11 @@
             Path(output_parquet_dir), f"{ref_time_str}.parquet"
         )
         z_hour_df.sort_values([LOCATION_ID, VALUE_TIME], inplace=True)
+
+        if drop_overlapping_assimilation_values and "assim" in nwm_configuration_name:
+            # Set reference_time to NaT for assimilation values
+            z_hour_df.loc[:, REFERENCE_TIME] = pd.NaT
+
         write_timeseries_parquet_file(
             filepath=parquet_filepath,
             overwrite_output=overwrite_output,
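
In the grid path the new branch keys off the NWM configuration name, so only assimilation products are affected. A quick illustration of the substring check (the configuration names here are examples, not an exhaustive list):

# Only names containing "assim" trigger the reference_time blanking.
for name in ["analysis_assim", "analysis_assim_extend", "short_range"]:
    print(name, "->", "assim" in name)
# analysis_assim -> True
# analysis_assim_extend -> True
# short_range -> False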

src/teehr/fetching/nwm/nwm_grids.py

Lines changed: 2 additions & 1 deletion
@@ -382,5 +382,6 @@ def nwm_grids_to_parquet(
         overwrite_output=overwrite_output,
         location_id_prefix=location_id_prefix,
         variable_mapper=variable_mapper,
-        timeseries_type=timeseries_type
+        timeseries_type=timeseries_type,
+        drop_overlapping_assimilation_values=drop_overlapping_assimilation_values
     )

src/teehr/fetching/nwm/nwm_points.py

Lines changed: 2 additions & 1 deletion
@@ -348,5 +348,6 @@ def nwm_to_parquet(
         overwrite_output,
         nwm_version,
         variable_mapper,
-        timeseries_type
+        timeseries_type,
+        drop_overlapping_assimilation_values
     )

src/teehr/fetching/nwm/point_utils.py

Lines changed: 18 additions & 3 deletions
@@ -102,7 +102,8 @@ def process_chunk_of_files(
     overwrite_output: bool,
     nwm_version: str,
     variable_mapper: Dict[str, Dict[str, str]],
-    timeseries_type: TimeseriesTypeEnum
+    timeseries_type: TimeseriesTypeEnum,
+    drop_overlapping_assimilation_values: bool
 ):
     """Assemble a table for a chunk of NWM files."""
     location_ids = np.array(location_ids).astype(int)
@@ -156,6 +157,12 @@
     end = f"{end_json[1]}T{end_json[3][1:3]}F{end_json[6][1:]}"
     filename = f"{start}_{end}.parquet"

+    if drop_overlapping_assimilation_values and "assim" in configuration:
+        # Set reference_time to NaT for assimilation values
+        df_output = output_table.to_pandas()
+        df_output.loc[:, REFERENCE_TIME] = pd.NaT
+        output_table = pa.Table.from_pandas(df_output, schema=schema)
+
     write_timeseries_parquet_file(
         Path(output_parquet_dir, filename),
         overwrite_output,
@@ -176,7 +183,8 @@ def fetch_and_format_nwm_points(
     overwrite_output: bool,
     nwm_version: str,
     variable_mapper: Dict[str, Dict[str, str]],
-    timeseries_type: TimeseriesTypeEnum
+    timeseries_type: TimeseriesTypeEnum,
+    drop_overlapping_assimilation_values: bool
 ):
     """Fetch NWM point data and save as parquet files.

@@ -211,6 +219,12 @@
         they already exist. True = overwrite; False = fail.
     nwm_version : str
         Specified NWM version.
+    variable_mapper : Dict[str, Dict[str, str]]
+        A mapping dictionary for variable names and units.
+    timeseries_type : TimeseriesTypeEnum
+        The type of timeseries being processed.
+    drop_overlapping_assimilation_values : bool
+        Whether to drop assimilation values that overlap in value_time.
     """
     output_parquet_dir = Path(output_parquet_dir)
     if not output_parquet_dir.exists():
@@ -241,5 +255,6 @@
         overwrite_output,
         nwm_version,
         variable_mapper,
-        timeseries_type
+        timeseries_type,
+        drop_overlapping_assimilation_values
     )
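
Because the point path assembles a pyarrow Table rather than a pandas DataFrame, the fix round-trips through pandas to null the column and then rebuilds the Table against the original schema. A self-contained sketch of that round-trip (the schema and data are stand-ins, not the TEEHR point schema):

import pandas as pd
import pyarrow as pa

# Stand-in schema with a nullable reference_time column.
schema = pa.schema([
    ("location_id", pa.string()),
    ("value_time", pa.timestamp("ms")),
    ("reference_time", pa.timestamp("ms")),
    ("value", pa.float64()),
])

output_table = pa.Table.from_pydict({
    "location_id": ["loc-1"],
    "value_time": [pd.Timestamp("2024-01-01 01:00")],
    "reference_time": [pd.Timestamp("2024-01-01 00:00")],
    "value": [1.2],
}, schema=schema)

# Convert to pandas, blank out reference_time, then rebuild the Table so it
# still conforms to the expected schema before the parquet file is written.
df_output = output_table.to_pandas()
df_output.loc[:, "reference_time"] = pd.NaT
output_table = pa.Table.from_pandas(df_output, schema=schema, preserve_index=False)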

src/teehr/loading/timeseries.py

Lines changed: 1 addition & 11 deletions
@@ -247,8 +247,7 @@ def validate_and_insert_timeseries(
     timeseries_type: str,
     pattern: str = "**/*.parquet",
     write_mode: TableWriteEnum = "append",
-    drop_duplicates: bool = True,
-    drop_overlapping_assimilation_values: bool = False
+    drop_duplicates: bool = True
 ):
     """Validate and insert primary timeseries data.

@@ -272,12 +271,6 @@
     drop_duplicates : bool, optional (default: True)
         Whether to drop duplicates in the dataframe before writing
         to the table.
-    drop_overlapping_assimilation_values: Optional[bool] = True
-        Whether to drop overlapping assimilation values. Default is True.
-        If True, values that overlap in value_time are dropped, keeping those with
-        the most recent reference_time. In this case, all reference_time values
-        are set to None. If False, overlapping values are kept and reference_time
-        is retained.
     """  # noqa
     in_path = Path(in_path)
     logger.info(f"Validating and inserting timeseries data from {in_path}")
@@ -292,9 +285,6 @@
     # Read the converted files to Spark DataFrame
     df = table._read_files(in_path, pattern)

-    if drop_overlapping_assimilation_values:
-        df = df.withColumn("reference_time", lit(None))
-
     # Validate using the _validate() method
     validated_df = table._validate(
         df=df,
