
Commit dc2b4ba

samlamont and mgdenno authored

424 parsing xml fews timeseries is slow (#425)

* updated xml parser; hefs loading tests WIP
* update null field checks, revert previous temp changes
* remove previous xml parsing func
* remove unneeded funcs
* include timezone offset
* parallelize timeseries loading from directory; xml parsing update
* add max_workers argument
* Update unique column set fields
* Update null column checks
* update test, add member to field check
* update null column check
* update max cpu count
* Adds persist_dataframe argument for loading timeseries
* Aggregate timeseries in cache according to file size
* Add drop_duplicates test
* change function name
* fix function references
* Update concurrent convert_timeseries logic
* set default persist_dataframe to false
* Validate dynamic overwrite
* adds spark config.
* notebook update
* fix generate_weights_file() doc string
* add drop_duplicates argument; remove null column checks
* update write mode
* remove unneeded if statement
* start changelog updates
* changelog updates
* few more notes

---------

Co-authored-by: Matt Denno <mgdenno@gmail.com>

1 parent 77430b3 commit dc2b4ba

24 files changed: +613 −274 lines

docs/sphinx/changelog/index.rst

Lines changed: 12 additions & 0 deletions
@@ -1,6 +1,18 @@
 Release Notes
 =============
 
+0.4.11 - 2025-05-19
+------------------
+
+Changed
+^^^^^^^
+- Fixes bug in _write_spark_df() method in the BaseTable class that caused writing larger dataframes to fail.
+- Parallelizes `convert_single_timeseries()` when a directory is passed to the `in_path` argument.
+- Fixes docstring in `generate_weights_file()`.
+- Switched to the built-in dropDuplicates() method in the `BaseTable` class to drop duplicates instead of using a custom implementation.
+- Added option to specify the number of partitions when writing dataframes in the BaseTable class.
+- Added the option to skip the dropDuplicates() method when writing dataframes in the BaseTable class.
+
 0.4.10 - 2025-04-14
 ------------------
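For context on the dropDuplicates() entries above, here is a minimal PySpark sketch of keyed de-duplication with an opt-out flag. This is not TEEHR's actual BaseTable code; the column names and the flag wiring are illustrative.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("usgs-A", "2023-01-01 00:00", 1.0),
     ("usgs-A", "2023-01-01 00:00", 1.0)],
    ["location_id", "value_time", "value"],
)

drop_duplicates = True  # mirrors the new opt-out described above
if drop_duplicates:
    # Built-in Spark de-duplication, keyed to the unique column set.
    df = df.dropDuplicates(subset=["location_id", "value_time"])
df.show()  # one row remains
```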

docs/sphinx/user_guide/notebooks/03_introduction_class.ipynb

Lines changed: 3 additions & 3 deletions
@@ -253,7 +253,7 @@
     "outputs": [],
     "source": [
     "# Filter using a raw SQL string\n",
-    "ev.locations.filter(\"id = 'gage-A'\").to_geopandas()"
+    "ev.locations.filter(\"id = 'usgs-A'\").to_geopandas()"
     ]
    },
    {
@@ -266,7 +266,7 @@
     "ev.locations.filter({\n",
     " \"column\": \"id\",\n",
     " \"operator\": \"=\",\n",
-    " \"value\": \"gage-A\"\n",
+    " \"value\": \"usgs-A\"\n",
     "}).to_geopandas()"
     ]
    },
@@ -286,7 +286,7 @@
     "lf = LocationFilter(\n",
     " column=fields.id,\n",
     " operator=Operators.eq,\n",
-    " value=\"gage-A\"\n",
+    " value=\"usgs-A\"\n",
     ")\n",
     "ev.locations.filter(lf).to_geopandas()"
     ]

poetry.lock

Lines changed: 155 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ bokeh = "^3.5.0"
 scoringrules = "^0.7.1"
 hvplot = "^0.11.1"
 geoviews = "^1.14.0"
+lxml = "^5.3.2"
 
 [tool.poetry.group.test.dependencies]
 pytest = "^7.4.3"
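The new lxml dependency supports the faster XML parsing this PR targets. Below is a minimal sketch of streaming a FEWS PI-XML file with lxml.etree.iterparse, which avoids building the full document tree; the namespace is the standard FEWS PI one, but the element layout is an assumption, not taken from TEEHR's parser.

```python
from lxml import etree

PI_NS = "{http://www.wldelft.nl/fews/PI}"  # assumed FEWS PI namespace

def iter_events(xml_path: str):
    """Yield (date, time, value) from each <event> element, streaming."""
    for _, elem in etree.iterparse(xml_path, tag=PI_NS + "event"):
        yield elem.get("date"), elem.get("time"), float(elem.get("value"))
        elem.clear()  # release parsed elements so memory stays flat
```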

src/teehr/const.py

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,6 @@
 """This module contains constants
 used throughout the package."""
-
+import os
 
 # Primary evaluation directories
 DATASET_DIR = "dataset"
@@ -24,3 +24,5 @@
 LOADING_CACHE_DIR = "loading"
 
 S3_EVALUATIONS_PATH = "s3://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/evaluations.yaml"
+
+MAX_CPUS = max(os.cpu_count() - 1, 1)
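A sketch of how a MAX_CPUS-sized pool might drive the parallel directory loading mentioned in the commit message; parse_one and parse_directory are hypothetical stand-ins, not TEEHR's actual functions.

```python
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# As defined in const.py above: leave one core free, never below one worker.
MAX_CPUS = max(os.cpu_count() - 1, 1)

def parse_one(xml_file: Path) -> int:
    """Hypothetical per-file parser; returns file size as a placeholder."""
    return xml_file.stat().st_size

def parse_directory(in_path: Path, max_workers: int = MAX_CPUS) -> list:
    """Parse every XML file in a directory across a process pool."""
    files = sorted(in_path.glob("*.xml"))
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(parse_one, files))
```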

src/teehr/evaluation/evaluation.py

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ def __init__(
             .set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
             .set("spark.sql.execution.arrow.pyspark.enabled", "true")
             .set("spark.sql.session.timeZone", "UTC")
+            .set("spark.driver.host", "localhost")
         )
         self.spark = SparkSession.builder.config(conf=conf).getOrCreate()
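Pinning spark.driver.host can sidestep driver hostname-resolution stalls some local setups hit. A compressed sketch of the surrounding configuration; only the options visible in this diff are shown.

```python
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = (
    SparkConf()
    .set("spark.sql.execution.arrow.pyspark.enabled", "true")
    .set("spark.sql.session.timeZone", "UTC")
    .set("spark.driver.host", "localhost")  # the line added in this commit
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
```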

src/teehr/evaluation/fetch.py

Lines changed: 28 additions & 8 deletions
@@ -131,6 +131,7 @@ def usgs_streamflow(
         overwrite_output: Optional[bool] = False,
         timeseries_type: TimeseriesTypeEnum = "primary",
         write_mode: TableWriteEnum = "append",
+        drop_duplicates: bool = True,
     ):
         """Fetch USGS gage data and load into the TEEHR dataset.
 
@@ -175,6 +176,8 @@
             that does not already exist.
             If "upsert", existing data will be replaced and new data that
             does not exist will be appended.
+        drop_duplicates : bool
+            Whether to drop duplicates in the data. Default is True.
 
 
         .. note::
 
@@ -268,7 +271,8 @@
                 self.usgs_cache_dir
             ),
             timeseries_type=timeseries_type,
-            write_mode=write_mode
+            write_mode=write_mode,
+            drop_duplicates=drop_duplicates
         )
 
@@ -281,7 +285,8 @@ def nwm_retrospective_points(
         overwrite_output: Optional[bool] = False,
         domain: Optional[SupportedNWMRetroDomainsEnum] = "CONUS",
         timeseries_type: TimeseriesTypeEnum = "secondary",
-        write_mode: TableWriteEnum = "append"
+        write_mode: TableWriteEnum = "append",
+        drop_duplicates: bool = True,
     ):
         """Fetch NWM retrospective point data and load into the TEEHR dataset.
 
@@ -334,6 +339,8 @@
             that does not already exist.
             If "upsert", existing data will be replaced and new data that
             does not exist will be appended.
+        drop_duplicates : bool
+            Whether to drop duplicates in the data. Default is True.
 
 
         .. note::
 
@@ -427,7 +434,8 @@
                 self.nwm_cache_dir
             ),
             timeseries_type=timeseries_type,
-            write_mode=write_mode
+            write_mode=write_mode,
+            drop_duplicates=drop_duplicates
         )
 
@@ -443,7 +451,8 @@ def nwm_retrospective_grids(
         location_id_prefix: Optional[str] = None,
         timeseries_type: TimeseriesTypeEnum = "primary",
         write_mode: TableWriteEnum = "append",
-        zonal_weights_filepath: Optional[Union[Path, str]] = None
+        zonal_weights_filepath: Optional[Union[Path, str]] = None,
+        drop_duplicates: bool = True,
     ):
         """
         Fetch NWM retrospective gridded data, calculate zonal statistics (currently only
 
@@ -507,6 +516,8 @@
             The path to the zonal weights file. If None and calculate_zonal_weights
             is False, the weights file must exist in the cache for the configuration.
             Default is None.
+        drop_duplicates : bool
+            Whether to drop duplicates in the data. Default is True.
 
         Examples
         --------
 
@@ -622,7 +633,8 @@
                 self.nwm_cache_dir
             ),
             timeseries_type=timeseries_type,
-            write_mode=write_mode
+            write_mode=write_mode,
+            drop_duplicates=drop_duplicates
         )
 
@@ -644,7 +656,8 @@ def nwm_operational_points(
         timeseries_type: TimeseriesTypeEnum = "secondary",
         starting_z_hour: Optional[int] = None,
         ending_z_hour: Optional[int] = None,
-        write_mode: TableWriteEnum = "append"
+        write_mode: TableWriteEnum = "append",
+        drop_duplicates: bool = True,
     ):
         """Fetch operational NWM point data and load into the TEEHR dataset.
 
@@ -732,6 +745,8 @@
             that does not already exist.
             If "upsert", existing data will be replaced and new data that
             does not exist will be appended.
+        drop_duplicates : bool
+            Whether to drop duplicates in the data. Default is True.
 
 
         .. note::
 
@@ -863,7 +878,8 @@
                 self.nwm_cache_dir
             ),
             timeseries_type=timeseries_type,
-            write_mode=write_mode
+            write_mode=write_mode,
+            drop_duplicates=drop_duplicates
         )
 
@@ -887,6 +903,7 @@ def nwm_operational_grids(
         ending_z_hour: Optional[int] = None,
         write_mode: TableWriteEnum = "append",
         zonal_weights_filepath: Optional[Union[Path, str]] = None,
+        drop_duplicates: bool = True,
     ):
         """
         Fetch NWM operational gridded data, calculate zonal statistics (currently only
 
@@ -980,6 +997,8 @@
             The path to the zonal weights file. If None and calculate_zonal_weights
             is False, the weights file must exist in the cache for the configuration.
             Default is None.
+        drop_duplicates : bool
+            Whether to drop duplicates in the data. Default is True.
 
 
         .. note::
 
@@ -1135,5 +1154,6 @@
             ev=self.ev,
             in_path=Path(self.nwm_cache_dir),
             timeseries_type=timeseries_type,
-            write_mode=write_mode
+            write_mode=write_mode,
+            drop_duplicates=drop_duplicates
         )
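A hypothetical call showing the new flag end to end. Only write_mode and drop_duplicates come from this diff; the Evaluation constructor and the date arguments are assumptions for illustration.

```python
import teehr

ev = teehr.Evaluation(dir_path="/tmp/example_eval")  # assumed setup
ev.fetch.usgs_streamflow(
    start_date="2023-01-01",  # assumed fetch-window arguments
    end_date="2023-01-31",
    write_mode="append",      # from this diff: append/overwrite/upsert
    drop_duplicates=False,    # new: skip dropDuplicates() for known-clean data
)
```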
