
Commit 3136942

Ken Lippold (kjlippold) authored
367 data aggregation (#26)
* Added ETL data aggregation step * Updated ETL README to include aggregation step --------- Co-authored-by: Ken Lippold <klippold@Kens-MacBook-Pro-2.local>
1 parent bf52df3 commit 3136942

File tree

6 files changed: +1067 -62 lines changed

src/hydroserverpy/etl/README.md

Lines changed: 76 additions & 5 deletions
@@ -115,8 +115,8 @@ transformer = CSVTransformer(
 
 | `timezone_type` | Behaviour | Requires |
 |---|---|---|
-| `"utc"` (default) | Treats all timestamps as UTC. ||
-| `"embedded"` | Reads timezone offset from the timestamp string itself. Falls back to UTC if the timestamps are naive. ||
+| `None` (default) | Reads timezone offset from the timestamp string itself. Falls back to UTC if the timestamps are naive. ||
+| `"utc"` | Treats all timestamps as UTC. ||
 | `"offset"` | Treats timestamps as naive and applies a fixed UTC offset. Strips any embedded offset if present. | `timezone` in `±HHMM` or `±HH:MM` format |
 | `"iana"` | Treats timestamps as naive and applies a named IANA timezone. Strips any embedded offset if present. | `timezone` as a valid IANA name |
 
@@ -135,10 +135,10 @@ transformer = CSVTransformer(
     timezone="America/Denver",
 )
 
-# Embedded offset — timestamps include their own offset, e.g. "2024-01-15T08:30:00-07:00"
+# Embedded offsets — timestamps include their own offset, e.g. "2024-01-15T08:30:00-07:00"
+# Omit timezone_type (or set it to None) to read offsets from the timestamps directly.
 transformer = CSVTransformer(
     timestamp_key="datetime",
-    timezone_type="embedded",
 )
 ```
 
@@ -209,6 +209,77 @@ ETLTargetPath(
 
 Operations are applied in order. The output of each operation becomes the input of the next.
 
+### Temporal Aggregation
+
+Temporal aggregation is an optional step that reduces the per-observation DataFrame produced by the transformer into period-level summaries before loading. When configured, the same aggregation is applied uniformly to every target series in the pipeline.
+
+```python
+from hydroserverpy.etl.models import TemporalAggregation
+
+aggregation = TemporalAggregation(
+    aggregation_statistic="simple_mean",
+    aggregation_interval=1,
+    aggregation_interval_unit="day",
+)
+```
+
+Pass it to the transformer at construction time:
+
+```python
+transformer = CSVTransformer(
+    timestamp_key="datetime",
+    temporal_aggregation=aggregation,
+)
+```
+
+#### Aggregation statistic
+
+| `aggregation_statistic` | Behaviour |
+|---|---|
+| `"simple_mean"` | Arithmetic mean of all observations within the window. |
+| `"time_weighted_mean"` | Mean weighted by the time between observations, computed via trapezoidal integration. Values at window boundaries are estimated by linear interpolation from the nearest surrounding observations. |
+| `"last_value_of_period"` | The last observation within the window. |
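To make the difference between the statistics concrete, here is a small worked example in plain Python (an editor's sketch of the arithmetic, not the library's implementation):

```python
from datetime import datetime, timedelta

# Three observations, unevenly spaced across a 1-day window starting at t0.
t0 = datetime(2024, 1, 15)
obs = [(t0, 10.0), (t0 + timedelta(hours=1), 10.0), (t0 + timedelta(hours=23), 30.0)]

# simple_mean: every observation counts equally.
simple_mean = sum(v for _, v in obs) / len(obs)

# time_weighted_mean: trapezoidal integration over [t0, t0 + 1 day); the
# last value is carried flat to the window end (flat extrapolation).
points = obs + [(t0 + timedelta(days=1), obs[-1][1])]
area = sum(
    (v0 + v1) * 0.5 * (t1 - ta).total_seconds()
    for (ta, v0), (t1, v1) in zip(points, points[1:])
)
time_weighted_mean = area / timedelta(days=1).total_seconds()

# last_value_of_period: just the final observation in the window.
last_value = obs[-1][1]

print(simple_mean, time_weighted_mean, last_value)
```

With this uneven spacing the simple mean over-weights the two early readings (about 16.67), while the time-weighted mean integrates over the whole day and lands at 20.0, and the last value is 30.0.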
+
+#### Aggregation interval
+
+`aggregation_interval` (integer, default `1`) and `aggregation_interval_unit` (currently `"day"`) together define the window width. An `aggregation_interval` of `3` with unit `"day"` produces 3-day windows.
+
+#### Timezone
+
+Window boundaries are aligned to local midnight in the configured timezone. The timezone fields follow the same conventions as the transformer timestamp configuration, with `None` (the default) falling back to UTC-day boundaries.
+
+| `timezone_type` | Window boundary alignment | Requires |
+|---|---|---|
+| `None` (default) | UTC midnight ||
+| `"utc"` | UTC midnight ||
+| `"offset"` | Local midnight at a fixed UTC offset | `timezone` in `±HHMM` or `±HH:MM` format |
+| `"iana"` | Local midnight in a named timezone, handling DST automatically | `timezone` as a valid IANA name |
+
+```python
+# Daily windows aligned to US Mountain Time (UTC-7, DST-aware)
+aggregation = TemporalAggregation(
+    aggregation_statistic="simple_mean",
+    aggregation_interval=1,
+    aggregation_interval_unit="day",
+    timezone_type="iana",
+    timezone="America/Denver",
+)
+
+# Daily windows at a fixed offset (no DST adjustment)
+aggregation = TemporalAggregation(
+    aggregation_statistic="time_weighted_mean",
+    aggregation_interval=1,
+    aggregation_interval_unit="day",
+    timezone_type="offset",
+    timezone="-0700",
+)
+```
+
+**Window boundary semantics:** Windows run from local midnight of the day containing the first observation up to local midnight of the day containing the last observation. That upper boundary is exclusive, so observations on the final local day are not aggregated. Ensure your source data extends at least one day past the last period you want included, i.e. that the last observation falls on the day following the final window.
+
+Days with no observations are omitted from the output rather than filled with null values.
+
+
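A minimal sketch of this boundary rule, using plain `datetime` and UTC-day windows (hypothetical data, illustration only):

```python
from datetime import datetime, timedelta

# Hypothetical observations from Jan 1 00:00 through Jan 4 06:00 UTC.
first = datetime(2024, 1, 1, 0, 0)
last = datetime(2024, 1, 4, 6, 0)

# Daily windows from the midnight of the first observation's day up to
# (but excluding) the midnight of the last observation's day.
start = datetime.combine(first.date(), datetime.min.time())
end = datetime.combine(last.date(), datetime.min.time())

windows = []
cur = start
while cur < end:
    windows.append((cur, cur + timedelta(days=1)))
    cur += timedelta(days=1)

print([ws.date().isoformat() for ws, _ in windows])
# → ['2024-01-01', '2024-01-02', '2024-01-03']
```

Jan 1, 2, and 3 are aggregated; the Jan 4 observation only marks the exclusive upper boundary, so that day produces no output row.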
 ### Loader
 
 ```python
@@ -349,4 +420,4 @@ for target_id, target in context.results.target_results.items():
 | Error | Likely cause |
 |---|---|
 | `Missing datastream IDs: ...` | One or more target datastream UUIDs don't exist on the HydroServer instance |
-| `HydroServer loader failed to retrieve datastream` | A network or authentication error occurred while looking up a datastream |
+| `HydroServer loader failed to retrieve datastream` | A network or authentication error occurred while looking up a datastream |
Lines changed: 261 additions & 0 deletions
@@ -0,0 +1,261 @@

```python
import math
import pandas as pd
from typing import Literal, Optional
from bisect import bisect_left
from datetime import datetime, time, timedelta, timezone as dt_timezone
from .timestamp import Timezone


AggregationStatistic = Literal["simple_mean", "time_weighted_mean", "last_value_of_period"]
AggregationIntervalUnit = Literal["day"]


class TemporalAggregation(Timezone):
    aggregation_statistic: AggregationStatistic
    aggregation_interval: int = 1
    aggregation_interval_unit: AggregationIntervalUnit = "day"

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Aggregate all non-timestamp columns in df over fixed-duration day windows.

        The df must have a UTC-normalized 'timestamp' column. The returned DataFrame
        has one row per window with the window-start UTC timestamp as the
        'timestamp' column. Windows where every target column has no observations
        are dropped entirely.
        """

        target_columns = [col for col in df.columns if col != "timestamp"]

        if df.empty:
            return pd.DataFrame(columns=["timestamp"] + target_columns)

        timestamps: list[datetime] = df["timestamp"].dt.to_pydatetime().tolist()
        start_utc = timestamps[0]
        end_utc = timestamps[-1]

        result: dict[str, list] = {"timestamp": [], **{col: [] for col in target_columns}}

        for ws, we in self._iter_windows(start_utc, end_utc):
            row = {
                col: self._aggregate_window(
                    timestamps,
                    pd.to_numeric(df[col], errors="coerce").tolist(),
                    ws,
                    we,
                )
                for col in target_columns
            }
            if all(v is None for v in row.values()):
                continue
            result["timestamp"].append(ws)
            for col, val in row.items():
                result[col].append(val)

        out = pd.DataFrame(result)
        out["timestamp"] = pd.to_datetime(out["timestamp"], utc=True)

        return out

    def _effective_tz(self):
        """
        Return the tzinfo to use for window boundaries, defaulting to UTC.
        """

        return self.tz or dt_timezone.utc

    def _interval_delta(self) -> timedelta:
        """
        Return the timedelta for the configured aggregation interval.
        """

        if self.aggregation_interval_unit == "day":
            return timedelta(days=self.aggregation_interval)

        raise NotImplementedError(
            f"Invalid temporal aggregation configuration. "
            f"Received unsupported aggregation interval unit: {self.aggregation_interval_unit!r}"
        )

    def _window_start(self, ts_utc: datetime) -> datetime:
        """
        Return the local datetime aligned to the start of the window containing ts_utc.
        """

        tz = self._effective_tz()
        local = ts_utc.astimezone(tz)

        if self.aggregation_interval_unit == "day":
            return datetime.combine(local.date(), time.min, tzinfo=tz)

        raise NotImplementedError(
            f"Invalid temporal aggregation configuration. "
            f"Received unsupported aggregation interval unit: {self.aggregation_interval_unit!r}"
        )

    def _next_window_start(self, current: datetime) -> datetime:
        """
        Return the local datetime of the next window boundary after current.
        """

        tz = self._effective_tz()

        if self.aggregation_interval_unit == "day":
            next_date = current.date() + timedelta(days=self.aggregation_interval)
            return datetime.combine(next_date, time.min, tzinfo=tz)

        raise NotImplementedError(
            f"Invalid temporal aggregation configuration. "
            f"Received unsupported aggregation interval unit: {self.aggregation_interval_unit!r}"
        )

    def _iter_windows(self, start_utc: datetime, end_utc: datetime):
        """
        Yield (window_start_utc, window_end_utc) pairs covering [start_utc, end_utc).

        Windows are aligned to unit boundaries in local time (e.g. midnight for days)
        and stepped using _next_window_start to handle DST transitions correctly.
        """

        current_local = self._window_start(start_utc)
        end_local = self._window_start(end_utc)

        while current_local < end_local:
            next_local = self._next_window_start(current_local)
            yield current_local.astimezone(dt_timezone.utc), next_local.astimezone(dt_timezone.utc)
            current_local = next_local

    @staticmethod
    def _boundary_value(
        target: datetime,
        timestamps: list[datetime],
        values: list[float],
        prev_idx: Optional[int],
        next_idx: Optional[int],
    ) -> Optional[float]:
        """
        Estimate the value at a window boundary by exact match or linear interpolation.

        If the observation immediately before (prev_idx) or after (next_idx) the boundary
        falls exactly on the target timestamp, that value is returned directly. Otherwise,
        if observations exist on both sides, the value is linearly interpolated. If only
        one side is available, that side's value is used as a flat extrapolation.

        Returns None if no usable observations are available on either side.
        """

        prev = None
        nxt = None

        if prev_idx is not None and 0 <= prev_idx < len(timestamps):
            prev = (timestamps[prev_idx], values[prev_idx])
        if next_idx is not None and 0 <= next_idx < len(timestamps):
            nxt = (timestamps[next_idx], values[next_idx])

        if prev is not None and prev[0] == target:
            return prev[1]
        if nxt is not None and nxt[0] == target:
            return nxt[1]

        if prev is not None and nxt is not None:
            t0, v0 = prev
            t1, v1 = nxt
            span = (t1 - t0).total_seconds()
            if span <= 0:
                return v1
            ratio = (target - t0).total_seconds() / span
            return v0 + ratio * (v1 - v0)

        if prev is not None:
            return prev[1]
        if nxt is not None:
            return nxt[1]

        return None

    def _aggregate_window(
        self,
        timestamps: list[datetime],
        values: list[float],
        window_start: datetime,
        window_end: datetime,
    ) -> Optional[float]:
        """
        Compute the configured aggregation statistic for a single window.

        Observations are selected from timestamps in [window_start, window_end) using
        binary search. Returns None if no observations fall within the window.

        For simple_mean: returns the arithmetic mean of all observations in the window.
        For last_value_of_period: returns the last observation in the window.
        For time_weighted_mean: computes a time-weighted mean via trapezoidal integration
        over the full window duration. Boundary values at window_start and window_end
        are estimated by _boundary_value if no observation falls exactly on those
        timestamps. Returns None if either boundary value cannot be determined.
        """

        if not timestamps or window_end <= window_start:
            return None

        left = bisect_left(timestamps, window_start)
        right = bisect_left(timestamps, window_end)

        if left == right:
            return None

        window_values = values[left:right]

        if self.aggregation_statistic == "simple_mean":
            return sum(window_values) / len(window_values)

        if self.aggregation_statistic == "last_value_of_period":
            return window_values[-1]

        # time_weighted_mean: trapezoidal integration over the window.
        start_value = self._boundary_value(
            target=window_start,
            timestamps=timestamps,
            values=values,
            prev_idx=(left - 1) if left > 0 else None,
            next_idx=left,
        )
        end_value = self._boundary_value(
            target=window_end,
            timestamps=timestamps,
            values=values,
            prev_idx=(right - 1) if right > 0 else None,
            next_idx=right if right < len(timestamps) else None,
        )

        if start_value is None or end_value is None:
            return None

        area_points: list[tuple[datetime, float]] = [(window_start, start_value)]
        for idx in range(left, right):
            ts = timestamps[idx]
            val = values[idx]
            if ts == window_start:
                area_points[0] = (ts, val)
                continue
            area_points.append((ts, val))

        if area_points[-1][0] == window_end:
            area_points[-1] = (window_end, end_value)
        else:
            area_points.append((window_end, end_value))

        total_area = 0.0
        for idx in range(1, len(area_points)):
            t0, v0 = area_points[idx - 1]
            t1, v1 = area_points[idx]
            span = (t1 - t0).total_seconds()
            if span > 0:
                total_area += (v0 + v1) * 0.5 * span

        duration = (window_end - window_start).total_seconds()
        if duration <= 0:
            return None

        result = total_area / duration

        return None if (math.isnan(result) or math.isinf(result)) else result
```
