Skip to content

Commit 2ae818f

Browse files
committed
Add better logging
1 parent 26b5dc5 commit 2ae818f

File tree

13 files changed

+497
-140
lines changed

13 files changed

+497
-140
lines changed

src/hydroserverpy/api/models/etl/task.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from datetime import datetime, timedelta, timezone
99
from pydantic import Field, AliasPath, AliasChoices, TypeAdapter
1010
from hydroserverpy.etl.factories import extractor_factory, transformer_factory, loader_factory
11+
from hydroserverpy.etl.loaders.hydroserver_loader import LoadSummary
1112
from hydroserverpy.etl.etl_configuration import ExtractorConfig, TransformerConfig, LoaderConfig, SourceTargetMapping, MappingPath
1213
from ..base import HydroServerBaseModel
1314
from .orchestration_system import OrchestrationSystem
@@ -222,11 +223,11 @@ def run_local(self):
222223
return
223224

224225
logging.info("Starting load")
225-
load_stats = loader_cls.load(data, self)
226+
load_summary = loader_cls.load(data, self)
226227
self._update_status(
227228
task_run,
228229
True,
229-
self._success_message(load_stats),
230+
self._success_message(load_summary),
230231
runtime_source_uri=runtime_source_uri,
231232
)
232233
except Exception as e:
@@ -271,35 +272,26 @@ def _update_status(
271272
self.save()
272273

273274
@staticmethod
274-
def _success_message(load_stats: Optional[dict]) -> str:
275-
if not isinstance(load_stats, dict):
276-
return "OK"
277-
278-
loaded = load_stats.get("observations_loaded")
279-
datastreams_loaded = load_stats.get("datastreams_loaded")
280-
available = load_stats.get("observations_available")
281-
timestamps_total = load_stats.get("timestamps_total")
282-
timestamps_after_cutoff = load_stats.get("timestamps_after_cutoff")
283-
284-
if loaded is None:
275+
def _success_message(load: Optional[LoadSummary]) -> str:
276+
if not load:
285277
return "OK"
286278

279+
loaded = load.observations_loaded
287280
if loaded == 0:
288-
if timestamps_total and timestamps_after_cutoff == 0:
289-
cutoff = load_stats.get("cutoff")
290-
if cutoff:
281+
if load.timestamps_total and load.timestamps_after_cutoff == 0:
282+
if load.cutoff:
291283
return (
292284
"No new observations to load "
293-
f"(all timestamps were at or before {cutoff})."
285+
f"(all timestamps were at or before {load.cutoff})."
294286
)
295287
return "No new observations to load (all timestamps were at or before the cutoff)."
296-
if available == 0:
288+
if load.observations_available == 0:
297289
return "No new observations to load."
298290
return "No new observations were loaded."
299291

300-
if datastreams_loaded is not None:
292+
if load.datastreams_loaded:
301293
return (
302-
f"Load completed successfully ({loaded} rows across {datastreams_loaded} datastreams)."
294+
f"Load completed successfully ({loaded} rows across {load.datastreams_loaded} datastreams)."
303295
)
304296
return f"Load completed successfully ({loaded} rows loaded)."
305297

Lines changed: 55 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,65 @@
1-
## Possible error states:
2-
3-
Config file validation
4-
Tell the user exactly which configuration variables are missing or invalid
5-
Could not connect to the source system.
6-
The source system did not respond before the timeout.
7-
Authentication with the source system failed; credentials may be invalid or expired.
8-
The requested payload was not found on the source system.
9-
The source system returned no data.
10-
11-
The source returned a format different from what this job expects.
12-
The payload’s expected fields were not found.
13-
For CSV:
1+
## Possible Needs Attention states:
2+
3+
These are the most important end-user messages the ETL system can return for a task run
4+
that requires user action.
5+
6+
### Configuration / Setup
7+
8+
- Invalid extractor configuration. Tell the user exactly which field is missing or invalid.
9+
- Invalid transformer configuration. Tell the user exactly which field is missing or invalid.
10+
- A required configuration value is missing.
11+
- A required configuration value is null where a value is expected.
12+
- Missing required per-task extractor variable "<name>".
13+
- Extractor source URI contains a placeholder "<name>", but it was not provided.
14+
- Task configuration is missing required daylight savings offset (when using daylightSavings mode).
15+
16+
### Data Source (Connectivity / Authentication)
17+
18+
- Could not connect to the source system.
19+
- The source system did not respond before the timeout.
20+
- Authentication with the source system failed; credentials may be invalid or expired.
21+
- The requested payload was not found on the source system.
22+
- The source system returned no data.
1423

24+
### Source Data Did Not Match The Task
25+
26+
- The source returned a format different from what this job expects.
27+
- The payload's expected fields were not found.
28+
- One or more timestamps could not be read with the current settings.
29+
- This job references a resource that no longer exists.
30+
- The file structure does not match the configuration.
31+
32+
For CSV:
1533
- The header row contained unexpected values and could not be processed.
1634
- One or more data rows contained unexpected values and could not be processed.
35+
- Timestamp column "<key>" was not found in the extracted data.
36+
- A mapping source index is out of range for the extracted data.
37+
- A mapping source column was not found in the extracted data.
1738

1839
For JSON:
40+
- The timestamp or value key could not be found with the specified query.
41+
- Transformer did not receive any extracted data to parse.
42+
43+
### Targets / HydroServer
44+
45+
- HydroServer rejected some or all of the data.
46+
- The target data series (datastream) could not be found.
47+
- This may happen if the datastream was deleted or the mapping points to the wrong target.
1948

20-
- The timestamp or value key couldn’t be found with the specified JMESPath query
49+
### Unexpected System Error
2150

22-
This job references a resource that no longer exists.
23-
The file structure does not match the configuration.
51+
- An internal system error occurred while processing the job.
52+
- The job stopped before completion.
2453

25-
HydroServer rejected some or all of the data.
26-
The target datastream could not be found.
27-
An internal system error occurred while processing the job.
28-
The job stopped before completion.
54+
## Possible OK states:
2955

30-
## Possible warning states:
56+
These are the most important end-user messages the ETL system can return for a successful task run.
3157

32-
## Possible success states:
58+
- Load completed successfully.
59+
- Load completed successfully (<n> rows loaded).
60+
- Load completed successfully (<n> rows across <m> datastreams).
61+
- No new observations to load.
62+
- No new observations were loaded.
63+
- No new observations to load (all timestamps were at or before <cutoff>).
64+
- No data returned from the extractor. Nothing to load.
65+
- Transform produced no rows. Nothing to load.

src/hydroserverpy/etl/etl_configuration.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Annotated, Dict, List, Literal, Optional, Union
33
from pydantic import BaseModel, Field, field_validator
44
from enum import Enum
5+
from zoneinfo import ZoneInfo
56

67
WorkflowType = Literal["ETL", "Aggregation", "Virtual", "SDL"]
78
CSVDelimiterType = Literal[",", "|", "\t", ";", " "]
@@ -76,12 +77,28 @@ class Timestamp(BaseModel):
7677

7778
class Config:
7879
populate_by_name = True
80+
validate_default = True
7981

80-
@field_validator("timezone")
82+
@field_validator("timezone", mode="after")
8183
def check_timezone(cls, timezone_value, info):
8284
mode = info.data.get("timezone_mode")
83-
if mode == TimezoneMode.fixedOffset and timezone_value is None:
84-
raise ValueError("`timezone` must be set when timezoneMode is fixedOffset")
85+
if mode == TimezoneMode.fixedOffset:
86+
if timezone_value is None:
87+
raise ValueError(
88+
"`timezone` must be set when timezoneMode is fixedOffset (e.g. '-0700')"
89+
)
90+
if mode == TimezoneMode.daylightSavings:
91+
if timezone_value is None or str(timezone_value).strip() == "":
92+
raise ValueError(
93+
"Task configuration is missing required daylight savings offset (when using daylightSavings mode)."
94+
)
95+
# Validate it's a real IANA tz name early to avoid cryptic ZoneInfo errors later.
96+
try:
97+
ZoneInfo(str(timezone_value))
98+
except Exception:
99+
raise ValueError(
100+
f"Invalid timezone {timezone_value!r}. Use an IANA timezone like 'America/Denver'."
101+
)
85102
return timezone_value
86103

87104

src/hydroserverpy/etl/extractors/base.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
from datetime import datetime
55
from ..etl_configuration import ExtractorConfig, Task
66
from ..timestamp_parser import TimestampParser
7+
from ..logging_utils import redact_url, summarize_list
8+
9+
10+
logger = logging.getLogger(__name__)
711

812

913
class Extractor:
@@ -12,21 +16,32 @@ def __init__(self, extractor_config: ExtractorConfig):
1216
self.runtime_source_uri = None
1317

1418
def resolve_placeholder_variables(self, task: Task, loader):
15-
logging.info("Resolving extractor runtime variables...")
19+
placeholders = list(self.cfg.placeholder_variables or [])
1620
filled = {}
17-
for placeholder in self.cfg.placeholder_variables:
21+
runtime_names: set[str] = set()
22+
task_names: set[str] = set()
23+
for placeholder in placeholders:
1824
name = placeholder.name
1925

2026
if placeholder.type == "runTime":
21-
logging.info("Resolving runtime var: %s", name)
27+
logger.debug("Resolving runtime var: %s", name)
28+
runtime_names.add(name)
2229
if placeholder.run_time_value == "latestObservationTimestamp":
2330
value = loader.earliest_begin_date(task)
2431
elif placeholder.run_time_value == "jobExecutionTime":
2532
value = pd.Timestamp.now(tz="UTC")
2633
elif placeholder.type == "perTask":
27-
logging.info("Resolving task var: %s", name)
34+
logger.debug("Resolving task var: %s", name)
35+
task_names.add(name)
2836
if name not in task.extractor_variables:
29-
raise KeyError(f"Missing per-task variable '{name}'")
37+
logger.error(
38+
"Missing per-task extractor variable '%s'. Provided extractorVariables keys=%s",
39+
name,
40+
summarize_list(sorted((task.extractor_variables or {}).keys())),
41+
)
42+
raise ValueError(
43+
f"Missing required per-task extractor variable '{name}'."
44+
)
3045
value = task.extractor_variables[name]
3146
else:
3247
continue
@@ -36,21 +51,39 @@ def resolve_placeholder_variables(self, task: Task, loader):
3651
value = parser.utc_to_string(value)
3752

3853
filled[name] = value
54+
55+
if runtime_names:
56+
names = ", ".join(sorted(runtime_names))
57+
logger.debug(
58+
"Runtime variables resolved (%s): %s", len(runtime_names), names
59+
)
60+
if task_names:
61+
names = ", ".join(sorted(task_names))
62+
logger.debug("Task variables resolved (%s): %s", len(task_names), names)
63+
3964
if not filled:
4065
uri = self.cfg.source_uri
4166
else:
4267
uri = self.format_uri(filled)
4368

4469
self.runtime_source_uri = uri
45-
logging.info("Resolved runtime source URI: %s", uri)
70+
# Keep a stable log prefix for downstream parsing, but redact secrets.
71+
logger.info("Resolved runtime source URI: %s", redact_url(uri))
4672
return uri
4773

4874
def format_uri(self, placeholder_variables):
4975
try:
5076
uri = self.cfg.source_uri.format(**placeholder_variables)
5177
except KeyError as e:
5278
missing_key = e.args[0]
53-
raise KeyError(f"Missing placeholder variable: {missing_key}")
79+
logger.error(
80+
"Failed to format sourceUri: missing placeholder '%s'. Provided placeholders=%s",
81+
missing_key,
82+
summarize_list(sorted(placeholder_variables.keys())),
83+
)
84+
raise ValueError(
85+
f"Extractor source URI contains a placeholder '{missing_key}', but it was not provided."
86+
)
5487
return uri
5588

5689
@abstractmethod

src/hydroserverpy/etl/extractors/ftp_extractor.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
import logging
2-
from ftplib import FTP
2+
from ftplib import FTP, error_perm
33
from io import BytesIO
44
from typing import Dict
55

66
from .base import Extractor
77
from ..types import TimeRange
88

99

10+
logger = logging.getLogger(__name__)
11+
12+
1013
class FTPExtractor(Extractor):
1114
def __init__(
1215
self,
@@ -33,18 +36,33 @@ def extract(self):
3336
try:
3437
ftp.connect(self.host, self.port)
3538
ftp.login(user=self.username, passwd=self.password)
36-
logging.info(f"Connected to FTP server: {self.host}:{self.port}")
39+
logger.debug("Connected to FTP server %s:%s", self.host, self.port)
3740

3841
data = BytesIO()
3942
ftp.retrbinary(f"RETR {self.filepath}", data.write)
40-
logging.info(
41-
f"Successfully downloaded file '{self.filepath}' from FTP server."
43+
logger.debug(
44+
"Successfully downloaded file %r from FTP server.",
45+
self.filepath,
4246
)
4347
data.seek(0)
48+
if data.getbuffer().nbytes == 0:
49+
raise ValueError("The source system returned no data.")
4450
return data
51+
except error_perm as e:
52+
msg = str(e)
53+
# Common FTP status codes:
54+
# 530 = not logged in / auth failure
55+
# 550 = file unavailable
56+
if msg.startswith("530"):
57+
raise ValueError(
58+
"Authentication with the source system failed; credentials may be invalid or expired."
59+
) from e
60+
if msg.startswith("550"):
61+
raise ValueError("The requested payload was not found on the source system.") from e
62+
raise ValueError("The source system returned an error.") from e
4563
except Exception as e:
46-
logging.error(f"Error retrieving file from FTP server: {e}")
47-
return None
64+
logger.error("Error retrieving file from FTP server: %s", e, exc_info=True)
65+
raise ValueError("Could not connect to the source system.") from e
4866
finally:
4967
if ftp:
5068
ftp.quit()

0 commit comments

Comments
 (0)