Export rateacuity reports as json instead of csv

MeadBarrel · MeadBarrel · commit 991db4ccc683 · 2025-10-23T13:56:20.000Z
diff --git a/tariff_fetch/_cli/rateacuity.py b/tariff_fetch/_cli/rateacuity.py
@@ -1,7 +1,7 @@
+import json
 import os
 from pathlib import Path
 
-import polars as pl
 import questionary
 import tenacity
 from dotenv import load_dotenv
@@ -23,7 +23,7 @@ def process_rateacuity(output_folder: Path, state: str, utility: Utility):
 
     selected_utility = None
     tariffs_to_include = None
-    frames = []
+    results = []
 
     for attempt in tenacity.Retrying(
         stop=tenacity.stop_after_attempt(3), retry=tenacity.retry_if_exception_type(WebDriverException)
@@ -70,15 +70,12 @@ def process_rateacuity(output_folder: Path, state: str, utility: Utility):
                     tariff = tariffs_to_include.pop(0)
                     console.log(f"Fetching {tariff}")
                     scraping_state = scraping_state.select_schedule(tariff)
-                    df = scraping_state.as_dataframe()
-                    df = df.with_columns(pl.lit(tariff).alias("Schedule"))
-                    df = df.select(["Schedule", *[name for name in df.columns if name != "Schedule"]])
-                    frames.append(df)
+                    sections = scraping_state.as_sections()
+                    results.append({"schedule": tariff, "sections": sections})
                     scraping_state = scraping_state.back_to_selections()
 
     assert selected_utility
     suggested_filename = f"rateacuity_{selected_utility}"
-    filename = prompt_filename(output_folder, suggested_filename, "csv")
+    filename = prompt_filename(output_folder, suggested_filename, "json")
     filename.parent.mkdir(exist_ok=True)
-    combined_df = pl.concat(frames, how="diagonal_relaxed", rechunk=True) if frames else pl.DataFrame()
-    combined_df.write_csv(filename)
+    filename.write_text(json.dumps(results, indent=2))
diff --git a/tariff_fetch/rateacuity/report_tables.py b/tariff_fetch/rateacuity/report_tables.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from typing import TypedDict
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.remote.webelement import WebElement
+
+
+class TableJson(TypedDict):
+    title: str  # first header cell's text, reused as the table's title
+    columns: list[str]  # text of each ``thead th`` cell
+    values: list[dict[str, str]]  # one {column header: cell text} mapping per body row
+
+
+class SectionJson(TypedDict):
+    section: str | None  # <h3> text; None for tables found before the first heading
+    tables: list[TableJson]  # tables collected until the next <h3>
+
+
+def _headers_from_table(table: WebElement) -> list[str]:
+    """Return each ``thead th`` caption, preferring the text of its first link."""
+    headers = []
+    for cell in table.find_elements(By.CSS_SELECTOR, "thead th"):
+        anchors = cell.find_elements(By.TAG_NAME, "a")
+        headers.append(anchors[0].text if anchors else cell.text)
+    return headers
+
+
+def _rows_from_table(table: WebElement) -> list[list[str]]:
+    """Return the text of every ``td`` cell, as one inner list per ``tbody tr`` row."""
+    return [
+        [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
+        for tr in table.find_elements(By.CSS_SELECTOR, "tbody tr")
+    ]
+
+
+def _table_json(table: WebElement) -> TableJson | None:
+    """Convert one grid table into a ``TableJson``, or ``None`` if it has no header cells."""
+    columns = _headers_from_table(table)
+    if not columns:
+        return None
+
+    # zip() stops at the shorter side, so cells beyond the last header (or headers
+    # beyond the last cell) are left out of that row's mapping.
+    values = [
+        dict(zip(columns, cells, strict=False))
+        for cells in _rows_from_table(table)
+    ]
+
+    # The first header cell doubles as the table's title.
+    title = columns[0]
+    return TableJson(
+        {
+            "title": title,
+            "columns": columns,
+            "values": values,
+        }
+    )
+
+
+def sections_to_json(driver: WebDriver) -> list[SectionJson]:
+    """Walk the h3/grid-table matches in returned order, grouping tables under the latest heading."""
+    seq = driver.find_elements(By.CSS_SELECTOR, "h3, table.eamwebgrid-table")
+    sections = []
+    current = {"section": None, "tables": []}  # section stays None for tables preceding the first <h3>
+    for el in seq:
+        tag = el.tag_name.lower()
+        if tag == "h3":
+            if current["section"] is not None or current["tables"]:  # skip the empty initial placeholder
+                sections.append(current)
+            current = {"section": el.text, "tables": []}
+        else:
+            table = _table_json(el)
+            if table:
+                current["tables"].append(table)
+    # Flush whatever was gathered after the last heading.
+    if current["section"] is not None or current["tables"]:
+        sections.append(current)
+
+    return sections
diff --git a/tariff_fetch/rateacuity/state.py b/tariff_fetch/rateacuity/state.py
@@ -33,13 +33,16 @@
 from collections.abc import Sequence
 from pathlib import Path
 from time import sleep
+from typing import TypeVar
 
 import polars as pl
 from selenium.webdriver import Chrome
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import Select, WebDriverWait
 
+from tariff_fetch.rateacuity.report_tables import SectionJson, sections_to_json
+
 from .base import ScrapingContext, create_context, login
 
 logger = logging.getLogger(__name__)
@@ -66,6 +69,9 @@ def _wait(self) -> WebDriverWait:
         return WebDriverWait(self.driver, 10)
 
 
+S = TypeVar("S", bound=State)
+
+
 class LoginState(State):
     def login(self, username: str, password: str) -> PortalState:
         """Authenticate with RateAcuity and transition into the portal state."""
@@ -96,83 +102,108 @@ def _select_report(self, report: str):
             radio.click()
 
 
-class ElectricBenchmarkStateDropdown(State):
-    def _wait_for_element(self):
-        return self._wait().until(EC.presence_of_element_located((By.ID, "StateSelect")))
+class DropdownState(State):
+    """Shared behavior for dropdown-driven selections on the benchmark workflow."""
+
+    element_id: str  # DOM id of the <select>; set by each concrete subclass
+
+    def _dropdown(self):
+        """Wait for the ``<select>`` identified by ``element_id`` and return it."""
+        locator = (By.ID, self.element_id)
+        return self._wait().until(EC.presence_of_element_located(locator))
+
+    def _visible_options(self) -> list[str]:
+        """Return the raw (unstripped) text of every ``<option>``."""
+        return [option.text for option in self._dropdown().find_elements(By.TAG_NAME, "option")]
+
+    def _select(self, choice: str, *, category: str, next_state: S) -> S:
+        """Select ``choice`` (exact text, else whitespace-trimmed match) and return ``next_state``."""
+        raw_options = self._visible_options()
+        by_trimmed = {option.strip(): option for option in raw_options}
+        wanted = choice.strip()
+
+        if choice in raw_options:
+            visible_choice = choice
+        elif wanted in by_trimmed:
+            visible_choice = by_trimmed[wanted]
+        else:
+            raise ValueError(f"{category} {choice} is invalid. Available options are: {list(by_trimmed)}")
+
+        select = Select(self._dropdown())
+        current = select.first_selected_option.text.strip() if select.first_selected_option else None
+        if current != wanted:  # skip the click when the option is already selected
+            logger.info(f"Selecting {category.lower()} {wanted}")
+            select.select_by_visible_text(visible_choice)
+        return next_state
+
+
+class ElectricBenchmarkAllStateDropdown(DropdownState):
+    element_id = "StateSelect"  # id of the <select> that DropdownState._dropdown waits for
+
+    def get_states(self) -> list[str]:
+        return self._visible_options()
+
+    def select_state(self, state: str) -> ElectricBenchmarkAllUtilityDropdown:
+        return self._select(state, category="State", next_state=ElectricBenchmarkAllUtilityDropdown(self._context))
+
+
+class ElectricBenchmarkAllUtilityDropdown(ElectricBenchmarkAllStateDropdown):
+    element_id = "UtilitySelect"  # NOTE(review): inherited get_states()/select_state() also use this id — confirm
+
+    def get_utilities(self) -> list[str]:
+        return self._visible_options()
+
+    def select_utility(self, utility: str) -> ElectricBenchmarkAllScheduleDropdown:
+        return self._select(utility, category="Utility", next_state=ElectricBenchmarkAllScheduleDropdown(self._context))
+
+
+class ElectricBenchmarkAllScheduleDropdown(ElectricBenchmarkAllUtilityDropdown):
+    element_id = "ScheduleSelect"  # NOTE(review): inherited state/utility methods also use this id — confirm
+
+    def get_schedules(self) -> list[str]:
+        return self._visible_options()
+
+
+class ElectricBenchmarkStateDropdown(DropdownState):
+    element_id = "StateSelect"
 
     def get_states(self) -> list[str]:
         """Return all available states visible in the State dropdown."""
-        dropdown = self._wait_for_element()
-        options = dropdown.find_elements(By.TAG_NAME, "option")
-        return [_.text for _ in options]
+        return self._visible_options()
 
     def select_state(self, state: str) -> ElectricBenchmarkUtilityDropdown:
         """Select the provided state and transition to the utility dropdown."""
-        dropdown = self._wait_for_element()
-        options = [_.text.strip() for _ in dropdown.find_elements(By.TAG_NAME, "option")]
-        if state not in options:
-            raise ValueError(f"State {state} is invalid. Available options are: {options}")
-        select = Select(dropdown)
-        current = select.first_selected_option.text.strip() if select.first_selected_option else None
-        if current != state:
-            logger.info(f"Selecting state {state}")
-            select.select_by_visible_text(state)
-        return ElectricBenchmarkUtilityDropdown(self._context)
+        return self._select(state, category="State", next_state=ElectricBenchmarkUtilityDropdown(self._context))
 
 
 class ElectricBenchmarkUtilityDropdown(ElectricBenchmarkStateDropdown):
-    def _wait_for_element(self):
-        return self._wait().until(EC.presence_of_element_located((By.ID, "UtilitySelect")))
+    element_id = "UtilitySelect"
 
     def get_utilities(self) -> list[str]:
         """Return all available utilities for the previously chosen state."""
-        dropdown = self._wait_for_element()
-        options = dropdown.find_elements(By.TAG_NAME, "option")
-        return [_.text for _ in options]
+        return self._visible_options()
 
-    def select_utility(self, utility: str):
+    def select_utility(self, utility: str) -> ElectricBenchmarkScheduleDropdown:
         """Select the provided utility and expose the schedule dropdown."""
-        dropdown = self._wait_for_element()
-        options = [_.text.strip() for _ in dropdown.find_elements(By.TAG_NAME, "option")]
-        if utility not in options:
-            raise ValueError(f"Utility {utility} is invalid. Available options are: {options}")
-        select = Select(dropdown)
-        current = select.first_selected_option.text.strip() if select.first_selected_option else None
-        if current != utility:
-            logger.info(f"Selecting utility {utility}")
-            select.select_by_visible_text(utility)
-        return ElectricBenchmarkScheduleDropdown(self._context)
+        return self._select(utility, category="Utility", next_state=ElectricBenchmarkScheduleDropdown(self._context))
 
 
 class ElectricBenchmarkScheduleDropdown(ElectricBenchmarkUtilityDropdown):
-    def _wait_for_element(self):
-        return self._wait().until(EC.presence_of_element_located((By.ID, "ScheduleSelect")))
+    element_id = "ScheduleSelect"
 
     def get_schedules(self) -> list[str]:
         """Return all schedules associated with the selected utility."""
-        dropdown = self._wait_for_element()
-        options = dropdown.find_elements(By.TAG_NAME, "option")
-        return [_.text for _ in options]
+        return self._visible_options()
 
-    def select_schedule(self, schedule: str):
+    def select_schedule(self, schedule: str) -> ElectricBenchmarkReport:
         """Select a schedule and produce a report interface that can fetch data."""
-        dropdown = self._wait_for_element()
-        options = [_.text.strip() for _ in dropdown.find_elements(By.TAG_NAME, "option")]
-        if schedule not in options:
-            raise ValueError(f"Schedule {schedule} is invalid. Available options are: {options}")
-        select = Select(dropdown)
-        current = select.first_selected_option.text.strip() if select.first_selected_option else None
-        if current != schedule:
-            logger.info(f"Selecting schedule {schedule}")
-            select.select_by_visible_text(schedule)
-        return ElectricBenchmarkReport(self._context)
+        return self._select(schedule, category="Schedule", next_state=ElectricBenchmarkReport(self._context))
 
 
-class ElectricBenchmarkReport(State):
-    def back_to_selections(self) -> ElectricBenchmarkScheduleDropdown:
-        """Return to the selections page so additional schedules can be fetched."""
+class ReportState(State):
+    def _back_to_selections(self, state: S) -> S:
         self._wait().until(EC.presence_of_element_located((By.LINK_TEXT, "Back To Selections"))).click()
-        return ElectricBenchmarkScheduleDropdown(self._context)
+        return state
 
     def download_excel(self, timeout: int = 20) -> Path:
         """Trigger the report download and return the path once it appears."""
@@ -188,6 +219,9 @@ def download_excel(self, timeout: int = 20) -> Path:
         filename = next(iter(_get_xlsx(download_path) ^ initial_state))
         return Path(download_path, filename)
 
+    def as_sections(self) -> list[SectionJson]:
+        return sections_to_json(self._context.driver)
+
     def as_dataframe(self, timeout: int = 20) -> pl.DataFrame:
         """Convert a freshly downloaded Excel report into a cleaned Polars dataframe."""
         filepath = self.download_excel(timeout)
@@ -206,6 +240,12 @@ def as_dataframe(self, timeout: int = 20) -> pl.DataFrame:
         return df
 
 
+class ElectricBenchmarkReport(ReportState):
+    def back_to_selections(self) -> ElectricBenchmarkScheduleDropdown:
+        """Return to the selections page so additional schedules can be fetched."""
+        return self._back_to_selections(ElectricBenchmarkScheduleDropdown(self._context))
+
+
 def _get_xlsx(folder) -> set[str]:
     """Return the set of .xlsx filenames currently present in the provided folder."""
     return {_ for _ in os.listdir(folder) if _.endswith(".xlsx")}