Skip to content

Commit 991db4c

Browse files
committed
Export rateacuity reports as json instead of csv
1 parent 289d327 commit 991db4c

File tree

3 files changed

+179
-61
lines changed

3 files changed

+179
-61
lines changed

tariff_fetch/_cli/rateacuity.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1+
import json
12
import os
23
from pathlib import Path
34

4-
import polars as pl
55
import questionary
66
import tenacity
77
from dotenv import load_dotenv
@@ -23,7 +23,7 @@ def process_rateacuity(output_folder: Path, state: str, utility: Utility):
2323

2424
selected_utility = None
2525
tariffs_to_include = None
26-
frames = []
26+
results = []
2727

2828
for attempt in tenacity.Retrying(
2929
stop=tenacity.stop_after_attempt(3), retry=tenacity.retry_if_exception_type(WebDriverException)
@@ -70,15 +70,12 @@ def process_rateacuity(output_folder: Path, state: str, utility: Utility):
7070
tariff = tariffs_to_include.pop(0)
7171
console.log(f"Fetching {tariff}")
7272
scraping_state = scraping_state.select_schedule(tariff)
73-
df = scraping_state.as_dataframe()
74-
df = df.with_columns(pl.lit(tariff).alias("Schedule"))
75-
df = df.select(["Schedule", *[name for name in df.columns if name != "Schedule"]])
76-
frames.append(df)
73+
sections = scraping_state.as_sections()
74+
results.append({"schedule": tariff, "sections": sections})
7775
scraping_state = scraping_state.back_to_selections()
7876

7977
assert selected_utility
8078
suggested_filename = f"rateacuity_{selected_utility}"
81-
filename = prompt_filename(output_folder, suggested_filename, "csv")
79+
filename = prompt_filename(output_folder, suggested_filename, "json")
8280
filename.parent.mkdir(exist_ok=True)
83-
combined_df = pl.concat(frames, how="diagonal_relaxed", rechunk=True) if frames else pl.DataFrame()
84-
combined_df.write_csv(filename)
81+
filename.write_text(json.dumps(results, indent=2))
tariff_fetch/rateacuity/report_tables.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from __future__ import annotations
2+
3+
from typing import TypedDict
4+
5+
from selenium.webdriver.common.by import By
6+
from selenium.webdriver.remote.webdriver import WebDriver
7+
from selenium.webdriver.remote.webelement import WebElement
8+
9+
10+
class TableJson(TypedDict):
    """JSON-serializable representation of one report table."""

    # Title of the table; `_table_json` fills this with the first column header.
    title: str
    # Header texts in the order they were read from the table head.
    columns: list[str]
    # One mapping per body row, keyed by column header.
    values: list[dict[str, str]]
14+
15+
16+
class SectionJson(TypedDict):
    """JSON-serializable representation of one report section and its tables."""

    # Section name; `sections_to_json` takes it from the text of an <h3> element.
    section: str
    # Tables collected for this section, in the order they were encountered.
    tables: list[TableJson]
19+
20+
21+
def _headers_from_table(table: WebElement) -> list[str]:
    """Return the header text of every ``thead th`` cell in ``table``.

    When a header cell contains one or more ``<a>`` elements, the text of the
    first link is used; otherwise the cell's own text is used.
    """
    headers = []
    for th in table.find_elements(By.CSS_SELECTOR, "thead th"):
        links = th.find_elements(By.TAG_NAME, "a")
        headers.append(links[0].text if links else th.text)
    return headers
28+
29+
30+
def _rows_from_table(table: WebElement) -> list[list[str]]:
    """Return the cell texts of every ``tbody tr`` row in ``table``.

    Each row is a list with the text of its ``td`` cells, so the result is a
    list of lists (the previous ``list[str]`` annotation did not match this).
    """
    return [
        [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        for tr in table.find_elements(By.CSS_SELECTOR, "tbody tr")
    ]
36+
37+
38+
def _table_json(table: WebElement) -> TableJson | None:
    """Convert one table element into a ``TableJson``.

    Returns:
        ``None`` when the table has no header cells; otherwise a ``TableJson``
        whose title is the first column header and whose ``values`` hold one
        header-to-cell mapping per body row.
    """
    columns = _headers_from_table(table)
    if not columns:
        return None

    # strict=False: a row with a different number of cells than there are
    # headers is paired up to the shorter of the two instead of raising.
    values = [dict(zip(columns, row, strict=False)) for row in _rows_from_table(table)]
    return TableJson(title=columns[0], columns=columns, values=values)
60+
61+
62+
def sections_to_json(driver: WebDriver) -> list[SectionJson]:
    """Group the report tables on the current page under their ``<h3>`` headings.

    Walks every ``h3`` and ``table.eamwebgrid-table`` element returned by the
    driver. Each ``h3`` starts a new section named after its text; each table
    that ``_table_json`` can convert is appended to the section in progress.

    Returns:
        The collected sections. Tables seen before the first heading are
        grouped under a section whose name is ``None``.
    """
    # NOTE(review): relies on find_elements yielding headings and tables in page order — confirm.
    elements = driver.find_elements(By.CSS_SELECTOR, "h3, table.eamwebgrid-table")

    sections = []
    current = {"section": None, "tables": []}
    for element in elements:
        if element.tag_name.lower() == "h3":
            # Close the section in progress unless it is the untouched initial placeholder.
            if current["section"] is not None or current["tables"]:
                sections.append(current)
            current = {"section": element.text, "tables": []}
            continue

        table = _table_json(element)
        if table:
            current["tables"].append(table)

    # Flush the final section, which no following heading closed.
    if current["section"] is not None or current["tables"]:
        sections.append(current)

    return sections

tariff_fetch/rateacuity/state.py

Lines changed: 92 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,16 @@
3333
from collections.abc import Sequence
3434
from pathlib import Path
3535
from time import sleep
36+
from typing import TypeVar
3637

3738
import polars as pl
3839
from selenium.webdriver import Chrome
3940
from selenium.webdriver.common.by import By
4041
from selenium.webdriver.support import expected_conditions as EC
4142
from selenium.webdriver.support.ui import Select, WebDriverWait
4243

44+
from tariff_fetch.rateacuity.report_tables import SectionJson, sections_to_json
45+
4346
from .base import ScrapingContext, create_context, login
4447

4548
logger = logging.getLogger(__name__)
@@ -66,6 +69,9 @@ def _wait(self) -> WebDriverWait:
6669
return WebDriverWait(self.driver, 10)
6770

6871

72+
S = TypeVar("S", bound=State)
73+
74+
6975
class LoginState(State):
7076
def login(self, username: str, password: str) -> PortalState:
7177
"""Authenticate with RateAcuity and transition into the portal state."""
@@ -96,83 +102,108 @@ def _select_report(self, report: str):
96102
radio.click()
97103

98104

99-
class ElectricBenchmarkStateDropdown(State):
100-
def _wait_for_element(self):
101-
return self._wait().until(EC.presence_of_element_located((By.ID, "StateSelect")))
105+
class DropdownState(State):
    """Shared behavior for dropdown-driven selections on the benchmark workflow."""

    # DOM id of the <select> element this state operates on; set by subclasses.
    element_id: str

    def _dropdown(self):
        """Wait until the element with id ``element_id`` is present and return it."""
        return self._wait().until(EC.presence_of_element_located((By.ID, self.element_id)))

    def _visible_options(self) -> list[str]:
        """Return the text of every ``<option>`` in the dropdown, as reported by the driver."""
        return [option.text for option in self._dropdown().find_elements(By.TAG_NAME, "option")]

    def _select(self, choice: str, *, category: str, next_state: S) -> S:
        """Select ``choice`` in the dropdown and return ``next_state``.

        Args:
            choice: Option to select. An exact match against the option texts
                wins; otherwise the comparison ignores surrounding whitespace.
            category: Human-readable name of the dropdown ("State", "Utility",
                ...), used in the log and error messages.
            next_state: State object handed back to the caller after selecting.

        Raises:
            ValueError: If ``choice`` matches none of the available options.
        """
        raw_options = self._visible_options()
        normalized = {option.strip(): option for option in raw_options}
        stripped_choice = choice.strip()

        if choice in raw_options:
            visible_choice = choice
        elif stripped_choice in normalized:
            visible_choice = normalized[stripped_choice]
        else:
            raise ValueError(f"{category} {choice} is invalid. Available options are: {list(normalized)}")

        select = Select(self._dropdown())
        # all_selected_options returns an empty list when nothing is selected,
        # whereas first_selected_option raises in that case.
        selected = select.all_selected_options
        current = selected[0].text.strip() if selected else None
        # Skip the selection when the option is already chosen.
        if current != stripped_choice:
            logger.info("Selecting %s %s", category.lower(), stripped_choice)
            select.select_by_visible_text(visible_choice)
        return next_state
138+
139+
140+
class ElectricBenchmarkAllStateDropdown(DropdownState):
    """State selection step driven by the ``StateSelect`` dropdown."""

    element_id = "StateSelect"

    def get_states(self) -> list[str]:
        """Return the text of every option in the dropdown."""
        return self._visible_options()

    def select_state(self, state: str) -> ElectricBenchmarkAllUtilityDropdown:
        """Select ``state`` and transition to the utility dropdown state."""
        return self._select(state, category="State", next_state=ElectricBenchmarkAllUtilityDropdown(self._context))
148+
149+
150+
class ElectricBenchmarkAllUtilityDropdown(ElectricBenchmarkAllStateDropdown):
    """Utility selection step driven by the ``UtilitySelect`` dropdown.

    NOTE(review): the inherited ``get_states``/``select_state`` read
    ``self.element_id``, which this class overrides, so on this state they act
    on ``UtilitySelect`` rather than ``StateSelect`` — confirm that is intended.
    """

    element_id = "UtilitySelect"

    def get_utilities(self) -> list[str]:
        """Return the text of every option in the dropdown."""
        return self._visible_options()

    def select_utility(self, utility: str) -> ElectricBenchmarkAllScheduleDropdown:
        """Select ``utility`` and transition to the schedule dropdown state."""
        return self._select(utility, category="Utility", next_state=ElectricBenchmarkAllScheduleDropdown(self._context))
158+
159+
160+
class ElectricBenchmarkAllScheduleDropdown(ElectricBenchmarkAllUtilityDropdown):
    """Schedule listing step driven by the ``ScheduleSelect`` dropdown.

    NOTE(review): inherited state/utility methods read ``self.element_id``,
    which this class overrides to ``ScheduleSelect`` — confirm that is intended.
    """

    element_id = "ScheduleSelect"

    def get_schedules(self) -> list[str]:
        """Return the text of every option in the dropdown."""
        return self._visible_options()
165+
166+
167+
class ElectricBenchmarkStateDropdown(DropdownState):
    """State selection step of the electric benchmark workflow (``StateSelect`` dropdown)."""

    element_id = "StateSelect"

    def get_states(self) -> list[str]:
        """Return all available states visible in the State dropdown."""
        return self._visible_options()

    def select_state(self, state: str) -> ElectricBenchmarkUtilityDropdown:
        """Select the provided state and transition to the utility dropdown."""
        return self._select(state, category="State", next_state=ElectricBenchmarkUtilityDropdown(self._context))
121177

122178

123179
class ElectricBenchmarkUtilityDropdown(ElectricBenchmarkStateDropdown):
    """Utility selection step of the electric benchmark workflow (``UtilitySelect`` dropdown).

    NOTE(review): the inherited ``get_states``/``select_state`` read
    ``self.element_id``, which this class overrides, so on this state they act
    on ``UtilitySelect`` rather than ``StateSelect`` — confirm that is intended.
    """

    element_id = "UtilitySelect"

    def get_utilities(self) -> list[str]:
        """Return all available utilities for the previously chosen state."""
        return self._visible_options()

    def select_utility(self, utility: str) -> ElectricBenchmarkScheduleDropdown:
        """Select the provided utility and expose the schedule dropdown."""
        return self._select(utility, category="Utility", next_state=ElectricBenchmarkScheduleDropdown(self._context))
145189

146190

147191
class ElectricBenchmarkScheduleDropdown(ElectricBenchmarkUtilityDropdown):
    """Schedule selection step of the electric benchmark workflow (``ScheduleSelect`` dropdown).

    NOTE(review): inherited state/utility methods read ``self.element_id``,
    which this class overrides to ``ScheduleSelect`` — confirm that is intended.
    """

    element_id = "ScheduleSelect"

    def get_schedules(self) -> list[str]:
        """Return all schedules associated with the selected utility."""
        return self._visible_options()

    def select_schedule(self, schedule: str) -> ElectricBenchmarkReport:
        """Select a schedule and produce a report interface that can fetch data."""
        return self._select(schedule, category="Schedule", next_state=ElectricBenchmarkReport(self._context))
169201

170202

171-
class ElectricBenchmarkReport(State):
172-
def back_to_selections(self) -> ElectricBenchmarkScheduleDropdown:
173-
"""Return to the selections page so additional schedules can be fetched."""
203+
class ReportState(State):
204+
def _back_to_selections(self, state: S) -> S:
    """Click the "Back To Selections" link and return ``state``.

    Waits for the link to be present before clicking. ``state`` is supplied by
    the caller and handed back unchanged, so each report subclass decides which
    state the workflow returns to.
    """
    self._wait().until(EC.presence_of_element_located((By.LINK_TEXT, "Back To Selections"))).click()
    return state
176207

177208
def download_excel(self, timeout: int = 20) -> Path:
178209
"""Trigger the report download and return the path once it appears."""
@@ -188,6 +219,9 @@ def download_excel(self, timeout: int = 20) -> Path:
188219
filename = next(iter(_get_xlsx(download_path) ^ initial_state))
189220
return Path(download_path, filename)
190221

222+
def as_sections(self) -> list[SectionJson]:
    """Return the report sections that ``sections_to_json`` reads from this context's driver."""
    return sections_to_json(self._context.driver)
224+
191225
def as_dataframe(self, timeout: int = 20) -> pl.DataFrame:
192226
"""Convert a freshly downloaded Excel report into a cleaned Polars dataframe."""
193227
filepath = self.download_excel(timeout)
@@ -206,6 +240,12 @@ def as_dataframe(self, timeout: int = 20) -> pl.DataFrame:
206240
return df
207241

208242

243+
class ElectricBenchmarkReport(ReportState):
    """Report page reached after selecting a schedule in the electric benchmark workflow."""

    def back_to_selections(self) -> ElectricBenchmarkScheduleDropdown:
        """Return to the selections page so additional schedules can be fetched."""
        return self._back_to_selections(ElectricBenchmarkScheduleDropdown(self._context))
247+
248+
209249
def _get_xlsx(folder) -> set[str]:
    """Return the set of .xlsx filenames currently present in the provided folder."""
    return {name for name in os.listdir(folder) if name.endswith(".xlsx")}

0 commit comments

Comments
 (0)