
Commit 1d4350b

Merge pull request #257 from lxiam26/main
Update model version 4 - January 2025
2 parents b60b5d9 + 5b709cb

21 files changed: +1028 -88 lines changed

alembic/versions/rev005.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+"""
+empty message
+
+Revision ID: 6ab68552a4a6
+Revises: e433f34dd4bd
+Create Date: 2025-01-24 10:25:29.083842
+
+"""
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "6ab68552a4a6"
+down_revision = "e433f34dd4bd"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.alter_column("prediction", "probability", new_column_name="predicted_ecoli_cfu_100ml")
+
+
+def downgrade():
+    op.alter_column("prediction", "predicted_ecoli_cfu_100ml", new_column_name="probability")

app/admin/views/data.py

Lines changed: 61 additions & 7 deletions
@@ -17,11 +17,13 @@
 from app.data.celery import combine_data_v1_task
 from app.data.celery import combine_data_v2_task
 from app.data.celery import combine_data_v3_task
+from app.data.celery import combine_data_v4_task
 from app.data.celery import live_hobolink_data_task
 from app.data.celery import live_usgs_data_task
 from app.data.celery import predict_v1_task
 from app.data.celery import predict_v2_task
 from app.data.celery import predict_v3_task
+from app.data.celery import predict_v4_task
 from app.data.celery import update_db_task
 from app.data.database import execute_sql
 from app.data.database import get_current_time
@@ -86,7 +88,15 @@ class DownloadView(BaseView):
     are handy because they get around limitations of the Heroku free tier.
     """

-    TABLES = ["hobolink", "usgs", "processed_data", "prediction", "boathouse", "override_history"]
+    TABLES = [
+        "hobolink",
+        "usgs_w",
+        "usgs_b",
+        "processed_data",
+        "prediction",
+        "boathouse",
+        "override_history",
+    ]

     @expose("/")
     def index(self):
@@ -122,11 +132,18 @@ def source_hobolink(self):
             url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="hobolink")
         )

-    @expose("/csv/src/usgs_source")
-    def source_usgs(self):
+    @expose("/csv/src/usgs_w_source")
+    def source_usgs_w(self):
         async_result = live_usgs_data_task.delay(days_ago=90)
         return redirect(
-            url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="usgs")
+            url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="usgs_w")
+        )
+
+    @expose("/csv/src/usgs_b_source")
+    def source_usgs_b(self):
+        async_result = live_usgs_data_task.delay(days_ago=90)
+        return redirect(
+            url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="usgs_b")
         )

     @expose("/csv/src/processed_data_v1_source")
@@ -156,6 +173,15 @@ def source_combine_data_v3(self):
             url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="combined")
         )

+    @expose("/csv/src/processed_data_v4_source")
+    def source_combine_data_v4(self):
+        async_result = combine_data_v4_task.delay(
+            export_name="code_for_boston_export_90d", days_ago=90
+        )
+        return redirect(
+            url_for("admin_downloadview.csv_wait", task_id=async_result.id, data_source="combined")
+        )
+
     @expose("/csv/src/prediction_v1_source")
     def source_prediction_v1(self):
         async_result = predict_v1_task.delay(export_name="code_for_boston_export_90d", days_ago=90)
@@ -183,6 +209,15 @@ def source_prediction_v3(self):
             )
         )

+    @expose("/csv/src/prediction_v4_source")
+    def source_prediction_v4(self):
+        async_result = predict_v4_task.delay(export_name="code_for_boston_export_90d", days_ago=90)
+        return redirect(
+            url_for(
+                "admin_downloadview.csv_wait", task_id=async_result.id, data_source="prediction"
+            )
+        )
+
     @expose("/csv/wait")
     def csv_wait(self):
         task_id = request.args.get("task_id")
@@ -222,10 +257,15 @@ def sync_source_hobolink(self):
         df = live_hobolink_data_task.run("code_for_boston_export_90d")
         return send_csv_attachment_of_dataframe(df=pd.DataFrame(df), filename="hobolink_source.csv")

-    @expose("/csv/src_sync/usgs_source")
-    def sync_source_usgs(self):
+    @expose("/csv/src_sync/usgs_w_source")
+    def sync_source_usgs_w(self):
         df = live_usgs_data_task.run(days_ago=90)
-        return send_csv_attachment_of_dataframe(df=pd.DataFrame(df), filename="usgs_source.csv")
+        return send_csv_attachment_of_dataframe(df=pd.DataFrame(df), filename="usgs_w_source.csv")
+
+    @expose("/csv/src_sync/usgs_b_source")
+    def sync_source_usgs_b(self):
+        df = live_usgs_data_task.run(days_ago=90)
+        return send_csv_attachment_of_dataframe(df=pd.DataFrame(df), filename="usgs_b_source.csv")

     @expose("/csv/src_sync/processed_data_v1_source")
     def sync_source_combine_data_v1(self):
@@ -248,6 +288,13 @@ def sync_source_combine_data_v3(self):
             df=pd.DataFrame(df), filename="model_processed_data.csv"
         )

+    @expose("/csv/src_sync/processed_data_v4_source")
+    def sync_source_combine_data_v4(self):
+        df = combine_data_v4_task.run(days_ago=90, export_name="code_for_boston_export_90d")
+        return send_csv_attachment_of_dataframe(
+            df=pd.DataFrame(df), filename="model_processed_data.csv"
+        )
+
     @expose("/csv/src_sync/prediction_v1_source")
     def sync_source_prediction_v1(self):
         df = predict_v1_task.run(days_ago=90, export_name="code_for_boston_export_90d")
@@ -269,6 +316,13 @@ def sync_source_prediction_v3(self):
             df=pd.DataFrame(df), filename="prediction_source.csv"
         )

+    @expose("/csv/src_sync/prediction_v4_source")
+    def sync_source_prediction_v4(self):
+        df = predict_v4_task.run(days_ago=90, export_name="code_for_boston_export_90d")
+        return send_csv_attachment_of_dataframe(
+            df=pd.DataFrame(df), filename="prediction_source.csv"
+        )
+

 class DatabaseView(BaseView):
     """Exposes an "update database" button to the user."""

app/data/celery.py

Lines changed: 18 additions & 0 deletions
@@ -100,6 +100,14 @@ def combine_data_v3_task(*args, **kwargs) -> RecordsType:
     return df.to_dict(orient="records")


+@celery_app.task
+def combine_data_v4_task(*args, **kwargs) -> RecordsType:
+    from app.data.processing.core import combine_v4_job
+
+    df = combine_v4_job(*args, **kwargs)
+    return df.to_dict(orient="records")
+
+
 @celery_app.task
 def predict_v1_task(*args, **kwargs) -> RecordsType:
     from app.data.processing.core import predict_v1_job
@@ -124,6 +132,14 @@ def predict_v3_task(*args, **kwargs) -> RecordsType:
     return df.to_dict(orient="records")


+@celery_app.task
+def predict_v4_task(*args, **kwargs) -> RecordsType:
+    from app.data.processing.core import predict_v4_job
+
+    df = predict_v4_job(*args, **kwargs)
+    return df.to_dict(orient="records")
+
+
 @celery_app.task
 def update_db_task(tweet_status: bool = False) -> None:
     from app.data.globals import website_options
@@ -150,9 +166,11 @@ def send_database_exports_task() -> None:
 combine_data_v1_task: WithAppContextTask
 combine_data_v2_task: WithAppContextTask
 combine_data_v3_task: WithAppContextTask
+combine_data_v4_task: WithAppContextTask
 clear_cache_task: WithAppContextTask
 predict_v1_task: WithAppContextTask
 predict_v2_task: WithAppContextTask
 predict_v3_task: WithAppContextTask
+predict_v4_task: WithAppContextTask
 update_db_task: WithAppContextTask
 send_database_exports_task: WithAppContextTask
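Note: like the existing v1-v3 tasks, the v4 tasks defer their import of app.data.processing.core into the function body, so the worker registers tasks without loading the pandas-heavy model code at startup, and they return `df.to_dict(orient="records")` so results stay serializable in the Celery result backend. Round-trip sketch with made-up values:

import pandas as pd

df = pd.DataFrame({"reach_id": [2], "predicted_ecoli_cfu_100ml": [148.2]})  # illustrative
records = df.to_dict(orient="records")
# -> [{"reach_id": 2, "predicted_ecoli_cfu_100ml": 148.2}]
df_again = pd.DataFrame(records)  # what the admin views do with task output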

app/data/models/prediction.py

Lines changed: 15 additions & 9 deletions
@@ -14,19 +14,19 @@ class Prediction(db.Model):
     __tablename__ = "prediction"
     reach_id = db.Column(db.Integer, db.ForeignKey("reach.id"), primary_key=True, nullable=False)
     time = db.Column(db.DateTime, primary_key=True, nullable=False)
-    # predicted_ecoli_cfu_100ml = db.Column(db.Numeric)
-    probability = db.Column(db.Numeric)
+    predicted_ecoli_cfu_100ml = db.Column(db.Numeric)
+    # probability = db.Column(db.Numeric)
     safe = db.Column(db.Boolean)

     reach = db.relationship("Reach", back_populates="predictions")

-    # @property
-    # def predicted_ecoli_cfu_100ml_rounded(self) -> float:
-    #     return round(self.predicted_ecoli_cfu_100ml, 1)
-
     @property
-    def probability_rounded_and_formatted(self) -> str:
-        return str(round(self.probability * 100, 1)) + "%"
+    def predicted_ecoli_cfu_100ml_rounded(self) -> float:
+        return round(self.predicted_ecoli_cfu_100ml, 1)
+
+    # @property
+    # def probability_rounded_and_formatted(self) -> str:
+    #     return str(round(self.probability * 100, 1)) + "%"

     @classmethod
     def _latest_ts_scalar_subquery(cls):
@@ -44,8 +44,14 @@ def get_latest(cls, reach: int) -> "Prediction":
     def get_all_latest(cls) -> List["Prediction"]:
         return db.session.query(cls).filter(cls.time == cls._latest_ts_scalar_subquery()).all()

+    # def api_v1_to_dict(self) -> Dict[str, Any]:
+    #     return {"prediction": float(self.probability), "safe": self.safe, "time": self.time}
     def api_v1_to_dict(self) -> Dict[str, Any]:
-        return {"prediction": float(self.probability), "safe": self.safe, "time": self.time}
+        return {
+            "prediction": float(self.predicted_ecoli_cfu_100ml),
+            "safe": self.safe,
+            "time": self.time,
+        }


 def get_latest_prediction_time() -> datetime:
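Note: with this rename, the v1 API payload carries a predicted E. coli concentration (CFU/100 mL) rather than a probability, still under the unchanged "prediction" key. Illustrative shape only; the values below are made up:

pred = Prediction.get_latest(reach=2)  # reach id chosen arbitrarily
pred.api_v1_to_dict()
# -> {"prediction": 148.2, "safe": True, "time": datetime(2025, 1, 24, 10, 0)}
pred.predicted_ecoli_cfu_100ml_rounded  # -> 148.2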

app/data/processing/core.py

Lines changed: 37 additions & 13 deletions
@@ -21,7 +21,8 @@
 from app.data.processing.hobolink import HOBOLINK_ROWS_PER_HOUR
 from app.data.processing.hobolink import get_live_hobolink_data
 from app.data.processing.usgs import USGS_DEFAULT_DAYS_AGO
-from app.data.processing.usgs import USGS_ROWS_PER_HOUR
+from app.data.processing.usgs import USGS_ROWS_PER_HOUR_MUDDY_RIVER
+from app.data.processing.usgs import USGS_ROWS_PER_HOUR_WALTHAM
 from app.data.processing.usgs import get_live_usgs_data
 from app.mail import ExportEmail
 from app.mail import mail
@@ -38,7 +39,9 @@ def _write_to_db(df: pd.DataFrame, table_name: str, rows: Optional[int] = None)
 class ModelModule(Protocol):
     MODEL_YEAR: str

-    def process_data(self, df_hobolink: pd.DataFrame, df_usgs: pd.DataFrame) -> pd.DataFrame: ...
+    def process_data(
+        self, df_hobolink: pd.DataFrame, df_usgs_w: pd.DataFrame, df_usgs_b: pd.DataFrame
+    ) -> pd.DataFrame: ...

     def all_models(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: ...

@@ -47,6 +50,7 @@ class ModelVersion(str, Enum):
     v1 = "v1"
     v2 = "v2"
     v3 = "v3"
+    v4 = "v4"

     def get_module(self) -> ModelModule:
         if self == self.__class__.v1:
@@ -61,11 +65,15 @@ def get_module(self) -> ModelModule:
             from app.data.processing.predictive_models import v3

             return v3
+        elif self == self.__class__.v4:
+            from app.data.processing.predictive_models import v4
+
+            return v4
         else:
             raise ValueError(f"Unclear what happened; {self} not supported")


-DEFAULT_MODEL_VERSION = ModelVersion.v1
+DEFAULT_MODEL_VERSION = ModelVersion.v4


 @mail_on_fail
@@ -75,15 +83,19 @@ def _combine_job(
     model_version: ModelVersion = DEFAULT_MODEL_VERSION,
 ) -> pd.DataFrame:
     mod = model_version.get_module()
-    df_usgs = get_live_usgs_data(days_ago=days_ago)
+    df_usgs_w = get_live_usgs_data(days_ago=days_ago, site_no="01104500")
+    df_usgs_b = get_live_usgs_data(days_ago=days_ago, site_no="01104683")
     df_hobolink = get_live_hobolink_data(export_name=export_name)
-    df_combined = mod.process_data(df_hobolink=df_hobolink, df_usgs=df_usgs)
+    df_combined = mod.process_data(
+        df_hobolink=df_hobolink, df_usgs_w=df_usgs_w, df_usgs_b=df_usgs_b
+    )
     return df_combined


 combine_v1_job = partial(_combine_job, model_version=ModelVersion.v1)
 combine_v2_job = partial(_combine_job, model_version=ModelVersion.v2)
 combine_v3_job = partial(_combine_job, model_version=ModelVersion.v3)
+combine_v4_job = partial(_combine_job, model_version=ModelVersion.v4)


 @mail_on_fail
@@ -93,29 +105,37 @@ def _predict_job(
     model_version: ModelVersion = DEFAULT_MODEL_VERSION,
 ) -> pd.DataFrame:
     mod = model_version.get_module()
-    df_usgs = get_live_usgs_data(days_ago=days_ago)
+    df_usgs_w = get_live_usgs_data(days_ago=days_ago, site_no="01104500")
+    df_usgs_b = get_live_usgs_data(days_ago=days_ago, site_no="01104683")
     df_hobolink = get_live_hobolink_data(export_name=export_name)
-    df_combined = mod.process_data(df_hobolink=df_hobolink, df_usgs=df_usgs)
+    df_combined = mod.process_data(
+        df_hobolink=df_hobolink, df_usgs_w=df_usgs_w, df_usgs_b=df_usgs_b
+    )
     df_predictions = mod.all_models(df_combined)
     return df_predictions


 predict_v1_job = partial(_predict_job, model_version=ModelVersion.v1)
 predict_v2_job = partial(_predict_job, model_version=ModelVersion.v2)
 predict_v3_job = partial(_predict_job, model_version=ModelVersion.v3)
+predict_v4_job = partial(_predict_job, model_version=ModelVersion.v4)


 @mail_on_fail
 def update_db() -> None:
     mod = DEFAULT_MODEL_VERSION.get_module()
-    df_usgs = get_live_usgs_data()
+    df_usgs_w = get_live_usgs_data(site_no="01104500")
+    df_usgs_b = get_live_usgs_data(site_no="01104683")
     df_hobolink = get_live_hobolink_data()
-    df_combined = mod.process_data(df_hobolink=df_hobolink, df_usgs=df_usgs)
+    df_combined = mod.process_data(
+        df_hobolink=df_hobolink, df_usgs_w=df_usgs_w, df_usgs_b=df_usgs_b
+    )
     df_predictions = mod.all_models(df_combined)

     hours = current_app.config["STORAGE_HOURS"]
     try:
-        _write_to_db(df_usgs, "usgs", rows=hours * USGS_ROWS_PER_HOUR)
+        _write_to_db(df_usgs_w, "usgs_w", rows=hours * USGS_ROWS_PER_HOUR_WALTHAM)
+        _write_to_db(df_usgs_b, "usgs_b", rows=hours * USGS_ROWS_PER_HOUR_MUDDY_RIVER)
         _write_to_db(df_hobolink, "hobolink", rows=hours * HOBOLINK_ROWS_PER_HOUR)
         _write_to_db(df_combined, "processed_data")
         _write_to_db(df_predictions, Prediction.__tablename__)
@@ -129,17 +149,21 @@ def update_db() -> None:
 @mail_on_fail
 def send_database_exports() -> None:
     mod = DEFAULT_MODEL_VERSION.get_module()
-    df_usgs = get_live_usgs_data(days_ago=90)
+    df_usgs_w = get_live_usgs_data(days_ago=90, site_no="01104500")
+    df_usgs_b = get_live_usgs_data(days_ago=90, site_no="01104683")
     df_hobolink = get_live_hobolink_data(export_name="code_for_boston_export_90d")
-    df_combined = mod.process_data(df_hobolink=df_hobolink, df_usgs=df_usgs)
+    df_combined = mod.process_data(
+        df_hobolink=df_hobolink, df_usgs_w=df_usgs_w, df_usgs_b=df_usgs_b
+    )
     df_predictions = mod.all_models(df_combined)
     df_override_history = execute_sql("select * from override_history;")

     todays_date = get_current_time().strftime("%Y_%m_%d")

     msg = ExportEmail()

-    msg.attach_dataframe(df_usgs, f"{todays_date}-usgs.csv")
+    msg.attach_dataframe(df_usgs_w, f"{todays_date}-usgs_w.csv")
+    msg.attach_dataframe(df_usgs_b, f"{todays_date}-usgs_b.csv")
     msg.attach_dataframe(df_hobolink, f"{todays_date}-hobolink.csv")
     msg.attach_dataframe(df_combined, f"{todays_date}-combined.csv")
     msg.attach_dataframe(df_predictions, f"{todays_date}-prediction.csv")
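Note: the two hard-coded gauges line up with the renamed row-rate constants above: USGS site 01104500 (Waltham) populates `usgs_w`, and site 01104683 (Muddy River) populates `usgs_b`. With `DEFAULT_MODEL_VERSION` now `ModelVersion.v4`, version dispatch resolves as sketched below (arguments are examples only):

version = ModelVersion("v4")  # str-valued enum, so lookup by value works
mod = version.get_module()    # lazily imports predictive_models.v4
df = predict_v4_job(export_name="code_for_boston_export_90d", days_ago=7)  # example args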
