Skip to content

Commit 6a44521

Browse files
committed
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
2 parents 40d5cad + d4e04b2 commit 6a44521

File tree

55 files changed

+2479
-638
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+2479
-638
lines changed

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,25 @@
11
# CHANGELOG
22

3+
## 0.3.8 (2025-07)
4+
- Patching up gold volume path due to Azure/GCP naming differences (PR 249)
5+
6+
## 0.3.7 (2025-07)
7+
- Adjusted feature naming for better flexibility and robustness in feature table (PR 243)
8+
- Updated codebase to have better data leakage handling (PR 237)
9+
- Added SHAP feature metadata to FE tables to have better compatibility with FE needs and resolve endpoint bugs (PR 242)
10+
- Updated model card output location to gold volumes instead of artifacts for compatibility with API endpoint & FE (PR 245)
11+
12+
## 0.3.6 (2025-06)
13+
- Fixed bug in features table (PR 229)
14+
- Fixed bug in 12 credit features (PR 230)
15+
16+
## 0.3.5 (2025-06)
17+
- Added support scores to features (PR 222)
18+
- Limit boolean features to courses and subjects (PR 223)
19+
- Add boolean features into VIF calcs (PR 223)
20+
- Clean up features table (PR 223)
21+
- Adjusting config unit tests to import templates directly (PR 224)
22+
323
## 0.3.4 (2025-06)
424
- Update features table (PR #218)
525
- Add features table to top shap feature output table (PR #217)
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
# Databricks notebook source
2+
# MAGIC %md
3+
# MAGIC # SST Train and Evaluate Model
4+
# MAGIC
5+
# MAGIC Third step in the process of transforming raw data into actionable, data-driven insights for advisors: load a prepared modeling dataset, configure experiment tracking framework, then train and evaluate a predictive model.
6+
# MAGIC
7+
# MAGIC ### References
8+
# MAGIC
9+
# MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process)
10+
# MAGIC - [Databricks Classification with AutoML](https://docs.databricks.com/en/machine-learning/automl/classification.html)
11+
# MAGIC - [Databricks AutoML Python API reference](https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html)
12+
# MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html)
13+
14+
# COMMAND ----------
15+
16+
# MAGIC %md
17+
# MAGIC # setup
18+
19+
# COMMAND ----------
20+
21+
# MAGIC %sh python --version
22+
23+
# COMMAND ----------
24+
25+
# WARNING: AutoML/mlflow expect particular packages with version constraints
26+
# that directly conflicts with dependencies in our SST repo. As a temporary fix,
27+
# we need to manually install a certain version of pandas and scikit-learn in order
28+
# for our models to load and run properly.
29+
30+
# %pip install "student-success-tool==0.3.8"
31+
# %pip install "pandas==1.5.3"
32+
# %pip install "scikit-learn==1.3.0"
33+
# %restart_python
34+
35+
# COMMAND ----------
36+
37+
import logging

import mlflow
from databricks.connect import DatabricksSession

from student_success_tool import configs, dataio, modeling, utils

# COMMAND ----------

logging.basicConfig(level=logging.INFO, force=True)
logging.getLogger("py4j").setLevel(logging.WARNING)  # ignore databricks logger

# Best-effort Spark attach: outside a Databricks runtime there is nothing to
# connect to, so we warn and continue (a later table read will fail fast if
# `spark` ends up undefined).
try:
    spark = DatabricksSession.builder.getOrCreate()
except Exception:
    logging.warning("unable to create spark session; are you in a Databricks runtime?")

# Get job run id for automl run; defaults to "interactive" for manual notebook runs.
job_run_id = utils.databricks.get_db_widget_param("job_run_id", default="interactive")
57+
58+
# COMMAND ----------
59+
60+
# MAGIC %md
# MAGIC ## configuration

# COMMAND ----------

# Project configuration is stored as a TOML file and validated against the
# custom project-config schema on load.
cfg = dataio.read_config(
    "./config-TEMPLATE.toml", schema=configs.custom.CustomProjectConfig
)
cfg

# COMMAND ----------

# MAGIC %md
# MAGIC # read preprocessed dataset

# COMMAND ----------

df = dataio.read.from_delta_table(
    cfg.datasets.silver["preprocessed"].table_path,
    spark_session=spark,
)
df.head()

# COMMAND ----------

# delta tables not great about maintaining dtypes; this may be needed
# df = df.convert_dtypes()

# COMMAND ----------

# Sanity check: class balance of the prediction target.
target_props = df[cfg.target_col].value_counts(normalize=True)
print(f"target proportions:\n{target_props}")

# COMMAND ----------

# Sanity check: train/test/validate split balance, when a split column is configured.
if cfg.split_col:
    split_props = df[cfg.split_col].value_counts(normalize=True)
    print(f"split proportions:\n{split_props}")

# COMMAND ----------

# Sanity check: the distinct sample weights, when a weight column is configured.
if cfg.sample_weight_col:
    sample_weights = df[cfg.sample_weight_col].unique()
    print(f"sample weights: {sample_weights}")
102+
103+
# COMMAND ----------
104+
105+
# MAGIC %md
# MAGIC # feature selection

# COMMAND ----------

# databricks freaks out during feature selection if autologging isn't disabled :shrug:
mlflow.autolog(disable=True)

# COMMAND ----------

# Feature-selection params come from the project config.
# HACK: non_feature_cols is computed outside the feature-selection config,
# so it is injected into the params here.
selection_params = cfg.modeling.feature_selection.model_dump()
selection_params["non_feature_cols"] = cfg.non_feature_cols
logging.info("selection params = %s", selection_params)

# COMMAND ----------

# Select features on the training split only (when a split column exists),
# so held-out rows don't influence which columns survive.
if cfg.split_col:
    df_train = df.loc[df[cfg.split_col].eq("train"), :]
else:
    df_train = df
df_selected = modeling.feature_selection.select_features(
    df_train,
    **selection_params,
)
print(f"rows x cols = {df_selected.shape}")
df_selected.head()

# COMMAND ----------

# HACK: we want to use selected columns for *all* splits, not just train
df = df.loc[:, df_selected.columns]

# COMMAND ----------

# save modeling dataset with all splits
dataio.write.to_delta_table(
    df, cfg.datasets.silver["modeling"].table_path, spark_session=spark
)
142+
143+
# COMMAND ----------
144+
145+
# MAGIC %md
# MAGIC # train model

# COMMAND ----------

# re-enable mlflow's autologging (it was disabled for feature selection)
mlflow.autolog(disable=False)

# COMMAND ----------

# Columns withheld from training: configured exclusions plus the student group
# columns (those are reserved for bias evaluation, not prediction).
excluded_cols = sorted(
    set((cfg.modeling.training.exclude_cols or []) + (cfg.student_group_cols or []))
)
training_params = dict(
    job_run_id=job_run_id,
    institution_id=cfg.institution_id,
    student_id_col=cfg.student_id_col,
    target_col=cfg.target_col,
    split_col=cfg.split_col,
    sample_weight_col=cfg.sample_weight_col,
    pos_label=cfg.pos_label,
    primary_metric=cfg.modeling.training.primary_metric,
    timeout_minutes=cfg.modeling.training.timeout_minutes,
    exclude_frameworks=cfg.modeling.training.exclude_frameworks,
    exclude_cols=excluded_cols,
)
logging.info("training params = %s", training_params)
171+
172+
# COMMAND ----------
173+
174+
# Kick off the AutoML classification run; `summary` holds the experiment and
# its best trial once training completes.
summary = modeling.training.run_automl_classification(df, **training_params)

experiment_id = summary.experiment.experiment_id
run_id = summary.best_trial.mlflow_run_id
print(
    f"experiment_id: {experiment_id}"
    f"\nbest trial run_id: {run_id}"
    f"\n{training_params['primary_metric']} metric distribution = {summary.metric_distribution}"
)

# Publish the ids as job task values so downstream tasks in the same
# Databricks job can pick up this experiment/run.
dbutils.jobs.taskValues.set(key="experiment_id", value=experiment_id)
dbutils.jobs.taskValues.set(key="run_id", value=run_id)
186+
187+
# COMMAND ----------
188+
189+
# MAGIC %md
# MAGIC # evaluate model

# COMMAND ----------

# HACK: Evaluate an experiment you've already trained
# experiment_id = cfg.model.experiment_id

# NOTE: AutoML generates a split column if not manually specified.
# BUG FIX: "split_col" is *always* a key in training_params (possibly with
# value None), so dict.get's default was never used; fall back on falsy
# values instead so the AutoML-generated column name actually applies.
split_col = training_params["split_col"] or "_automl_split_col_0000"

# COMMAND ----------

# only possible to do bias evaluation if you specify a split col for train/test/validate
# AutoML doesn't preserve student ids in the training set, which we need for [reasons]
if evaluate_model_bias := (training_params.get("split_col") is not None):
    df_features = df.drop(columns=cfg.non_feature_cols)
else:
    df_features = modeling.evaluation.extract_training_data_from_model(experiment_id)
208+
209+
# COMMAND ----------
210+
211+
# Get top runs from experiment for evaluation
212+
# Adjust optimization metrics & topn_runs_included as needed
213+
top_runs = modeling.evaluation.get_top_runs(
214+
experiment_id,
215+
optimization_metrics=[
216+
"test_recall_score",
217+
"val_recall_score",
218+
"test_roc_auc",
219+
"val_roc_auc",
220+
"test_log_loss",
221+
"val_log_loss",
222+
],
223+
topn_runs_included=cfg.modeling.evaluation.topn_runs_included,
224+
)
225+
logging.info("top run ids = %s", top_runs)
226+
227+
# COMMAND ----------
228+
229+
# Evaluate each top run: reopen its mlflow run, score the feature set, log
# performance plots/metrics, and (when splits exist) run the bias assessment.
for run_id in top_runs.values():
    with mlflow.start_run(run_id=run_id) as run:
        logging.info(
            "Run %s: Starting performance evaluation%s",
            run_id,
            " and bias assessment" if evaluate_model_bias else "",
        )
        # Load the trained sklearn model artifact from this run.
        model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
        # Attach hard predictions and positive-class probabilities to the
        # full dataset (all splits) for downstream evaluation.
        df_pred = df.assign(
            **{
                cfg.pred_col: model.predict(df_features),
                cfg.pred_prob_col: modeling.inference.predict_probs(
                    df_features,
                    model,
                    feature_names=list(df_features.columns),
                    pos_label=cfg.pos_label,
                ),
            }
        )
        model_comp_fig = modeling.evaluation.plot_trained_models_comparison(
            experiment_id, cfg.modeling.training.primary_metric
        )
        modeling.evaluation.evaluate_performance(
            df_pred,
            target_col=cfg.target_col,
            pos_label=cfg.pos_label,
        )
        if evaluate_model_bias:
            modeling.bias_detection.evaluate_bias(
                df_pred,
                student_group_cols=cfg.student_group_cols,
                target_col=cfg.target_col,
                pos_label=cfg.pos_label,
            )
        logging.info("Run %s: Completed", run_id)
    # NOTE(review): likely redundant — exiting the `with` block above already
    # ends the active run; end_run() is then a no-op. Confirm before removing.
    mlflow.end_run()

0 commit comments

Comments
 (0)