Skip to content

Commit 1f3f78c

Browse files
authored
Merge pull request #53 from datakind/update-inference-output-file
Update inference output file per discussion
2 parents db37b7d + 7db2d5c commit 1f3f78c

File tree

2 files changed

+56
-46
lines changed

2 files changed

+56
-46
lines changed

src/student_success_tool/modeling/inference.py

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import typing as t
22

33
import numpy as np
4+
import numpy.typing as npt
45
import pandas as pd
56
from shap import KernelExplainer
67

@@ -9,8 +10,9 @@ def select_top_features_for_display(
910
features: pd.DataFrame,
1011
unique_ids: pd.Series,
1112
predicted_probabilities: list[float],
12-
shap_values: pd.Series,
13+
shap_values: npt.NDArray[np.float64],
1314
n_features: int = 3,
15+
needs_support_threshold_prob: t.Optional[float] = 0.5,
1416
features_table: t.Optional[dict[str, dict[str, str]]] = None,
1517
) -> pd.DataFrame:
1618
"""
@@ -24,6 +26,14 @@ def select_top_features_for_display(
2426
order as unique_ids, of shape len(unique_ids)
2527
shap_values: array of arrays of SHAP values, of shape len(unique_ids)
2628
n_features: number of important features to return
29+
needs_support_threshold_prob: Minimum probability in [0.0, 1.0] used to compute
30+
a boolean "needs support" field added to output records. Values in
31+
``predicted_probabilities`` greater than or equal to this threshold result in
32+
a True value; otherwise it's False. If this threshold is set to None,
33+
then no "needs support" values are added to the output records.
34+
Note that this doesn't have to be the "optimal" decision threshold for
35+
the trained model that produced ``predicted_probabilities``, it can
36+
be tailored to a school's preferences and use case.
2737
features_table: Optional mapping of column to human-friendly feature name/desc,
2838
loaded via :func:`utils.load_features_table()`
2939
@@ -32,18 +42,26 @@ def select_top_features_for_display(
3242
3343
TODO: refactor this functionality so it's vectorized and aggregates by student
3444
"""
35-
top_features_info = []
45+
pred_probs = np.asarray(predicted_probabilities)
3646

37-
for i, (unique_id, predicted_proba) in enumerate(
38-
zip(unique_ids, predicted_probabilities)
39-
):
47+
top_features_info = []
48+
for i, (unique_id, predicted_proba) in enumerate(zip(unique_ids, pred_probs)):
4049
instance_shap_values = shap_values[i]
4150
top_indices = np.argsort(-np.abs(instance_shap_values))[:n_features]
4251
top_features = features.columns[top_indices]
4352
top_feature_values = features.iloc[i][top_features]
4453
top_shap_values = instance_shap_values[top_indices]
4554

46-
for rank, (feature, feature_value, shap_value) in enumerate(
55+
student_output = {
56+
"Student ID": unique_id,
57+
"Support Score": predicted_proba,
58+
}
59+
if needs_support_threshold_prob is not None:
60+
student_output["Support Needed"] = (
61+
predicted_proba >= needs_support_threshold_prob
62+
)
63+
64+
for feature_rank, (feature, feature_value, shap_value) in enumerate(
4765
zip(top_features, top_feature_values, top_shap_values), start=1
4866
):
4967
feature_name = (
@@ -54,16 +72,18 @@ def select_top_features_for_display(
5472
if features_table is not None
5573
else feature
5674
)
57-
top_features_info.append(
58-
{
59-
"Student ID": unique_id,
60-
"Support Score": predicted_proba,
61-
"Top Indicators": feature_name,
62-
"Indicator Value": feature_value,
63-
"SHAP Value": shap_value,
64-
"Rank": rank,
65-
}
75+
feature_value = (
76+
str(round(feature_value, 2))
77+
if isinstance(feature_value, float)
78+
else str(feature_value)
6679
)
80+
student_output |= {
81+
f"Feature_{feature_rank}_Name": feature_name,
82+
f"Feature_{feature_rank}_Value": feature_value,
83+
f"Feature_{feature_rank}_Importance": round(shap_value, 2),
84+
}
85+
86+
top_features_info.append(student_output)
6787
return pd.DataFrame(top_features_info)
6888

6989

tests/modeling/test_inference.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def explainer():
2828
"predicted_probabilities",
2929
"shap_values",
3030
"n_features",
31+
"needs_support_threshold_prob",
3132
"features_table",
3233
"exp",
3334
],
@@ -37,7 +38,7 @@ def explainer():
3738
{
3839
"x1": ["val1", "val2", "val3"],
3940
"x2": [True, False, True],
40-
"x3": [2.0, 1.0, 0.5],
41+
"x3": [2.0, 1.0001, 0.5],
4142
"x4": [1, 2, 3],
4243
}
4344
),
@@ -47,39 +48,26 @@ def explainer():
4748
[[1.0, 0.9, 0.8, 0.7], [0.0, -1.0, 0.9, -0.8], [0.25, 0.0, -0.5, 0.75]]
4849
),
4950
3,
51+
0.5,
5052
{
5153
"x1": {"name": "feature #1"},
5254
"x2": {"name": "feature #2"},
5355
"x3": {"name": "feature #3"},
5456
},
5557
pd.DataFrame(
5658
{
57-
"Student ID": [1, 1, 1, 2, 2, 2, 3, 3, 3],
58-
"Support Score": [0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5],
59-
"Top Indicators": [
60-
"feature #1",
61-
"feature #2",
62-
"feature #3",
63-
"feature #2",
64-
"feature #3",
65-
"x4",
66-
"x4",
67-
"feature #3",
68-
"feature #1",
69-
],
70-
"Indicator Value": [
71-
"val1",
72-
True,
73-
2.0,
74-
False,
75-
1.0,
76-
2,
77-
3,
78-
0.5,
79-
"val3",
80-
],
81-
"SHAP Value": [1.0, 0.9, 0.8, -1.0, 0.9, -0.8, 0.75, -0.5, 0.25],
82-
"Rank": [1, 2, 3, 1, 2, 3, 1, 2, 3],
59+
"Student ID": [1, 2, 3],
60+
"Support Score": [0.9, 0.1, 0.5],
61+
"Support Needed": [True, False, True],
62+
"Feature_1_Name": ["feature #1", "feature #2", "x4"],
63+
"Feature_1_Value": ["val1", "False", "3"],
64+
"Feature_1_Importance": [1.0, -1.0, 0.75],
65+
"Feature_2_Name": ["feature #2", "feature #3", "feature #3"],
66+
"Feature_2_Value": ["True", "1.0", "0.5"],
67+
"Feature_2_Importance": [0.9, 0.9, -0.5],
68+
"Feature_3_Name": ["feature #3", "x4", "feature #1"],
69+
"Feature_3_Value": ["2.0", "2", "val3"],
70+
"Feature_3_Importance": [0.8, -0.8, 0.25],
8371
}
8472
),
8573
),
@@ -99,14 +87,14 @@ def explainer():
9987
),
10088
1,
10189
None,
90+
None,
10291
pd.DataFrame(
10392
{
10493
"Student ID": [1, 2, 3],
10594
"Support Score": [0.9, 0.1, 0.5],
106-
"Top Indicators": ["x1", "x2", "x4"],
107-
"Indicator Value": ["val1", False, 3],
108-
"SHAP Value": [1.0, -1.0, 0.75],
109-
"Rank": [1, 1, 1],
95+
"Feature_1_Name": ["x1", "x2", "x4"],
96+
"Feature_1_Value": ["val1", "False", "3"],
97+
"Feature_1_Importance": [1.0, -1.0, 0.75],
11098
}
11199
),
112100
),
@@ -118,6 +106,7 @@ def test_select_top_features_for_display(
118106
predicted_probabilities,
119107
shap_values,
120108
n_features,
109+
needs_support_threshold_prob,
121110
features_table,
122111
exp,
123112
):
@@ -127,6 +116,7 @@ def test_select_top_features_for_display(
127116
predicted_probabilities,
128117
shap_values,
129118
n_features=n_features,
119+
needs_support_threshold_prob=needs_support_threshold_prob,
130120
features_table=features_table,
131121
)
132122
assert isinstance(obs, pd.DataFrame) and not obs.empty

0 commit comments

Comments
 (0)