Add formula support to BalanceDF model_matrix (#318)

neuralsorcerer · meta-codesync[bot] · commit cb99f125408b · 2026-02-05T08:31:08.000-08:00
Summary:
- Closes #304

Pull Request resolved: #318
Differential Revision: D92394155
Pulled By: talgalili
fbshipit-source-id: ec76b8f628e555c15b568f72b5cea9fca00631ea
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,9 @@
     `--weights_impact_on_outcome_method`.
 - **Pandas 3 support**
   - Updated compatibility and tests for pandas 3.x
+- **Formula support for BalanceDF model matrices**
+  - `BalanceDF.model_matrix()` now accepts a `formula` argument to build
+    custom model matrices without precomputing them manually.
 
 ## Bug Fixes
 
diff --git a/balance/balancedf_class.py b/balance/balancedf_class.py
@@ -27,6 +27,7 @@
 )
 from balance.typing import FilePathOrBuffer
 from balance.util import find_items_index_in_list, get_items_from_list_via_indices
+from balance.utils.input_validation import _verify_value_type
 from IPython.lib.display import FileLink
 from plotly.graph_objs import Figure
 
@@ -392,18 +393,19 @@ def _call_on_linked(
         #     if v is not None and k not in exclude
         # )
 
-    # TODO: add the ability to pass formula argument to model_matrix
-    #       but in which case - notice that we'd want the ability to track
-    #       which object is stored in _model_matrix (and to run it over)
-    #       Also, the output may sometimes no longer only be pd.DataFrame
-    #       so such work will require update the type hinting here.
-    def model_matrix(self: "BalanceDF") -> pd.DataFrame:
+    def model_matrix(
+        self: "BalanceDF", formula: str | list[str] | None = None
+    ) -> pd.DataFrame:
         """Return a model_matrix version of the df inside the BalanceDF object using balance_util.model_matrix
 
         This can be used to turn all character columns into a one hot encoding columns.
 
         Args:
             self (BalanceDF): Object
+            formula (str | list[str] | None, optional): Optional formula string (or list of
+                formula strings) to pass to :func:`balance_util.model_matrix`. When
+                provided, the matrix is recomputed on each call and is never cached;
+                the cached no-formula matrix is left untouched. Defaults to None.
 
         Returns:
             pd.DataFrame: The output from :func:`balance_util.model_matrix`
@@ -443,12 +445,27 @@ def model_matrix(self: "BalanceDF") -> pd.DataFrame:
                     # 1  2.0   8.0   0.0   0.0   1.0   0.0
                     # 2  3.0   2.0   0.0   0.0   0.0   1.0
                     # 3  1.0 -42.0   1.0   0.0   0.0   0.0
+
+                print(s1.covars().model_matrix(formula="a + b"))
+                    #      a     b
+                    # 0  1.0 -42.0
+                    # 1  2.0   8.0
+                    # 2  3.0   2.0
+                    # 3  1.0 -42.0
         """
-        if not hasattr(self, "_model_matrix") or self._model_matrix is None:
-            self._model_matrix = balance_util.model_matrix(
-                self.df, add_na=True, return_type="one"
-            )["model_matrix"]
-        return self._model_matrix
+        if formula is None:
+            if not hasattr(self, "_model_matrix") or self._model_matrix is None:
+                self._model_matrix = balance_util.model_matrix(
+                    self.df, add_na=True, return_type="one"
+                )["model_matrix"]
+            return self._model_matrix
+
+        return _verify_value_type(
+            balance_util.model_matrix(
+                self.df, add_na=True, return_type="one", formula=formula
+            )["model_matrix"],
+            pd.DataFrame,
+        )
 
     def _descriptive_stats(
         self: "BalanceDF",
diff --git a/tests/test_balancedf.py b/tests/test_balancedf.py
@@ -23,6 +23,8 @@
 from balance.sample_class import Sample
 from balance.stats_and_plots import weighted_comparisons_stats
 from balance.testutil import BalanceTestCase, tempfile_path
+from balance.utils.model_matrix import model_matrix
+from patsy import PatsyError  # pyre-ignore[21]
 
 
 class TestDataFactory:
@@ -1721,6 +1723,48 @@ def testBalanceDF_model_matrix(self) -> None:
             },
         )
 
+    def testBalanceDF_model_matrix_with_formula(self) -> None:
+        covars = s1.covars()
+        expected = model_matrix(
+            covars.df, add_na=True, return_type="one", formula="a + b"
+        )["model_matrix"]
+        result = covars.model_matrix(formula="a + b")
+        pd.testing.assert_frame_equal(result, expected)
+
+    def testBalanceDF_model_matrix_with_formula_list(self) -> None:
+        covars = s1.covars()
+        expected = model_matrix(
+            covars.df, add_na=True, return_type="one", formula=["a", "b"]
+        )["model_matrix"]
+        result = covars.model_matrix(formula=["a", "b"])
+        pd.testing.assert_frame_equal(result, expected)
+
+    def testBalanceDF_model_matrix_with_interaction_formula(self) -> None:
+        covars = s1.covars()
+        expected = model_matrix(
+            covars.df, add_na=True, return_type="one", formula="a * c"
+        )["model_matrix"]
+        result = covars.model_matrix(formula="a * c")
+        pd.testing.assert_frame_equal(result, expected)
+
+    def testBalanceDF_model_matrix_formula_does_not_affect_cache(self) -> None:
+        covars = s1.covars()
+        cached_before = covars.model_matrix()
+        formula_result = covars.model_matrix(formula="a")
+        cached_after = covars.model_matrix()
+        pd.testing.assert_frame_equal(
+            formula_result,
+            model_matrix(covars.df, add_na=True, return_type="one", formula="a")[
+                "model_matrix"
+            ],
+        )
+        pd.testing.assert_frame_equal(cached_after, cached_before)
+
+    def testBalanceDF_model_matrix_with_invalid_formula(self) -> None:
+        covars = s1.covars()
+        with self.assertRaises(PatsyError):
+            covars.model_matrix(formula="missing_column + a")
+
     def test_check_if_not_BalanceDF(self) -> None:
         with self.assertRaisesRegex(ValueError, "number must be balancedf_class"):
             BalanceDF._check_if_not_BalanceDF(