Commit 6cb8a56

neuralsorcerer authored and meta-codesync[bot] committed
Raw-covariate adjustment for custom models (#323)

Summary:
Pull Request resolved: #323

`Sample.adjust()` now supports fitting models on raw covariates (without a
model matrix) for IPW via `use_model_matrix=False`. Categorical columns are
encoded as integer codes (ordinal encoding). NaN values in categorical columns
are assigned a distinct code (one higher than the maximum) rather than being
mapped to `-1`.

Note that ordinal encoding treats categories as ordered numeric values; for
true unordered categorical support, use sklearn 1.4+ with
`HistGradientBoostingClassifier` and `categorical_features="from_dtype"`.

Pull Request resolved: #321

Reviewed By: omriharosh

Differential Revision: D92627587

Pulled By: talgalili

fbshipit-source-id: 94d3b7d9e33bdd0b384dd18019e1ef653945d17f
1 parent d54dcc0 commit 6cb8a56
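The NaN handling described in the summary can be sketched as follows. This is a hypothetical illustration of the encoding scheme, not the function the commit adds; `ordinal_encode` is an invented name:

```python
import pandas as pd

def ordinal_encode(s: pd.Series) -> pd.Series:
    """Sketch of the encoding described above: categories become integer
    codes, and NaN gets a distinct code one higher than the maximum code
    instead of pandas' default -1."""
    codes = pd.Series(pd.Categorical(s).codes, index=s.index)
    # pandas marks missing values with -1; remap them to max code + 1
    return codes.mask(codes == -1, codes.max() + 1)

s = pd.Series(["a", "b", None, "a"])
print(ordinal_encode(s).tolist())  # [0, 1, 2, 0] -- NaN mapped to 2
```

Keeping NaN as its own (largest) code lets tree-based models split missing values into their own branch rather than conflating them with the lowest category.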

File tree

8 files changed: +557 −98 lines changed

.github/workflows/deploy-website.yml

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: "3.12"
       - name: Install Pkg + Dependencies
         run: |
           python -m pip install .[dev]

CHANGELOG.md

Lines changed: 17 additions & 8 deletions

@@ -2,23 +2,32 @@
 
 ## New Features
 
-- **Validate weights include positive values**
-  - Added a guard in weight diagnostics to error when all weights are zero.
-- **Support configurable ID column candidates**
-  - `Sample.from_frame()` and `guess_id_column()` now accept candidate ID column names
-    when auto-detecting the ID column.
 - **Outcome weight impact diagnostics**
   - Added paired outcome-weight impact tests (`y*w0` vs `y*w1`) with confidence intervals.
   - Exposed in `BalanceDFOutcomes`, `Sample.diagnostics()`, and the CLI via
     `--weights_impact_on_outcome_method`.
 - **Pandas 3 support**
   - Updated compatibility and tests for pandas 3.x
-- **Formula support for BalanceDF model matrices**
-  - `BalanceDF.model_matrix()` now accepts a `formula` argument to build
-    custom model matrices without precomputing them manually.
 - **Categorical distribution metrics without one-hot encoding**
   - KLD/EMD/CVMD/KS on `BalanceDF.covars()` now operate on raw categorical variables
     (with NA indicators) instead of one-hot encoded columns.
+- **Misc**
+- **Raw-covariate adjustment for custom models**
+  - `Sample.adjust()` now supports fitting models on raw covariates (without a model matrix)
+    for IPW via `use_model_matrix=False`. String, object, and boolean columns are converted
+    to pandas `Categorical` dtype, allowing sklearn estimators with native categorical
+    support (e.g., `HistGradientBoostingClassifier` with `categorical_features="from_dtype"`)
+    to handle them correctly. Requires scikit-learn >= 1.4 when categorical columns are
+    present.
+- **Validate weights include positive values**
+  - Added a guard in weight diagnostics to error when all weights are zero.
+- **Support configurable ID column candidates**
+  - `Sample.from_frame()` and `guess_id_column()` now accept candidate ID column names
+    when auto-detecting the ID column.
+- **Formula support for BalanceDF model matrices**
+  - `BalanceDF.model_matrix()` now accepts a `formula` argument to build
+    custom model matrices without precomputing them manually.
+
 
 ## Bug Fixes
 
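The changelog entry says string, object, and boolean columns are converted to pandas `Categorical` dtype before model fitting. A minimal sketch of such a conversion; `coerce_to_categorical` is a hypothetical helper, not balance's actual implementation:

```python
import pandas as pd

def coerce_to_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """Sketch of the conversion described above: string, object, and
    boolean columns become pandas Categorical so estimators with native
    categorical support can detect them from the dtype."""
    out = df.copy()
    for col in out.columns:
        if (
            out[col].dtype == object
            or pd.api.types.is_bool_dtype(out[col])
            or pd.api.types.is_string_dtype(out[col])
        ):
            out[col] = out[col].astype("category")
    return out

df = pd.DataFrame(
    {"g": ["a", "b", "a"], "flag": [True, False, True], "x": [1.0, 2.0, 3.0]}
)
print(coerce_to_categorical(df).dtypes.astype(str).tolist())
# ['category', 'category', 'float64']
```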

balance/sample_class.py

Lines changed: 27 additions & 21 deletions

@@ -510,10 +510,9 @@ def from_frame(
         ]
         # TODO:(after 2026) that if pandas >=3, this doesn't cause issues for users importing data from SQL
         # In pandas < 3, convert string dtype to object for compatibility
-        _pd_version = tuple(
-            int(x) for x in importlib_version("pandas").split(".")[:2]
-        )
-        if _pd_version < (3, 0):
+        from packaging.version import Version
+
+        if Version(importlib_version("pandas")) < Version("3.0"):
             input_type.append("string")
             output_type.append("object")
         for i_input, i_output in zip(input_type, output_type):
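The motivation for switching to `packaging.version.Version`: naive integer-tuple parsing mishandles pre-release version strings, while `Version` compares them per PEP 440. A small illustration (assumes the `packaging` package is installed, which pip itself depends on):

```python
from packaging.version import Version

# A pre-release of pandas 3.0 should still take the "< 3.0" compatibility
# branch; Version gets this right because rc1 sorts before the release.
print(Version("3.0.0rc1") < Version("3.0"))  # True

# The tuple approach parses "3.0.0rc1" to (3, 0), and (3, 0) < (3, 0)
# is False, so a 3.0 release candidate would wrongly skip the branch.
print(tuple(int(x) for x in "3.0.0rc1".split(".")[:2]) < (3, 0))  # False
```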
@@ -940,31 +939,36 @@ def adjust(
         .. code-block:: python
 
             import balance
-            from sklearn.ensemble import RandomForestClassifier
+            from sklearn.ensemble import HistGradientBoostingClassifier
             from balance import Sample
             from balance import load_data
 
             # Load simulated data
             target_df, sample_df = load_data()
 
             sample = Sample.from_frame(sample_df, outcome_columns=["happiness"])
-            # Often times we don'y have the outcome for the target. In this case we've added it just to validate later that the weights indeed help us reduce the bias
+            # Oftentimes we don't have the outcome for the target. In this case we've added it just to validate later that the weights indeed help us reduce the bias
             target = Sample.from_frame(target_df, outcome_columns=["happiness"])
 
             sample_with_target = sample.set_target(target)
             adjusted = sample_with_target.adjust()
 
-            rf = RandomForestClassifier(n_estimators=200, random_state=0)
-            adjusted_rf = sample_with_target.adjust(model = rf)
+            hgb = HistGradientBoostingClassifier(
+                random_state=0, categorical_features="from_dtype"
+            )
+            adjusted_hgb = sample_with_target.adjust(
+                model=hgb,
+                use_model_matrix=False,
+            )
 
-            # Print ASMD tables for both adjusted and adjusted_rf
+            # Print ASMD tables for both adjusted and adjusted_hgb
             print("\\n=== Adjusted ASMD ===")
             print(adjusted.covars().asmd().T)
 
-            print("\\n=== Adjusted_RF ASMD ===")
-            print(adjusted_rf.covars().asmd().T)
+            print("\\n=== Adjusted_HGB ASMD ===")
+            print(adjusted_hgb.covars().asmd().T)
 
-            # output
+            # output (values will vary by model and random seed)
             #
             # === Adjusted ASMD ===
             # source                  self  unadjusted  unadjusted - self
@@ -977,16 +981,16 @@ def adjust(
             # income              0.205469    0.494217           0.288748
             # mean(asmd)          0.119597    0.326799           0.207202
             #
-            # === Adjusted_RF ASMD ===
+            # === Adjusted_HGB ASMD ===
             # source                  self  unadjusted  unadjusted - self
-            # age_group[T.25-34]  0.074491    0.005688          -0.068804
-            # age_group[T.35-44]  0.022383    0.312711           0.290328
-            # age_group[T.45+]    0.145628    0.378828           0.233201
-            # gender[Female]      0.037700    0.375699           0.337999
-            # gender[Male]        0.067392    0.379314           0.311922
-            # gender[_NA]         0.051718    0.006296          -0.045422
-            # income              0.140655    0.494217           0.353562
-            # mean(asmd)          0.091253    0.326799           0.235546
+            # age_group[T.25-34]       ...    0.005688                ...
+            # age_group[T.35-44]       ...    0.312711                ...
+            # age_group[T.45+]         ...    0.378828                ...
+            # gender[Female]           ...    0.375699                ...
+            # gender[Male]             ...    0.379314                ...
+            # gender[_NA]              ...    0.006296                ...
+            # income                   ...    0.494217                ...
+            # mean(asmd)               ...    0.326799                ...
             """
         if target is None:
             self._no_target_error()

balance/utils/data_transformation.py

Lines changed: 23 additions & 0 deletions

@@ -91,6 +91,29 @@ def add_na_indicator_to_combined(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: The DataFrame with NA indicator columns added for every
             base column that contains missing values.
+
+    Examples:
+        Basic usage on a DataFrame without pre-existing indicators:
+
+        >>> import pandas as pd
+        >>> from balance.utils.data_transformation import add_na_indicator_to_combined
+        >>> df = pd.DataFrame({"x": [1.0, None, 3.0], "y": [0, 1, 2]})
+        >>> result = add_na_indicator_to_combined(df)
+        >>> result.columns.tolist()
+        ['x', 'y', '_is_na_x']
+
+        When the input already contains ``_is_na_*`` columns, they are preserved
+        and not duplicated:
+
+        >>> df2 = pd.DataFrame(
+        ...     {
+        ...         "x": [1.0, None, 3.0],
+        ...         "_is_na_y": [0, 1, 0],
+        ...     }
+        ... )
+        >>> result2 = add_na_indicator_to_combined(df2)
+        >>> result2.columns.tolist()
+        ['x', '_is_na_x', '_is_na_y']
     """
     existing_indicator_cols = [
         col for col in df.columns if isinstance(col, str) and col.startswith("_is_na_")
     ]
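The behavior documented in the doctests above can be sketched independently. `add_na_indicators` is a hypothetical stand-in, not the library function, and may differ from it in column ordering:

```python
import pandas as pd

def add_na_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Sketch of the documented behavior: add an _is_na_<col> indicator
    for each base column containing missing values, skipping columns
    that already have an indicator."""
    out = df.copy()
    existing = {
        c for c in out.columns if isinstance(c, str) and c.startswith("_is_na_")
    }
    for col in df.columns:
        if isinstance(col, str) and col.startswith("_is_na_"):
            continue  # never build an indicator for an indicator
        name = f"_is_na_{col}"
        if name not in existing and df[col].isna().any():
            out[name] = df[col].isna().astype(int)
    return out

df = pd.DataFrame({"x": [1.0, None, 3.0], "y": [0, 1, 2]})
print(add_na_indicators(df).columns.tolist())  # ['x', 'y', '_is_na_x']
```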

0 commit comments