Merged
Changes from all commits
30 commits
7a1572f
Added method to provide .transform in the scorecard.
jnsofini Feb 19, 2025
3ac8599
Added updates as new feature in release notes
jnsofini Feb 25, 2025
ae6d56d
Merge pull request #346 from jnsofini/feature/add-score-transform
guillermo-navas-palencia Mar 5, 2025
9bdb4bf
use weighted min and max bin size, and correct decision tree hyperpar…
YC-1412 May 28, 2025
c27af4f
fix ssum for std calculation
YC-1412 May 28, 2025
92b1329
return expected 8 number of items instead of 3
YC-1412 May 28, 2025
f363693
add dumpy _n_samples_weighted (same as _n_samples) for multiclass opt…
YC-1412 May 30, 2025
0374492
increase test piecewise tolerance to 1e-4. See details in https://git…
YC-1412 Jun 2, 2025
9d34b08
increase test piecewise tolerance to 1e-4. See details in https://git…
YC-1412 Jun 3, 2025
da450a1
Merge pull request #358 from YC-1412/update-compute-prebin-returns
guillermo-navas-palencia Jun 3, 2025
e6bf706
update with develop
YC-1412 Jun 3, 2025
44351e5
merge with develop (Merge branch 'develop' into 323-fix_bin_size_with_s…)
YC-1412 Jun 3, 2025
3eb7a34
Merge pull request #360 from YC-1412/324-fix_std
guillermo-navas-palencia Jun 3, 2025
ad30101
add notes that min/max_bin_size include missng and special group
YC-1412 Jun 4, 2025
10dd47a
Merge pull request #359 from YC-1412/323-fix_bin_size_with_sample_weight
guillermo-navas-palencia Jun 4, 2025
188d95b
feat: add to_dict method for binning table serialization
sudo-hannes Aug 28, 2025
d38c8a7
Merge branch 'develop' into add-to-dict-method
sudo-hannes Aug 29, 2025
9b989e8
refactor: remove unnecessary assignment of binning_table in OptimalBi…
sudo-hannes Aug 29, 2025
a2bab32
fix: remove unnecessary blank line in OptimalBinning class
sudo-hannes Aug 29, 2025
0f3ad8d
Replace Boston dataset URL with local CSV file path for testing
sudo-hannes Aug 29, 2025
856f2c3
Fix data source filename in load_boston function from boston_dataset.…
sudo-hannes Aug 29, 2025
5b2be3d
Merge pull request #372 from sudo-hannes/add-boston-dataset-to-repo
guillermo-navas-palencia Aug 30, 2025
b44e4cd
Merge branch 'develop' into add-to-dict-method
sudo-hannes Aug 31, 2025
a313fea
fix: Update check_array calls to use ensure_all_finite instead of for…
sudo-hannes Aug 31, 2025
30ad1f8
Merge pull request #373 from sudo-hannes/update-force-all-finite-to-e…
guillermo-navas-palencia Sep 1, 2025
058c039
Merge branch 'develop' into add-to-dict-method
sudo-hannes Sep 1, 2025
667abdd
fix: Correct indentation in docstring for OptimalBinning class
sudo-hannes Sep 1, 2025
b00530b
Merge pull request #371 from sudo-hannes/add-to-dict-method
guillermo-navas-palencia Sep 1, 2025
8a8d196
Update release notes and version to 0.21
guillermo-navas-palencia Oct 26, 2025
520fcc6
Update 2024 -> 2025
guillermo-navas-palencia Oct 26, 2025
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
@@ -81,7 +81,7 @@ OptBinning requires
* ortools (>=9.4)
* pandas
* ropwr (>=1.0.0)
* scikit-learn (>=1.0.2)
* scikit-learn (>=1.6.0)
* scipy (>=1.6.0)

OptBinning[distributed] requires additional packages
6 changes: 3 additions & 3 deletions doc/source/conf.py
@@ -18,13 +18,13 @@
# -- Project information -----------------------------------------------------

project = 'optbinning'
copyright = '2019 - 2024, Guillermo Navas-Palencia'
copyright = '2019 - 2025, Guillermo Navas-Palencia'
author = 'Guillermo Navas-Palencia'

# The short X.Y version
version = '0.20.0'
version = '0.21.0'
# The full version, including alpha/beta/rc tags
release = '0.20.0'
release = '0.21.0'


# -- General configuration ---------------------------------------------------
14 changes: 14 additions & 0 deletions doc/source/release_notes.rst
@@ -1,5 +1,19 @@
Release Notes
=============
Version 0.21.0 (2025-10-26)
---------------------------

New features:

- Add ``transform`` method in scorecard (`Issue 346 <https://github.com/guillermo-navas-palencia/optbinning/issues/346>`_).
- Add ``to_dict`` method for binning table serialization (`Issue 371 <https://github.com/guillermo-navas-palencia/optbinning/issues/371>`_).
- Replace Boston dataset with a local CSV file (`Issue 372 <https://github.com/guillermo-navas-palencia/optbinning/issues/372>`_).

Bugfixes:

- Use weighted min and max bin size, and correct decision tree hyperparameters when ``sample_weight`` is provided (`Issue 359 <https://github.com/guillermo-navas-palencia/optbinning/issues/359>`_).
- Fix ``ssums`` for std calculation (`Issue 360 <https://github.com/guillermo-navas-palencia/optbinning/issues/360>`_).


Version 0.20.1 (2025-02-23)
---------------------------
2 changes: 1 addition & 1 deletion optbinning/_version.py
@@ -1,3 +1,3 @@
"""Version information."""

__version__ = "0.20.1"
__version__ = "0.21.0"
58 changes: 40 additions & 18 deletions optbinning/binning/binning.py
@@ -290,7 +290,8 @@ class OptimalBinning(BaseOptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of mininum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
@@ -301,11 +302,13 @@
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

min_bin_n_nonevent : int or None, optional (default=None)
@@ -516,6 +519,7 @@ def __init__(self, name="", dtype="numerical", prebinning_method="cart",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._solution = None
self._splits_optimal = None
@@ -711,10 +715,15 @@ def _fit(self, x, y, sample_weight, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = sum(sample_weight) if sample_weight is not None else len(x)

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
if self._n_samples == self._n_samples_weighted:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
else:
logger.info("Pre-processing: number of samples: {}. Weighted samples: {}"
.format(self._n_samples, self._n_samples_weighted))

time_preprocessing = time.perf_counter()

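The hunk above introduces `_n_samples_weighted`, the sum of the sample weights, falling back to the raw row count when no weights are given; it is this weighted total that the bin-size fractions use from here on. A minimal standalone sketch of that bookkeeping (the helper name is ours, not OptBinning's API):

```python
import numpy as np

def effective_n_samples(x, sample_weight=None):
    """Return (raw count, weighted count) the way the pre-processing step does."""
    n_samples = len(x)
    # With no weights the weighted count degenerates to the raw count,
    # so existing unweighted behaviour is unchanged.
    if sample_weight is not None:
        n_samples_weighted = float(np.sum(sample_weight))
    else:
        n_samples_weighted = n_samples
    return n_samples, n_samples_weighted

# Unweighted: both counts agree, so only one number needs logging.
assert effective_n_samples([1, 2, 3]) == (3, 3)
# Weighted: the weighted count is what now drives min/max bin sizes.
assert effective_n_samples([1, 2, 3], np.array([1.0, 2.0, 0.5])) == (3, 3.5)
```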
@@ -784,7 +793,7 @@ def _fit(self, x, y, sample_weight, check_input):
if self.dtype == "numerical":
user_splits = check_array(
self.user_splits, ensure_2d=False, dtype=None,
force_all_finite=True)
ensure_all_finite=True)

if len(set(user_splits)) != len(user_splits):
raise ValueError("User splits are not unique.")
@@ -880,7 +889,7 @@ def _fit_prebinning(self, x, y, y_missing, x_special, y_special, y_others,
class_weight=None, sw_clean=None, sw_missing=None,
sw_special=None, sw_others=None):

min_bin_size = int(np.ceil(self.min_prebin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_prebin_size * self._n_samples_weighted))

prebinning = PreBinning(method=self.prebinning_method,
n_bins=self.max_n_prebins,
@@ -916,12 +925,12 @@ def _fit_optimizer(self, splits, n_nonevent, n_event):

# Min/max number of bins
if self.min_bin_size is not None:
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples_weighted))
else:
min_bin_size = self.min_bin_size

if self.max_bin_size is not None:
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples))
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples_weighted))
else:
max_bin_size = self.max_bin_size

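These hunks convert the fractional `min_bin_size`/`max_bin_size` into absolute record counts by ceiling against the weighted total instead of the raw row count. A small worked example (the fraction and weighted total below are made-up values, not defaults taken from the diff):

```python
import numpy as np

min_bin_size_frac = 0.05       # hypothetical fractional min_bin_size
n_samples_weighted = 123.4     # hypothetical sum of sample weights

# Ceiling guarantees the constraint is never loosened by rounding down.
min_bin_size = int(np.ceil(min_bin_size_frac * n_samples_weighted))
print(min_bin_size)  # ceil(6.17) -> 7
```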
@@ -1177,18 +1186,15 @@ def status(self):

return self._status

def to_json(self, path):
def to_dict(self):
"""
Save optimal bins and/or splits points and transformation depending on
the target type.
Convert optimal bins and/or splits points and transformation depending on
the target type to dictionary.

Parameters
----------
path: The path where the json is going to be saved.
Returns
-------
opt_bin_dict : dict
"""
if path is None:
raise ValueError('Specify the path for the json file')

table = self.binning_table

opt_bin_dict = dict()
@@ -1210,6 +1216,22 @@ def to_json(self, path):
opt_bin_dict['cat_others'] = table.cat_others
opt_bin_dict['user_splits'] = table.user_splits

return opt_bin_dict

def to_json(self, path):
"""
Save optimal bins and/or splits points and transformation depending on
the target type.

Parameters
----------
path: The path where the json is going to be saved.
"""
if path is None:
raise ValueError('Specify the path for the json file')

opt_bin_dict = self.to_dict()

with open(path, "w") as write_file:
json.dump(opt_bin_dict, write_file)

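The refactor above extracts the dictionary-building into the new `to_dict`, and `to_json` now just serializes its result. A stripped-down sketch of this delegation pattern (class name and dictionary fields are placeholders, not the real binning-table contents):

```python
import json
import os
import tempfile

class BinningExport:
    """Illustrates the to_dict/to_json split; fields are placeholders."""

    def to_dict(self):
        # In OptimalBinning this is built from the fitted binning table.
        return {"splits": [1.0, 2.5], "dtype": "numerical"}

    def to_json(self, path):
        if path is None:
            raise ValueError("Specify the path for the json file")
        # to_json is now a thin wrapper over to_dict.
        with open(path, "w") as write_file:
            json.dump(self.to_dict(), write_file)

path = os.path.join(tempfile.gettempdir(), "bins.json")
BinningExport().to_json(path)
with open(path) as f:
    assert json.load(f) == {"splits": [1.0, 2.5], "dtype": "numerical"}
```

The benefit is that callers who want an in-memory representation (e.g. to store in a database) no longer have to round-trip through a file.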
13 changes: 8 additions & 5 deletions optbinning/binning/binning_process.py
@@ -448,7 +448,8 @@ class BinningProcess(Base, BaseEstimator, BaseBinningProcess):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of mininum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
@@ -459,11 +460,13 @@ class BinningProcess(Base, BaseEstimator, BaseBinningProcess):
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

max_pvalue : float or None, optional (default=None)
@@ -1082,10 +1085,10 @@ def _fit(self, X, y, sample_weight, check_input):
# check X and y data
if check_input:
X = check_array(X, ensure_2d=False, dtype=None,
force_all_finite='allow-nan')
ensure_all_finite='allow-nan')

y = check_array(y, ensure_2d=False, dtype=None,
force_all_finite=True)
ensure_all_finite=True)

check_consistent_length(X, y)

30 changes: 20 additions & 10 deletions optbinning/binning/continuous_binning.py
@@ -208,7 +208,8 @@ class ContinuousOptimalBinning(OptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of mininum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
@@ -219,11 +220,13 @@
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

monotonic_trend : str or None, optional (default="auto")
@@ -400,6 +403,7 @@ def __init__(self, name="", dtype="numerical", prebinning_method="cart",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._splits_optimal = None
self._status = None
@@ -559,10 +563,15 @@ def _fit(self, x, y, sample_weight, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = sum(sample_weight) if sample_weight is not None else len(x)

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
if self._n_samples == self._n_samples_weighted:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
else:
logger.info("Pre-processing: number of samples: {}. Weighted samples: {}"
.format(self._n_samples, self._n_samples_weighted))

time_preprocessing = time.perf_counter()

@@ -633,7 +642,7 @@ def _fit(self, x, y, sample_weight, check_input):
if self.dtype == "numerical":
user_splits = check_array(
self.user_splits, ensure_2d=False, dtype=None,
force_all_finite=True)
ensure_all_finite=True)

if len(set(user_splits)) != len(user_splits):
raise ValueError("User splits are not unique.")
@@ -757,12 +766,12 @@ def _fit_optimizer(self, splits, n_records, sums, ssums, stds):
return

if self.min_bin_size is not None:
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples_weighted))
else:
min_bin_size = self.min_bin_size

if self.max_bin_size is not None:
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples))
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples_weighted))
else:
max_bin_size = self.max_bin_size

@@ -897,7 +906,8 @@ def _prebinning_refinement(self, splits_prebinning, x, y, y_missing,
def _compute_prebins(self, splits_prebinning, x, y, sw):
n_splits = len(splits_prebinning)
if not n_splits:
return splits_prebinning, np.array([]), np.array([])
return (splits_prebinning, np.array([]), np.array([]), np.array([]),
np.array([]), np.array([]), np.array([]), np.array([]))

if self.dtype == "categorical" and self.user_splits is not None:
indices = np.digitize(x, splits_prebinning, right=True)
@@ -920,7 +930,7 @@ def _compute_prebins(self, splits_prebinning, x, y, sw):
n_records[i] = np.sum(sw[mask])
ymask = sw[mask] * y[mask]
sums[i] = np.sum(ymask)
ssums[i] = np.sum(ymask ** 2)
ssums[i] = np.sum(sw[mask] * (y[mask] ** 2))
n_zeros[i] = np.count_nonzero(ymask == 0)
if len(ymask):
stds[i] = np.std(ymask)
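The `ssums` change above is a genuine bug fix: in a weighted sum of squares the weight must multiply `y**2` once, whereas squaring `sw * y` squares the weight as well. A quick numeric check of the two formulas:

```python
import numpy as np

sw = np.array([2.0, 1.0])   # sample weights
y = np.array([3.0, 4.0])    # target values

buggy = np.sum((sw * y) ** 2)   # sum((w*y)^2) = 36 + 16 = 52
fixed = np.sum(sw * (y ** 2))   # sum(w*y^2)  = 18 + 16 = 34

assert buggy == 52.0
assert fixed == 34.0
```

With unit weights both expressions coincide, which is why the bug only surfaced once `sample_weight` support was exercised.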
4 changes: 2 additions & 2 deletions optbinning/binning/mdlp.py
@@ -99,8 +99,8 @@ def fit(self, x, y):
def _fit(self, x, y):
_check_parameters(**self.get_params())

x = check_array(x, ensure_2d=False, force_all_finite=True)
y = check_array(y, ensure_2d=False, force_all_finite=True)
x = check_array(x, ensure_2d=False, ensure_all_finite=True)
y = check_array(y, ensure_2d=False, ensure_all_finite=True)

idx = np.argsort(x)
x = x[idx]
4 changes: 2 additions & 2 deletions optbinning/binning/metrics.py
@@ -14,8 +14,8 @@


def _check_x_y(x, y):
x = check_array(x, ensure_2d=False, force_all_finite=True)
y = check_array(y, ensure_2d=False, force_all_finite=True)
x = check_array(x, ensure_2d=False, ensure_all_finite=True)
y = check_array(y, ensure_2d=False, ensure_all_finite=True)

check_consistent_length(x, y)

13 changes: 9 additions & 4 deletions optbinning/binning/multiclass_binning.py
@@ -214,7 +214,8 @@ class MulticlassOptimalBinning(OptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of mininum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
@@ -225,11 +226,13 @@
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

monotonic_trend : str, array-like or None, optional (default="auto")
@@ -360,6 +363,7 @@ def __init__(self, name="", prebinning_method="cart", solver="cp",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._splits_optimal = None
self._status = None
@@ -504,6 +508,7 @@ def _fit(self, x, y, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = self._n_samples

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
@@ -560,7 +565,7 @@ def _fit(self, x, y, check_input):
.format(n_splits))

user_splits = check_array(self.user_splits, ensure_2d=False,
dtype=None, force_all_finite=True)
dtype=None, ensure_all_finite=True)

if len(set(user_splits)) != len(user_splits):
raise ValueError("User splits are not unique.")