Fix LinearSVC handling of sample weights under class_weight="balanced" (scikit-learn#30057)

snath-xoc · ogrisel · jeremiedbb · web-flow · commit 9523006807f2 · 2025-02-11T18:20:00.000Z
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
diff --git a/doc/whats_new/upcoming_changes/sklearn.linear_model/30057.fix.rst b/doc/whats_new/upcoming_changes/sklearn.linear_model/30057.fix.rst
@@ -0,0 +1,5 @@
+- :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.LogisticRegressionCV` now properly pass sample weights to
+  :func:`utils.class_weight.compute_class_weight` when fit with
+  `class_weight="balanced"`.
+  By :user:`Shruti Nath <snath-xoc>` and :user:`Olivier Grisel <ogrisel>`
diff --git a/doc/whats_new/upcoming_changes/sklearn.svm/30057.fix.rst b/doc/whats_new/upcoming_changes/sklearn.svm/30057.fix.rst
@@ -0,0 +1,4 @@
+- :class:`svm.LinearSVC` now properly passes sample weights to
+  :func:`utils.class_weight.compute_class_weight` when fit with
+  `class_weight="balanced"`.
+  By :user:`Shruti Nath <snath-xoc>`
diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30057.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30057.enhancement.rst
@@ -0,0 +1,3 @@
+- :func:`utils.class_weight.compute_class_weight` now properly accounts for
+  sample weights when using strategy "balanced" to calculate class weights.
+  By :user:`Shruti Nath <snath-xoc>`
diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py
@@ -305,7 +305,9 @@ def _logistic_regression_path(
     if isinstance(class_weight, dict) or (
         multi_class == "multinomial" and class_weight is not None
     ):
-        class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
+        class_weight_ = compute_class_weight(
+            class_weight, classes=classes, y=y, sample_weight=sample_weight
+        )
         sample_weight *= class_weight_[le.fit_transform(y)]
 
     # For doing a ovr, we need to mask the labels first. For the
@@ -326,7 +328,10 @@ def _logistic_regression_path(
         # for compute_class_weight
         if class_weight == "balanced":
             class_weight_ = compute_class_weight(
-                class_weight, classes=mask_classes, y=y_bin
+                class_weight,
+                classes=mask_classes,
+                y=y_bin,
+                sample_weight=sample_weight,
             )
             sample_weight *= class_weight_[le.fit_transform(y_bin)]
 
@@ -1981,7 +1986,10 @@ def fit(self, X, y, sample_weight=None, **params):
         # compute the class weights for the entire dataset y
         if class_weight == "balanced":
             class_weight = compute_class_weight(
-                class_weight, classes=np.arange(len(self.classes_)), y=y
+                class_weight,
+                classes=np.arange(len(self.classes_)),
+                y=y,
+                sample_weight=sample_weight,
             )
             class_weight = dict(enumerate(class_weight))
 
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py
@@ -1189,8 +1189,9 @@ def _fit_liblinear(
                 " in the data, but the data contains only one"
                 " class: %r" % classes_[0]
             )
-
-        class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y)
+        class_weight_ = compute_class_weight(
+            class_weight, classes=classes_, y=y, sample_weight=sample_weight
+        )
     else:
         class_weight_ = np.empty(0, dtype=np.float64)
         y_ind = y
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
@@ -600,6 +600,17 @@
         "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
     },
     LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)},
+    LinearSVC: {
+        "check_sample_weight_equivalence": [
+            # TODO: dual=True is a stochastic solver: we cannot rely on
+            # check_sample_weight_equivalence to check the correct handling of
+            # sample_weight and we would need a statistical test instead, see
+            # meta-issue #16298.
+            # dict(max_iter=20, dual=True, tol=1e-12),
+            dict(dual=False, tol=1e-12),
+            dict(dual=False, tol=1e-12, class_weight="balanced"),
+        ]
+    },
     LinearRegression: {
         "check_estimator_sparse_tag": [dict(positive=False), dict(positive=True)],
         "check_sample_weight_equivalence_on_dense_data": [
@@ -615,6 +626,14 @@
             dict(solver="liblinear"),
             dict(solver="newton-cg"),
             dict(solver="newton-cholesky"),
+            dict(solver="newton-cholesky", class_weight="balanced"),
+        ]
+    },
+    LogisticRegressionCV: {
+        "check_sample_weight_equivalence": [
+            dict(solver="lbfgs"),
+            dict(solver="newton-cholesky"),
+            dict(solver="newton-cholesky", class_weight="balanced"),
         ],
         "check_sample_weight_equivalence_on_sparse_data": [
             dict(solver="liblinear"),
diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py
@@ -7,24 +7,27 @@
 from scipy import sparse
 
 from ._param_validation import StrOptions, validate_params
+from .validation import _check_sample_weight
 
 
 @validate_params(
     {
         "class_weight": [dict, StrOptions({"balanced"}), None],
         "classes": [np.ndarray],
         "y": ["array-like"],
+        "sample_weight": ["array-like", None],
     },
     prefer_skip_nested_validation=True,
 )
-def compute_class_weight(class_weight, *, classes, y):
+def compute_class_weight(class_weight, *, classes, y, sample_weight=None):
     """Estimate class weights for unbalanced datasets.
 
     Parameters
     ----------
     class_weight : dict, "balanced" or None
         If "balanced", class weights will be given by
-        `n_samples / (n_classes * np.bincount(y))`.
+        `n_samples / (n_classes * np.bincount(y))` or their weighted equivalent if
+        `sample_weight` is provided.
         If a dictionary is given, keys are classes and values are corresponding class
         weights.
         If `None` is given, the class weights will be uniform.
@@ -36,6 +39,10 @@ def compute_class_weight(class_weight, *, classes, y):
     y : array-like of shape (n_samples,)
         Array of original class labels per sample.
 
+    sample_weight : array-like of shape (n_samples,), default=None
+        Array of weights that are assigned to individual samples. Only used when
+        `class_weight='balanced'`.
+
     Returns
     -------
     class_weight_vect : ndarray of shape (n_classes,)
@@ -69,7 +76,11 @@ def compute_class_weight(class_weight, *, classes, y):
         if not all(np.isin(classes, le.classes_)):
             raise ValueError("classes should have valid labels that are in y")
 
-        recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
+        sample_weight = _check_sample_weight(sample_weight, y)
+        weighted_class_counts = np.bincount(y_ind, weights=sample_weight)
+        recip_freq = weighted_class_counts.sum() / (
+            len(le.classes_) * weighted_class_counts
+        )
         weight = recip_freq[le.transform(classes)]
     else:
         # user-defined dictionary
diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py
@@ -129,14 +129,32 @@ def test_compute_class_weight_balanced_negative():
     assert len(cw) == len(classes)
     assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))
 
-    # Test with unbalanced class labels.
-    y = np.asarray([-1, 0, 0, -2, -2, -2])
 
-    cw = compute_class_weight("balanced", classes=classes, y=y)
-    assert len(cw) == len(classes)
-    class_counts = np.bincount(y + 2)
-    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
-    assert_array_almost_equal(cw, [2.0 / 3, 2.0, 1.0])
+def test_compute_class_weight_balanced_sample_weight_equivalence():
+    # Test with unbalanced and negative class labels for
+    # equivalence between repeated and weighted samples
+
+    classes = np.array([-2, -1, 0])
+    y = np.asarray([-1, -1, 0, 0, -2, -2])
+    sw = np.asarray([1, 0, 1, 1, 1, 2])
+
+    y_rep = np.repeat(y, sw, axis=0)
+
+    class_weights_weighted = compute_class_weight(
+        "balanced", classes=classes, y=y, sample_weight=sw
+    )
+    class_weights_repeated = compute_class_weight("balanced", classes=classes, y=y_rep)
+    assert len(class_weights_weighted) == len(classes)
+    assert len(class_weights_repeated) == len(classes)
+
+    class_counts_weighted = np.bincount(y + 2, weights=sw)
+    class_counts_repeated = np.bincount(y_rep + 2)
+
+    assert np.dot(class_weights_weighted, class_counts_weighted) == pytest.approx(
+        np.dot(class_weights_repeated, class_counts_repeated)
+    )
+
+    assert_allclose(class_weights_weighted, class_weights_repeated)
 
 
 def test_compute_class_weight_balanced_unordered():

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,3 @@
	1	+- :func:`utils.class_weight.compute_class_weight` now properly accounts for
	2	+  sample weights when using strategy "balanced" to calculate class weights.
	3	+  By :user:`Shruti Nath <snath-xoc>`