
Commit fb0562f

[SPARK-22810][ML][PYSPARK] Expose Python API for LinearRegression with huber loss.
## What changes were proposed in this pull request?

Expose Python API for _LinearRegression_ with _huber_ loss.

## How was this patch tested?

Unit test.

Author: Yanbo Liang <[email protected]>

Closes #19994 from yanboliang/spark-22810.
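For context, a minimal usage sketch of the API exposed by this change (a sketch only: it assumes Spark 2.3+ with this commit applied, an active `SparkSession` named `spark`, and the sample dataset shipped in a Spark source checkout):

```python
from pyspark.ml.regression import LinearRegression

# Sample LIBSVM dataset bundled with the Spark source tree (assumption:
# the working directory is a Spark checkout).
df = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

# loss can be "squaredError" (default) or "huber"; epsilon only applies to
# huber loss and must be > 1.0.
lr = LinearRegression(loss="huber", epsilon=1.35)
model = lr.fit(df)

print(model.coefficients, model.intercept)
print(model.scale)  # huber scale estimate; 1.0 when loss="squaredError"
```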
1 parent 0114c89 commit fb0562f

File tree

4 files changed (+96 / -15 lines)


python/pyspark/ml/param/_shared_params_code_gen.py

Lines changed: 2 additions & 1 deletion
@@ -154,7 +154,8 @@ def get$Name(self):
         ("aggregationDepth", "suggested depth for treeAggregate (>= 2).", "2",
          "TypeConverters.toInt"),
         ("parallelism", "the number of threads to use when running parallel algorithms (>= 1).",
-         "1", "TypeConverters.toInt")]
+         "1", "TypeConverters.toInt"),
+        ("loss", "the loss function to be optimized.", None, "TypeConverters.toString")]
 
     code = []
     for name, doc, defaultValueStr, typeConverter in shared:

python/pyspark/ml/param/shared.py

Lines changed: 23 additions & 0 deletions
@@ -632,6 +632,29 @@ def getParallelism(self):
         return self.getOrDefault(self.parallelism)
 
 
+class HasLoss(Params):
+    """
+    Mixin for param loss: the loss function to be optimized.
+    """
+
+    loss = Param(Params._dummy(), "loss", "the loss function to be optimized.", typeConverter=TypeConverters.toString)
+
+    def __init__(self):
+        super(HasLoss, self).__init__()
+
+    def setLoss(self, value):
+        """
+        Sets the value of :py:attr:`loss`.
+        """
+        return self._set(loss=value)
+
+    def getLoss(self):
+        """
+        Gets the value of loss or its default value.
+        """
+        return self.getOrDefault(self.loss)
+
+
 class DecisionTreeParams(Params):
     """
     Mixin for Decision Tree parameters.
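The generated mixin above gives any estimator that inherits it a `loss` Param plus the standard getter/setter pair. A small hedged sketch of how that surfaces on `LinearRegression` once this patch is applied (assumes pyspark with this change is importable):

```python
from pyspark.ml.regression import LinearRegression

lr = LinearRegression()
print(lr.getLoss())    # "squaredError" -- the default set in LinearRegression.__init__
lr.setLoss("huber")    # setter inherited from HasLoss; returns self
print(lr.getLoss())    # "huber"
```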

python/pyspark/ml/regression.py

Lines changed: 50 additions & 14 deletions
@@ -39,23 +39,26 @@
 @inherit_doc
 class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
                        HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept,
-                       HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth,
+                       HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth, HasLoss,
                        JavaMLWritable, JavaMLReadable):
     """
     Linear regression.
 
-    The learning objective is to minimize the squared error, with regularization.
-    The specific squared error loss function used is: L = 1/2n ||A coefficients - y||^2^
+    The learning objective is to minimize the specified loss function, with regularization.
+    This supports two kinds of loss:
 
-    This supports multiple types of regularization:
-
-    * none (a.k.a. ordinary least squares)
+    * squaredError (a.k.a squared loss)
+    * huber (a hybrid of squared error for relatively small errors and absolute error for \
+    relatively large ones, and we estimate the scale parameter from training data)
 
-    * L2 (ridge regression)
+    This supports multiple types of regularization:
 
-    * L1 (Lasso)
+    * none (a.k.a. ordinary least squares)
+    * L2 (ridge regression)
+    * L1 (Lasso)
+    * L2 + L1 (elastic net)
 
-    * L2 + L1 (elastic net)
+    Note: Fitting with huber loss only supports none and L2 regularization.
 
     >>> from pyspark.ml.linalg import Vectors
     >>> df = spark.createDataFrame([
@@ -98,31 +101,42 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
     solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " +
                    "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString)
 
+    loss = Param(Params._dummy(), "loss", "The loss function to be optimized. Supported " +
+                 "options: squaredError, huber.", typeConverter=TypeConverters.toString)
+
+    epsilon = Param(Params._dummy(), "epsilon", "The shape parameter to control the amount of " +
+                    "robustness. Must be > 1.0. Only valid when loss is huber",
+                    typeConverter=TypeConverters.toFloat)
+
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
-                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2):
+                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
+                 loss="squaredError", epsilon=1.35):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
-                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2)
+                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \
+                 loss="squaredError", epsilon=1.35)
         """
         super(LinearRegression, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.regression.LinearRegression", self.uid)
-        self._setDefault(maxIter=100, regParam=0.0, tol=1e-6)
+        self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35)
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.4.0")
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                   maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
-                  standardization=True, solver="auto", weightCol=None, aggregationDepth=2):
+                  standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
+                  loss="squaredError", epsilon=1.35):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                   maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
-                  standardization=True, solver="auto", weightCol=None, aggregationDepth=2)
+                  standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \
+                  loss="squaredError", epsilon=1.35)
         Sets params for linear regression.
         """
         kwargs = self._input_kwargs
@@ -131,6 +145,20 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
     def _create_model(self, java_model):
         return LinearRegressionModel(java_model)
 
+    @since("2.3.0")
+    def setEpsilon(self, value):
+        """
+        Sets the value of :py:attr:`epsilon`.
+        """
+        return self._set(epsilon=value)
+
+    @since("2.3.0")
+    def getEpsilon(self):
+        """
+        Gets the value of epsilon or its default value.
+        """
+        return self.getOrDefault(self.epsilon)
+
 
 class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable):
     """
@@ -155,6 +183,14 @@ def intercept(self):
         """
         return self._call_java("intercept")
 
+    @property
+    @since("2.3.0")
+    def scale(self):
+        """
+        The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
+        """
+        return self._call_java("scale")
+
     @property
     @since("2.0.0")
     def summary(self):
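Putting the new pieces of `regression.py` together, a hedged sketch of the added getter/setter and model attribute (assumes this patch is applied and `train_df` is any DataFrame with "features"/"label" columns; `train_df` is illustrative, not part of the change):

```python
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(loss="huber")
print(lr.getEpsilon())   # 1.35, the default set in __init__
lr = lr.setEpsilon(2.0)  # shape parameter controlling robustness; must be > 1.0

model = lr.fit(train_df)
print(model.scale)       # estimated huber scale; reported as 1.0 for squaredError
```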

python/pyspark/ml/tests.py

Lines changed: 21 additions & 0 deletions
@@ -1726,6 +1726,27 @@ def test_offset(self):
         self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
 
 
+class LinearRegressionTest(SparkSessionTestCase):
+
+    def test_linear_regression_with_huber_loss(self):
+
+        data_path = "data/mllib/sample_linear_regression_data.txt"
+        df = self.spark.read.format("libsvm").load(data_path)
+
+        lir = LinearRegression(loss="huber", epsilon=2.0)
+        model = lir.fit(df)
+
+        expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
+                                1.2612, -0.333, -0.5694, -0.6311, 0.6053]
+        expectedIntercept = 0.1607
+        expectedScale = 9.758
+
+        self.assertTrue(
+            np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
+        self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
+        self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
+
+
 class LogisticRegressionTest(SparkSessionTestCase):
 
     def test_binomial_logistic_regression_with_bound(self):
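To exercise just the new test case, one option is the plain unittest runner (a sketch; it assumes `pyspark` and `pyspark.ml.tests` are importable, e.g. from a built Spark checkout with numpy installed):

```python
import unittest

from pyspark.ml.tests import LinearRegressionTest

# Load and run only the huber-loss regression test added in this commit.
suite = unittest.TestLoader().loadTestsFromTestCase(LinearRegressionTest)
unittest.TextTestRunner(verbosity=2).run(suite)
```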
