Skip to content

Commit 7c12ff6

Browse files
wbo4958 authored and zhengruifeng committed
[SPARK-50988][ML][PYTHON][CONNECT] Fix uid inconsistencies for estimator and model
### What changes were proposed in this pull request?

The uid of the model trained by the corresponding estimator is not equal to the uid of the estimator, which is a bug. This PR has fixed this issue.

### Why are the changes needed?

Fix the bug to make the uid of the estimator and model equal.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

The CI passes.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #49674 from wbo4958/uid.

Authored-by: Bobby Wang <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 43785de commit 7c12ff6

File tree

8 files changed

+48
-10
lines changed

8 files changed

+48
-10
lines changed

python/pyspark/ml/tests/test_als.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,19 @@ def test_als(self):
4848
self.assertEqual(als.getSeed(), 1)
4949
self.assertEqual(als.getMaxIter(), 2)
5050

51+
model = als.fit(df)
52+
5153
# Estimator save & load
5254
with tempfile.TemporaryDirectory(prefix="ALS") as d:
5355
als.write().overwrite().save(d)
5456
als2 = ALS.load(d)
5557
self.assertEqual(str(als), str(als2))
5658

57-
model = als.fit(df)
59+
model.write().overwrite().save(d)
60+
model2 = ALSModel.load(d)
61+
self.assertEqual(str(model), str(model2))
62+
63+
self.assertEqual(als.uid, model.uid)
5864
self.assertEqual(model.rank, 10)
5965

6066
self.assertEqual(model.itemFactors.columns, ["id", "features"])
@@ -84,12 +90,6 @@ def test_als(self):
8490
self.assertEqual(output4.columns, ["item", "recommendations"])
8591
self.assertEqual(output4.count(), 3)
8692

87-
# Model save & load
88-
with tempfile.TemporaryDirectory(prefix="als_model") as d:
89-
model.write().overwrite().save(d)
90-
model2 = ALSModel.load(d)
91-
self.assertEqual(str(model), str(model2))
92-
9393
def test_ambiguous_column(self):
9494
data = self.spark.createDataFrame(
9595
[[1, 15, 1], [1, 2, 2], [2, 3, 4], [2, 2, 5]],

python/pyspark/ml/tests/test_classification.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def test_naive_bayes(self):
6464
self.assertEqual(nb.getWeightCol(), "weight")
6565

6666
model = nb.fit(df)
67+
self.assertEqual(model.uid, nb.uid)
6768
self.assertEqual(model.numClasses, 2)
6869
self.assertEqual(model.numFeatures, 2)
6970
self.assertTrue(
@@ -126,6 +127,7 @@ def test_binomial_logistic_regression_with_bound(self):
126127
upperBoundsOnIntercepts=Vectors.dense(0.0),
127128
)
128129
lor_model = lor.fit(df)
130+
self.assertEqual(lor.uid, lor_model.uid)
129131

130132
def check_result(model: LogisticRegressionModel) -> None:
131133
self.assertTrue(
@@ -159,6 +161,7 @@ def test_multinomial_logistic_regression_with_bound(self):
159161
upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0),
160162
)
161163
lor_model = lor.fit(df)
164+
self.assertEqual(lor.uid, lor_model.uid)
162165

163166
def check_result(model: LogisticRegressionModel) -> None:
164167
expected = [
@@ -196,6 +199,7 @@ def test_logistic_regression_with_threshold(self):
196199

197200
lor = LogisticRegression(weightCol="weight")
198201
model = lor.fit(df)
202+
self.assertEqual(lor.uid, model.uid)
199203

200204
# status changes 1
201205
for t in [0.0, 0.1, 0.2, 0.5, 1.0]:
@@ -224,6 +228,7 @@ def test_binary_logistic_regression_summary(self):
224228
)
225229
lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
226230
model = lr.fit(df)
231+
self.assertEqual(lr.uid, model.uid)
227232
self.assertTrue(model.hasSummary)
228233
s = model.summary
229234
# test that api is callable and returns expected types
@@ -385,6 +390,7 @@ def test_linear_svc(self):
385390
self.assertEqual(svc.getRegParam(), 1.0)
386391

387392
model = svc.fit(df)
393+
self.assertEqual(svc.uid, model.uid)
388394
self.assertEqual(model.numClasses, 2)
389395
self.assertEqual(model.numFeatures, 2)
390396
self.assertTrue(np.allclose(model.intercept, 0.025877458475338313, atol=1e-4))
@@ -464,6 +470,7 @@ def test_decision_tree_classifier(self):
464470
self.assertEqual(dt.getLeafCol(), "leaf")
465471

466472
model = dt.fit(df)
473+
self.assertEqual(dt.uid, model.uid)
467474
self.assertEqual(model.numClasses, 2)
468475
self.assertEqual(model.numFeatures, 2)
469476
self.assertEqual(model.depth, 2)
@@ -531,6 +538,7 @@ def test_gbt_classifier(self):
531538
self.assertEqual(gbt.getLeafCol(), "leaf")
532539

533540
model = gbt.fit(df)
541+
self.assertEqual(gbt.uid, model.uid)
534542
self.assertEqual(model.numClasses, 2)
535543
self.assertEqual(model.numFeatures, 2)
536544
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
@@ -609,6 +617,7 @@ def test_binary_random_forest_classifier(self):
609617
self.assertEqual(rf.getLeafCol(), "leaf")
610618

611619
model = rf.fit(df)
620+
self.assertEqual(rf.uid, model.uid)
612621
self.assertEqual(model.numClasses, 2)
613622
self.assertEqual(model.numFeatures, 2)
614623
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
@@ -695,6 +704,7 @@ def test_multiclass_random_forest_classifier(self):
695704
self.assertEqual(rf.getLeafCol(), "leaf")
696705

697706
model = rf.fit(df)
707+
self.assertEqual(rf.uid, model.uid)
698708
self.assertEqual(model.numClasses, 3)
699709
self.assertEqual(model.numFeatures, 2)
700710
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel

python/pyspark/ml/tests/test_clustering.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def test_kmeans(self):
6464
self.assertEqual(km.getWeightCol(), "weight")
6565

6666
model = km.fit(df)
67+
self.assertEqual(km.uid, model.uid)
6768
# TODO: support KMeansModel.numFeatures in Python
6869
# self.assertEqual(model.numFeatures, 2)
6970

@@ -132,6 +133,7 @@ def test_bisecting_kmeans(self):
132133
self.assertEqual(bkm.getWeightCol(), "weight")
133134

134135
model = bkm.fit(df)
136+
self.assertEqual(bkm.uid, model.uid)
135137
# TODO: support KMeansModel.numFeatures in Python
136138
# self.assertEqual(model.numFeatures, 2)
137139

@@ -203,6 +205,7 @@ def test_gaussian_mixture(self):
203205
self.assertEqual(gmm.getSeed(), 1)
204206

205207
model = gmm.fit(df)
208+
self.assertEqual(gmm.uid, model.uid)
206209
# TODO: support GMM.numFeatures in Python
207210
# self.assertEqual(model.numFeatures, 2)
208211
self.assertEqual(len(model.weights), 2)

python/pyspark/ml/tests/test_feature.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def test_string_indexer(self):
123123
# single input
124124
si = StringIndexer(inputCol="label1", outputCol="index1")
125125
model = si.fit(df.select("label1"))
126+
self.assertEqual(si.uid, model.uid)
126127

127128
# read/write
128129
with tempfile.TemporaryDirectory(prefix="string_indexer") as tmp_dir:
@@ -183,6 +184,7 @@ def test_pca(self):
183184
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
184185

185186
model = pca.fit(df)
187+
self.assertEqual(pca.uid, model.uid)
186188
self.assertEqual(model.getK(), 2)
187189
self.assertTrue(
188190
np.allclose(model.explainedVariance.toArray(), [0.79439, 0.20560], atol=1e-4)
@@ -272,6 +274,7 @@ def test_standard_scaler(self):
272274
self.assertEqual(scaler.getOutputCol(), "scaled")
273275

274276
model = scaler.fit(df)
277+
self.assertEqual(scaler.uid, model.uid)
275278
self.assertTrue(np.allclose(model.mean.toArray(), [1.66666667], atol=1e-4))
276279
self.assertTrue(np.allclose(model.std.toArray(), [1.52752523], atol=1e-4))
277280

@@ -311,6 +314,7 @@ def test_maxabs_scaler(self):
311314
self.assertEqual(scaler.getOutputCol(), "scaled")
312315

313316
model = scaler.fit(df)
317+
self.assertEqual(scaler.uid, model.uid)
314318
self.assertTrue(np.allclose(model.maxAbs.toArray(), [3.0], atol=1e-4))
315319

316320
output = model.transform(df)
@@ -349,6 +353,7 @@ def test_minmax_scaler(self):
349353
self.assertEqual(scaler.getOutputCol(), "scaled")
350354

351355
model = scaler.fit(df)
356+
self.assertEqual(scaler.uid, model.uid)
352357
self.assertTrue(np.allclose(model.originalMax.toArray(), [3.0], atol=1e-4))
353358
self.assertTrue(np.allclose(model.originalMin.toArray(), [0.0], atol=1e-4))
354359

@@ -388,6 +393,7 @@ def test_robust_scaler(self):
388393
self.assertEqual(scaler.getOutputCol(), "scaled")
389394

390395
model = scaler.fit(df)
396+
self.assertEqual(scaler.uid, model.uid)
391397
self.assertTrue(np.allclose(model.range.toArray(), [3.0], atol=1e-4))
392398
self.assertTrue(np.allclose(model.median.toArray(), [2.0], atol=1e-4))
393399

@@ -422,6 +428,7 @@ def test_chi_sq_selector(self):
422428
self.assertEqual(selector.getOutputCol(), "selectedFeatures")
423429

424430
model = selector.fit(df)
431+
self.assertEqual(selector.uid, model.uid)
425432
self.assertEqual(model.selectedFeatures, [2])
426433

427434
output = model.transform(df)
@@ -456,6 +463,7 @@ def test_univariate_selector(self):
456463
self.assertEqual(selector.getSelectionThreshold(), 1)
457464

458465
model = selector.fit(df)
466+
self.assertEqual(selector.uid, model.uid)
459467
self.assertEqual(model.selectedFeatures, [3])
460468

461469
output = model.transform(df)
@@ -487,6 +495,7 @@ def test_variance_threshold_selector(self):
487495
self.assertEqual(selector.getOutputCol(), "selectedFeatures")
488496

489497
model = selector.fit(df)
498+
self.assertEqual(selector.uid, model.uid)
490499
self.assertEqual(model.selectedFeatures, [2])
491500

492501
output = model.transform(df)
@@ -516,6 +525,7 @@ def test_word2vec(self):
516525
self.assertEqual(w2v.getMaxIter(), 1)
517526

518527
model = w2v.fit(df)
528+
self.assertEqual(w2v.uid, model.uid)
519529
self.assertEqual(model.getVectors().columns, ["word", "vector"])
520530
self.assertEqual(model.getVectors().count(), 3)
521531

@@ -567,6 +577,7 @@ def test_imputer(self):
567577
self.assertEqual(imputer.getOutputCols(), ["out_a", "out_b"])
568578

569579
model = imputer.fit(df)
580+
self.assertEqual(imputer.uid, model.uid)
570581
self.assertEqual(model.surrogateDF.columns, ["a", "b"])
571582
self.assertEqual(model.surrogateDF.count(), 1)
572583
self.assertEqual(list(model.surrogateDF.head()), [3.0, 4.0])
@@ -598,6 +609,7 @@ def test_count_vectorizer(self):
598609
self.assertEqual(cv.getOutputCol(), "vectors")
599610

600611
model = cv.fit(df)
612+
self.assertEqual(cv.uid, model.uid)
601613
self.assertEqual(sorted(model.vocabulary), ["a", "b", "c"])
602614

603615
output = model.transform(df)
@@ -624,6 +636,7 @@ def test_one_hot_encoder(self):
624636
self.assertEqual(encoder.getOutputCols(), ["output"])
625637

626638
model = encoder.fit(df)
639+
self.assertEqual(encoder.uid, model.uid)
627640
self.assertEqual(model.categorySizes, [3])
628641

629642
output = model.transform(df)
@@ -900,6 +913,7 @@ def test_idf(self):
900913
self.assertListEqual(idf.params, [idf.inputCol, idf.minDocFreq, idf.outputCol])
901914

902915
model = idf.fit(df, {idf.outputCol: "idf"})
916+
self.assertEqual(idf.uid, model.uid)
903917
# self.assertEqual(
904918
# model.uid, idf.uid, "Model should inherit the UID from its parent estimator."
905919
# )
@@ -1012,6 +1026,7 @@ def test_count_vectorizer_with_binary(self):
10121026
)
10131027
cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
10141028
model = cv.fit(dataset)
1029+
self.assertEqual(cv.uid, model.uid)
10151030

10161031
transformedList = model.transform(dataset).select("features", "expected").collect()
10171032

@@ -1047,6 +1062,8 @@ def test_count_vectorizer_with_maxDF(self):
10471062
)
10481063
cv = CountVectorizer(inputCol="words", outputCol="features")
10491064
model1 = cv.setMaxDF(3).fit(dataset)
1065+
self.assertEqual(cv.uid, model1.uid)
1066+
10501067
self.assertEqual(model1.vocabulary, ["b", "c", "d"])
10511068

10521069
transformedList1 = model1.transform(dataset).select("features", "expected").collect()
@@ -1119,6 +1136,8 @@ def test_rformula_force_index_label(self):
11191136
# Does not index label by default since it's numeric type.
11201137
rf = RFormula(formula="y ~ x + s")
11211138
model = rf.fit(df)
1139+
self.assertEqual(rf.uid, model.uid)
1140+
11221141
transformedDF = model.transform(df)
11231142
self.assertEqual(transformedDF.head().label, 1.0)
11241143
# Force to index label.

python/pyspark/ml/tests/test_fpm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_fp_growth(self):
4747
self.assertEqual(fp.getNumPartitions(), 1)
4848

4949
model = fp.fit(df)
50-
50+
self.assertEqual(fp.uid, model.uid)
5151
self.assertEqual(model.freqItemsets.columns, ["items", "freq"])
5252
self.assertEqual(model.freqItemsets.count(), 54)
5353

python/pyspark/ml/tests/test_regression.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def test_linear_regression(self):
7171
self.assertEqual(lr.getWeightCol(), "weight")
7272

7373
model = lr.fit(df)
74+
self.assertEqual(lr.uid, model.uid)
7475
self.assertEqual(model.numFeatures, 2)
7576
self.assertTrue(np.allclose(model.scale, 1.0, atol=1e-4))
7677
self.assertTrue(np.allclose(model.intercept, -0.35, atol=1e-4))
@@ -280,6 +281,7 @@ def test_decision_tree_regressor(self):
280281
self.assertEqual(dt.getLeafCol(), "leaf")
281282

282283
model = dt.fit(df)
284+
self.assertEqual(dt.uid, model.uid)
283285
self.assertEqual(model.numFeatures, 2)
284286
self.assertEqual(model.depth, 2)
285287
self.assertEqual(model.numNodes, 5)
@@ -337,6 +339,7 @@ def test_gbt_regressor(self):
337339
self.assertEqual(gbt.getLeafCol(), "leaf")
338340

339341
model = gbt.fit(df)
342+
self.assertEqual(gbt.uid, model.uid)
340343
self.assertEqual(model.numFeatures, 2)
341344
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
342345
# model.trees
@@ -412,6 +415,7 @@ def test_random_forest_regressor(self):
412415
self.assertEqual(rf.getLeafCol(), "leaf")
413416

414417
model = rf.fit(df)
418+
self.assertEqual(rf.uid, model.uid)
415419
self.assertEqual(model.numFeatures, 2)
416420
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
417421
# model.trees

python/pyspark/ml/util.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,9 @@ def wrapped(self: "JavaEstimator", dataset: "ConnectDataFrame") -> Any:
135135
(_, properties, _) = client.execute_command(command)
136136
model_info = deserialize(properties)
137137
client.add_ml_cache(model_info.obj_ref.id)
138-
return model_info.obj_ref.id
138+
model = self._create_model(model_info.obj_ref.id)
139+
model._resetUid(self.uid)
140+
return self._copyValues(model)
139141
else:
140142
return f(self, dataset)
141143

python/pyspark/ml/wrapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,6 @@ def _create_model(self, java_model: "JavaObject") -> JM:
387387
"""
388388
raise NotImplementedError()
389389

390-
@try_remote_fit
391390
def _fit_java(self, dataset: DataFrame) -> "JavaObject":
392391
"""
393392
Fits a Java model to the input dataset.
@@ -407,6 +406,7 @@ def _fit_java(self, dataset: DataFrame) -> "JavaObject":
407406
self._transfer_params_to_java()
408407
return self._java_obj.fit(dataset._jdf)
409408

409+
@try_remote_fit
410410
def _fit(self, dataset: DataFrame) -> JM:
411411
java_model = self._fit_java(dataset)
412412
model = self._create_model(java_model)

0 commit comments

Comments (0)