This repository was archived by the owner on Dec 4, 2019. It is now read-only.

Commit 9f67a74

Clean up sklearn warnings (#94)
1 parent: 552b546

File tree: 5 files changed (+13, -14 lines)

README.rst

Lines changed: 3 additions & 3 deletions

@@ -30,7 +30,7 @@ This project is also available as as `Spark package <https://spark-packages.org/

 The developer version has the following requirements:

-- a recent release of scikit-learn. Releases 0.18.1, 0.19.0 have been tested, older versions may work too.
+- scikit-learn 0.18 or 0.19. Later versions may work, but tests currently are incompatible with 0.20.
 - Spark >= 2.1.1. Spark may be downloaded from the `Spark website <https://spark.apache.org/>`_.
 In order to use this package, you need to use the pyspark interpreter or another Spark-compliant python
 interpreter. See the `Spark guide <https://spark.apache.org/docs/latest/programming-guide.html#overview>`_
@@ -61,11 +61,11 @@ on how to install the package.

 .. code:: python

-    from sklearn import svm, grid_search, datasets
+    from sklearn import svm, datasets
     from spark_sklearn import GridSearchCV
     iris = datasets.load_iris()
     parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
-    svr = svm.SVC()
+    svr = svm.SVC(gamma='auto')
     clf = GridSearchCV(sc, svr, parameters)
     clf.fit(iris.data, iris.target)
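
For context on the README change above: on scikit-learn 0.20, constructing SVC without an explicit gamma emits a FutureWarning about the default moving from 'auto' to 'scale' in 0.22, which is the kind of noise this commit removes. A minimal sketch, not part of the commit, using only the public sklearn API (the first fit warns on 0.20, the second stays quiet):

import warnings

from sklearn import datasets, svm

iris = datasets.load_iris()

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    svm.SVC().fit(iris.data, iris.target)              # implicit gamma: FutureWarning on 0.20
    svm.SVC(gamma='auto').fit(iris.data, iris.target)  # pinned gamma, as in the updated README
    print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])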

python/spark_sklearn/converter_test.py

Lines changed: 1 addition & 2 deletions

@@ -26,7 +26,7 @@ def _compare_GLMs(self, skl, spark):
         self.assertEqual(skl.intercept_, spark.intercept)

     def test_LogisticRegression_skl2spark(self):
-        skl_lr = SKL_LogisticRegression().fit(self.X, self.y)
+        skl_lr = SKL_LogisticRegression(solver='lbfgs').fit(self.X, self.y)
         lr = self.converter.toSpark(skl_lr)
         self.assertTrue(isinstance(lr, LogisticRegressionModel),
                         "Expected LogisticRegressionModel but found type %s" % type(lr))
@@ -72,7 +72,6 @@ def ztest_toPandas(self):
 @fixtureReuseSparkSession
 class CSRVectorUDTTests(MLlibTestCase):

-    @unittest.skip("CSR Matrix support not present for Spark 2.0 - see issue #24")
     def test_scipy_sparse(self):
         data = [(self.list2csr([0.1, 0.2]),)]
         df = self.sql.createDataFrame(data, ["features"])
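
The solver='lbfgs' pin above is the same kind of cleanup: scikit-learn 0.20 and 0.21 warn that LogisticRegression's default solver will change to 'lbfgs' in 0.22 unless a solver is named. A standalone sketch with made-up data (not the fixtures these tests use):

import warnings

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])  # toy data for illustration only
y = np.array([0, 0, 1, 1])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LogisticRegression().fit(X, y)                # implicit solver: FutureWarning on 0.20/0.21
    LogisticRegression(solver='lbfgs').fit(X, y)  # explicit solver, as in this commit
    print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])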

python/spark_sklearn/grid_search.py

Lines changed: 3 additions & 3 deletions

@@ -117,11 +117,11 @@ class GridSearchCV(BaseSearchCV):
     >>> sc = createLocalSparkSession().sparkContext
     >>> iris = datasets.load_iris()
     >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
-    >>> svr = svm.SVC()
+    >>> svr = svm.SVC(gamma='auto')
     >>> clf = GridSearchCV(sc, svr, parameters)
     >>> clf.fit(iris.data, iris.target)
     ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-    GridSearchCV(cv=None, error_score=...,
+    GridSearchCV(cv=3, error_score=...,
         estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
             decision_function_shape=..., degree=..., gamma=...,
             kernel='rbf', max_iter=-1, probability=False,
@@ -242,7 +242,7 @@ class GridSearchCV(BaseSearchCV):


     def __init__(self, sc, estimator, param_grid, scoring=None, fit_params=None,
-                 n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
+                 n_jobs=1, iid=True, refit=True, cv=3, verbose=0,
                  pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True):
         super(GridSearchCV, self).__init__(
             estimator=estimator, scoring=scoring, n_jobs=n_jobs, iid=iid,
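
Changing the default from cv=None to cv=3 keeps the doctest repr stable and stops relying on scikit-learn's implicit cross-validation default, which newer releases flag with a FutureWarning before moving it from 3 to 5 folds. A rough illustration using plain sklearn.model_selection.GridSearchCV, since the spark_sklearn class needs a live SparkContext; whether the first call actually warns depends on the installed scikit-learn version:

import warnings

from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    GridSearchCV(svm.SVC(gamma='auto'), parameters).fit(iris.data, iris.target)        # implicit cv
    GridSearchCV(svm.SVC(gamma='auto'), parameters, cv=3).fit(iris.data, iris.target)  # explicit cv=3
    print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])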

python/spark_sklearn/tests/test_grid_search_2.py

Lines changed: 4 additions & 4 deletions

@@ -35,7 +35,7 @@ def test_example(self):
         # The classic example from the sklearn documentation
         iris = datasets.load_iris()
         parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
-        svr = svm.SVC()
+        svr = svm.SVC(gamma='auto')
         clf = grid_search.GridSearchCV(svr, parameters)
         clf.fit(iris.data, iris.target)

@@ -55,7 +55,7 @@ def setUp(self):

     def test_cv_linreg(self):
         pipeline = SKL_Pipeline([
-            ('lasso', SKL_Lasso(max_iter=1))
+            ('lasso', SKL_Lasso())
         ])
         parameters = {
             'lasso__alpha': (0.001, 0.005, 0.01)
@@ -70,7 +70,7 @@ def test_cv_pipeline(self):
         pipeline = SKL_Pipeline([
             ('vect', SKL_HashingVectorizer(n_features=20)),
             ('tfidf', SKL_TfidfTransformer(use_idf=False)),
-            ('lasso', SKL_Lasso(max_iter=1))
+            ('lasso', SKL_Lasso())
         ])
         parameters = {
             'lasso__alpha': (0.001, 0.005, 0.01)
@@ -113,7 +113,7 @@ def test_cv_lasso_with_mllib_featurization(self):
         df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

         pipeline = SKL_Pipeline([
-            ('lasso', SKL_Lasso(max_iter=1))
+            ('lasso', SKL_Lasso())
         ])
         parameters = {
             'lasso__alpha': (0.001, 0.005, 0.01)
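
Dropping max_iter=1 restores Lasso's default of 1000 iterations: a single coordinate-descent pass usually stops short of the tolerance, so every fit inside the grid search prints a ConvergenceWarning. A standalone sketch with synthetic data (not the fixtures these tests use):

import warnings

import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X, y = rng.rand(50, 5), rng.rand(50)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Lasso(alpha=0.001, max_iter=1).fit(X, y)  # old test setup: typically warns about non-convergence
    Lasso(alpha=0.001).fit(X, y)              # new test setup: default max_iter=1000, usually quiet
    print([str(w.message) for w in caught if issubclass(w.category, ConvergenceWarning)])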

python/spark_sklearn/tests/test_keyed_models.py

Lines changed: 2 additions & 2 deletions

@@ -249,7 +249,7 @@ def makeOutput(estimator, X):
         _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])

     def test_transformer(self):
-        minExamples = 1
+        minExamples = 2
         featureGen = lambda: np.random.random(KeyedModelTests.NDIM)
         labelGen = None
         self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
@@ -277,7 +277,7 @@ def test_classification_predictor(self):
         # Need to ensure each user has at least one of each label to train on.
         cyc = cycle([-1, 1])
         labelGen = lambda: next(cyc)
-        lr = LogisticRegression(random_state=0)
+        lr = LogisticRegression(solver='lbfgs', random_state=0)
         self.checkKeyedModelEquivalent(minExamples, featureGen, labelGen,
                                        sklearnEstimator=lr, yCol="y")

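
The same solver pin appears here. One way to double-check this kind of cleanup, not part of the commit, is to escalate FutureWarning to an error so any remaining implicit default fails loudly instead of scrolling past in the test log; the data below is made up for illustration:

import warnings

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])  # toy data; labels mirror the cycle([-1, 1]) used in the test
y = np.array([-1, 1, -1, 1])

with warnings.catch_warnings():
    # Treat deprecation-style warnings as errors within this block only.
    warnings.simplefilter("error", FutureWarning)
    # The pinned solver from this commit is expected to fit cleanly here;
    # LogisticRegression(random_state=0) alone would raise on scikit-learn 0.20/0.21.
    LogisticRegression(solver='lbfgs', random_state=0).fit(X, y)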

0 commit comments
