Merge pull request #64 from smurching/spark-2.2-patch

thunterdb · web-flow · commit 1a4cd4e20547 · 2017-09-20T11:09:33.000-07:00
[#63] Remove usage of deprecated scikit-learn API in GridSearchCV
diff --git a/python/README.md b/python/README.md
@@ -65,7 +65,10 @@ More extensive documentation (generated with Sphinx) is available in the `python
 ## Changelog
 
 - 2015-12-10 First public release (0.1)
-- 2016-08-16 Minor release:
-   1. the official Spark target is Spark 0.2
+- 2016-08-16 Minor release (0.2.0):
+   1. the official Spark target is Spark 2.0
    2. support for keyed models
+- 2017-09-14 Minor release (0.2.2):
+   1. the official Spark target is Spark >= 2.1
+
 
diff --git a/python/requirements.txt b/python/requirements.txt
@@ -1,2 +1,2 @@
 # This file should list any python package dependencies.
-scikit-learn==0.18.1
+scikit-learn>=0.18.1, <=0.19
diff --git a/python/setup.py b/python/setup.py
@@ -19,7 +19,7 @@
     "Programming Language :: Python",
     "Topic :: Scientific/Engineering",
 ]
-INSTALL_REQUIRES = ["scikit-learn >= 0.18.1"]
+INSTALL_REQUIRES = ["scikit-learn >=0.18.1, <= 0.19"]
 
 # Project root
 ROOT = os.path.abspath(os.getcwd() + "/")
diff --git a/python/spark_sklearn/grid_search.py b/python/spark_sklearn/grid_search.py
@@ -2,9 +2,6 @@
 Class for parallelizing GridSearchCV jobs in scikit-learn
 """
 
-import sys
-
-from itertools import product
 from collections import defaultdict, Sized
 from functools import partial
 import warnings
@@ -115,21 +112,23 @@ class GridSearchCV(BaseSearchCV):
     Examples
     --------
     >>> from sklearn import svm, datasets
-    >>> from sklearn.model_selection import GridSearchCV
+    >>> from spark_sklearn.grid_search import GridSearchCV
+    >>> from spark_sklearn.util import createLocalSparkSession
+    >>> sc = createLocalSparkSession().sparkContext
     >>> iris = datasets.load_iris()
     >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
     >>> svr = svm.SVC()
-    >>> clf = GridSearchCV(svr, parameters)
+    >>> clf = GridSearchCV(sc, svr, parameters)
     >>> clf.fit(iris.data, iris.target)
     ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
     GridSearchCV(cv=None, error_score=...,
            estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
-                         decision_function_shape=None, degree=..., gamma=...,
+                         decision_function_shape=..., degree=..., gamma=...,
                          kernel='rbf', max_iter=-1, probability=False,
                          random_state=None, shrinking=True, tol=...,
                          verbose=False),
            fit_params={}, iid=..., n_jobs=1,
-           param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
+           param_grid=..., pre_dispatch=..., refit=...,
            scoring=..., verbose=...)
     >>> sorted(clf.cv_results_.keys())
     ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
@@ -242,9 +241,12 @@ def __init__(self, sc, estimator, param_grid, scoring=None, fit_params=None,
                  n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                  pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True):
         super(GridSearchCV, self).__init__(
-            estimator=estimator, scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid,
+            estimator=estimator, scoring=scoring, n_jobs=n_jobs, iid=iid,
             refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)
+
+        self.fit_params = fit_params if fit_params is not None else {}
+
         self.sc = sc
         self.param_grid = param_grid
 
diff --git a/python/spark_sklearn/tests/test_grid_search_1.py b/python/spark_sklearn/tests/test_grid_search_1.py
@@ -32,6 +32,10 @@ def do_test_expected(*kwargs):
     return do_test_expected
         
 def _add_to_module():
+    # NOTE: For scikit-learn >= 0.18, this does not actually run the scikit-learn tests against
+    # SPGridSearchWrapper: those tests (in sklearn.model_selection.tests) use
+    # sklearn.model_selection.GridSearchCV, not the sklearn.grid_search.GridSearchCV patched below.
+    # TODO: Get the scikit-learn tests to pass with the spark-sklearn GridSearch implementation.
     SKGridSearchCV = sklearn.grid_search.GridSearchCV
     sklearn.grid_search.GridSearchCV = SPGridSearchWrapper
     sklearn.grid_search.GridSearchCV_original = SKGridSearchCV

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`# This file should list any python package dependencies.`
`2`		`-scikit-learn==0.18.1`
	`2`	`+scikit-learn>=0.18.1, <=0.19`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`"Programming Language :: Python",`
`20`	`20`	`"Topic :: Scientific/Engineering",`
`21`	`21`	`]`
`22`		`-INSTALL_REQUIRES = ["scikit-learn >= 0.18.1"]`
	`22`	`+INSTALL_REQUIRES = ["scikit-learn >=0.18.1, <= 0.19"]`
`23`	`23`
`24`	`24`	`# Project root`
`25`	`25`	`ROOT = os.path.abspath(os.getcwd() + "/")`