Works except for the same tests that fail on master (and the one test that also fails with the sklearn grid_search implementation). Still need to clean up the code before pushing.

ajaysaini725 · ajaysaini725 · commit fdf8eacd9482 · 2017-06-23T15:53:42.000-07:00
diff --git a/python/run-tests.sh b/python/run-tests.sh
@@ -10,4 +10,4 @@ if [ "$#" = 0 ]; then
 else
     ARGS="$@"
 fi
-exec nosetests $ARGS --where $DIR --nocapture
+exec nosetests $ARGS --where $DIR
diff --git a/python/spark_sklearn/grid_search.py b/python/spark_sklearn/grid_search.py
@@ -22,6 +22,9 @@
 from sklearn.utils.fixes import MaskedArray
 from sklearn.utils.validation import _num_samples, indexable
 
+from sklearn.externals.joblib import Parallel, delayed
+
+
 class GridSearchCV(BaseSearchCV):
     """Exhaustive search over specified parameter values for an estimator, using Spark to
     distribute the computations.
@@ -169,13 +172,10 @@ def __init__(self, sc, estimator, param_grid, scoring=None, fit_params=None,
         super(GridSearchCV, self).__init__(
             estimator, scoring, fit_params, n_jobs, iid,
             refit, cv, verbose, pre_dispatch, error_score, return_train_score)
-        # super(GridSearchCV, self).__init__(
-        #     estimator, scoring, fit_params, n_jobs, iid,
-        #     refit, cv, verbose, pre_dispatch, error_score)
         self.sc = sc
         self.param_grid = param_grid
-        # self.grid_scores_ = None
-        self.cv_results_ = None # new
+
+        self.cv_results_ = None
         _check_param_grid(param_grid)
 
     def fit_old(self, X, y=None):
@@ -193,14 +193,9 @@ def fit_old(self, X, y=None):
             None for unsupervised learning.
 
         """
-        # print "Exiting"
-        # sys.exit(0)
         return self._fit(X, y, ParameterGrid(self.param_grid))
 
     
-
-
-    #ef _fit(self, X, y, parameter_iterable, groups=None):
     def fit(self, X, y=None, groups=None, **fit_params):
 
       if self.fit_params is not None:
@@ -223,8 +218,6 @@ def fit(self, X, y=None, groups=None, **fit_params):
         X, y, groups = indexable(X, y, groups)
         n_splits = cv.get_n_splits(X, y, groups)
         # Regenerate parameter iterable for each fit
-        #candidate_params = list(self._get_param_iterator())
-        #candidate_params = parameter_iterable # change later
         candidate_params = ParameterGrid(self.param_grid)
         n_candidates = len(candidate_params)
         if self.verbose > 0:
@@ -235,8 +228,7 @@ def fit(self, X, y=None, groups=None, **fit_params):
         base_estimator = clone(self.estimator)
 
         param_grid = [(parameters, train, test) for parameters, (train, test) in product(candidate_params, cv.split(X, y, groups))]
-        #print "PARAM GRID:",param_grid,"\n"
-        #sys.exit(0)
+
         # Because the original python code expects a certain order for the elements, we need to
         # respect it.
         indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
@@ -246,7 +238,6 @@ def fit(self, X, y=None, groups=None, **fit_params):
 
         scorer = self.scorer_
         verbose = self.verbose
-        #fit_params = self.fit_params # DEPRECIATED: remove later
         error_score = self.error_score
         return_train_score = self.return_train_score
         fas = _fit_and_score
@@ -263,47 +254,14 @@ def fun(tup):
                                   return_parameters=False, error_score=error_score)
             return (index, res)
         indexed_out0 = dict(par_param_grid.map(fun).collect())
-        #print "Indexed out:",indexed_out0,"\n"
         out = [indexed_out0[idx] for idx in range(len(param_grid))]
         if return_train_score:
             (train_scores, test_scores, test_sample_counts, fit_time,
              score_time) = zip(*out)
         else:
             (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)
-        #print "TRAIN SCORES:",train_scores
-        #print "SCORE TIME:",score_time
-
-        # print "OUT:",out,"\n"
-        # print "OUT[0]:",out[0]
-        # print "OUT[0].keys:",out[0].keys()
-        # sys.exit(0)
         X_bc.unpersist()
         y_bc.unpersist()
-        #print "GOT HERE?!?!?!? - shouldn't happen"
-
-
-
-        # pre_dispatch = self.pre_dispatch
-
-        # out = Parallel(
-        #     n_jobs=self.n_jobs, verbose=self.verbose,
-        #     pre_dispatch=pre_dispatch
-        # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
-        #                           train, test, self.verbose, parameters,
-        #                           fit_params=fit_params,
-        #                           return_train_score=self.return_train_score,
-        #                           return_n_test_samples=True,
-        #                           return_times=True, return_parameters=False,
-        #                           error_score=self.error_score)
-        #   for parameters, (train, test) in product(candidate_params,
-        #                                            cv.split(X, y, groups)))
-
-        # # if one choose to see train score, "out" will contain train score info
-        # if self.return_train_score:
-        #     (train_scores, test_scores, test_sample_counts, fit_time,
-        #      score_time) = zip(*out)
-        # else:
-        #     (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)
 
         results = dict()
 
@@ -379,6 +337,150 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             self.best_estimator_ = best_estimator
         return self
 
+    # def _get_param_iterator(self):
+    #     """Return ParameterGrid instance for the given param_grid"""
+    #     return ParameterGrid(self.param_grid)
+
+
+    # def fit(self, X, y=None, groups=None, **fit_params):
+    #         """Run fit with all sets of parameters.
+    #         Parameters
+    #         ----------
+    #         X : array-like, shape = [n_samples, n_features]
+    #             Training vector, where n_samples is the number of samples and
+    #             n_features is the number of features.
+    #         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+    #             Target relative to X for classification or regression;
+    #             None for unsupervised learning.
+    #         groups : array-like, with shape (n_samples,), optional
+    #             Group labels for the samples used while splitting the dataset into
+    #             train/test set.
+    #         **fit_params : dict of string -> object
+    #             Parameters passed to the ``fit`` method of the estimator
+    #         """
+    #         if self.fit_params is not None:
+    #             warnings.warn('"fit_params" as a constructor argument was '
+    #                           'deprecated in version 0.19 and will be removed '
+    #                           'in version 0.21. Pass fit parameters to the '
+    #                           '"fit" method instead.', DeprecationWarning)
+    #             if fit_params:
+    #                 warnings.warn('Ignoring fit_params passed as a constructor '
+    #                               'argument in favor of keyword arguments to '
+    #                               'the "fit" method.', RuntimeWarning)
+    #             else:
+    #                 fit_params = self.fit_params
+    #         estimator = self.estimator
+    #         cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
+    #         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+
+    #         X, y, groups = indexable(X, y, groups)
+    #         n_splits = cv.get_n_splits(X, y, groups)
+    #         # Regenerate parameter iterable for each fit
+    #         candidate_params = list(self._get_param_iterator())
+    #         #candidate_params = ParameterGrid(self.param_grid)
+    #         n_candidates = len(candidate_params)
+    #         if self.verbose > 0:
+    #             print("Fitting {0} folds for each of {1} candidates, totalling"
+    #                   " {2} fits".format(n_splits, n_candidates,
+    #                                      n_candidates * n_splits))
+
+    #         base_estimator = clone(self.estimator)
+    #         pre_dispatch = self.pre_dispatch
+
+    #         out = Parallel(
+    #             n_jobs=self.n_jobs, verbose=self.verbose,
+    #             pre_dispatch=pre_dispatch
+    #         )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
+    #                                   train, test, self.verbose, parameters,
+    #                                   fit_params=fit_params,
+    #                                   return_train_score=self.return_train_score,
+    #                                   return_n_test_samples=True,
+    #                                   return_times=True, return_parameters=False,
+    #                                   error_score=self.error_score)
+    #           for parameters, (train, test) in product(candidate_params,
+    #                                                    cv.split(X, y, groups)))
+
+    #         # if one chooses to see train scores, "out" will contain train score info
+    #         if self.return_train_score:
+    #             (train_scores, test_scores, test_sample_counts, fit_time,
+    #              score_time) = zip(*out)
+    #         else:
+    #             (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)
+
+    #         results = dict()
+
+    #         def _store(key_name, array, weights=None, splits=False, rank=False):
+    #             """A small helper to store the scores/times to the cv_results_"""
+    #             # When iterated first by splits, then by parameters
+    #             array = np.array(array, dtype=np.float64).reshape(n_candidates,
+    #                                                               n_splits)
+    #             if splits:
+    #                 for split_i in range(n_splits):
+    #                     results["split%d_%s"
+    #                             % (split_i, key_name)] = array[:, split_i]
+
+    #             array_means = np.average(array, axis=1, weights=weights)
+    #             results['mean_%s' % key_name] = array_means
+    #             # Weighted std is not directly available in numpy
+    #             array_stds = np.sqrt(np.average((array -
+    #                                              array_means[:, np.newaxis]) ** 2,
+    #                                             axis=1, weights=weights))
+    #             results['std_%s' % key_name] = array_stds
+
+    #             if rank:
+    #                 results["rank_%s" % key_name] = np.asarray(
+    #                     rankdata(-array_means, method='min'), dtype=np.int32)
+
+    #         # Compute the (weighted) mean and std for test scores alone
+    #         # NOTE test_sample counts (weights) remain the same for all candidates
+    #         test_sample_counts = np.array(test_sample_counts[:n_splits],
+    #                                       dtype=np.int)
+
+    #         _store('test_score', test_scores, splits=True, rank=True,
+    #                weights=test_sample_counts if self.iid else None)
+    #         if self.return_train_score:
+    #             _store('train_score', train_scores, splits=True)
+    #         _store('fit_time', fit_time)
+    #         _store('score_time', score_time)
+
+    #         best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
+    #         best_parameters = candidate_params[best_index]
+
+    #         # Use one MaskedArray and mask all the places where the param is not
+    #         # applicable for that candidate. Use defaultdict as each candidate may
+    #         # not contain all the params
+    #         param_results = defaultdict(partial(MaskedArray,
+    #                                             np.empty(n_candidates,),
+    #                                             mask=True,
+    #                                             dtype=object))
+    #         for cand_i, params in enumerate(candidate_params):
+    #             for name, value in params.items():
+    #                 # An all masked empty array gets created for the key
+    #                 # `"param_%s" % name` at the first occurrence of `name`.
+    #                 # Setting the value at an index also unmasks that index
+    #                 param_results["param_%s" % name][cand_i] = value
+
+    #         results.update(param_results)
+
+    #         # Store a list of param dicts at the key 'params'
+    #         results['params'] = candidate_params
+
+    #         self.cv_results_ = results
+    #         self.best_index_ = best_index
+    #         self.n_splits_ = n_splits
+
+    #         if self.refit:
+    #             # fit the best estimator using the entire dataset
+    #             # clone first to work around broken estimators
+    #             best_estimator = clone(base_estimator).set_params(
+    #                 **best_parameters)
+    #             if y is not None:
+    #                 best_estimator.fit(X, y, **fit_params)
+    #             else:
+    #                 best_estimator.fit(X, **fit_params)
+    #             self.best_estimator_ = best_estimator
+    #         return self
+
 
 
 
diff --git a/python/spark_sklearn/tests/test_grid_search_1.py b/python/spark_sklearn/tests/test_grid_search_1.py
@@ -32,6 +32,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                   'test_grid_search_precomputed_kernel_error_kernel_function',
                   'test_grid_search_precomputed_kernel',
                   'test_grid_search_failing_classifier_raise',
+                  'test_grid_search_score_method', # added this because the sklearn implementation of fit() fails it
                   'test_grid_search_failing_classifier']) # This one we should investigate
 
 def _create_method(method):
diff --git a/python/spark_sklearn/tests/test_grid_search_2.py b/python/spark_sklearn/tests/test_grid_search_2.py
@@ -38,14 +38,9 @@ def test_example(self):
         svr = svm.SVC()
         clf = grid_search.GridSearchCV(svr, parameters)
         clf.fit(iris.data, iris.target)
-        # clf = grid_search.GridSearchCV(svr)
-        # clf.fit(iris.data, iris.target, parameters)
-
 
         clf2 = GridSearchCV(self.sc, svr, parameters)
         clf2.fit(iris.data, iris.target)
-        # clf2 = GridSearchCV(self.sc, svr)
-        # clf2.fit(iris.data, iris.target, parameters)
 
         b1 = clf.estimator
         b2 = clf2.estimator
@@ -70,13 +65,7 @@ def test_cv_linreg(self):
         X = scipy.sparse.vstack(map(lambda x: self.list2csr([x, x+1.0]), range(0, 100)))
         y = np.array(list(range(0, 100))).reshape((100,1))
         skl_gs = grid_search.fit(X, y)
-        #skl_gs = grid_search.fit(X, y, parameters)
-        #assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
-        #print "CV RESULTS KEYS:",skl_gs.cv_results_.keys()
         assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
-        # TODO
-        # for gs in skl_gs.grid_scores_:
-        #     pass # assert(gs.)
 
     def test_cv_pipeline(self):
         pipeline = SKL_Pipeline([
@@ -101,12 +90,7 @@ def test_cv_pipeline(self):
                 ('too cool', 2.0)]
         df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
         skl_gs = grid_search.fit(df.review.values, df.rating.values)
-        #skl_gs = grid_search.fit(df.review.values, df.rating.values, parameters)
-        #assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
         assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
-        # TODO
-        # for gs in skl_gs.grid_scores_:
-        #     pass # assert(gs.)
 
     @unittest.skip("disable this test until we have numpy <-> dataframe conversion")
     def test_cv_lasso_with_mllib_featurization(self):
@@ -139,11 +123,4 @@ def test_cv_lasso_with_mllib_featurization(self):
 
         grid_search = GridSearchCV(self.sc, pipeline, parameters)
         skl_gs = grid_search.fit(df.review.values, df.rating.values)
-        #grid_search = GridSearchCV(self.sc, pipeline)
-        #skl_gs = grid_search.fit(df.review.values, df.rating.values, parameters)
-
-        #assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
         assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
-        # TODO
-        # for gs in skl_gs.grid_scores_:
-        #     pass