 from sklearn.utils.fixes import MaskedArray
 from sklearn.utils.validation import _num_samples, indexable

+from sklearn.externals.joblib import Parallel, delayed
+
+
 class GridSearchCV(BaseSearchCV):
     """Exhaustive search over specified parameter values for an estimator, using Spark to
     distribute the computations.
@@ -169,13 +172,10 @@ def __init__(self, sc, estimator, param_grid, scoring=None, fit_params=None,
         super(GridSearchCV, self).__init__(
             estimator, scoring, fit_params, n_jobs, iid,
             refit, cv, verbose, pre_dispatch, error_score, return_train_score)
-        # super(GridSearchCV, self).__init__(
-        #     estimator, scoring, fit_params, n_jobs, iid,
-        #     refit, cv, verbose, pre_dispatch, error_score)
         self.sc = sc
         self.param_grid = param_grid
-        # self.grid_scores_ = None
-        self.cv_results_ = None # new
+
+        self.cv_results_ = None
         _check_param_grid(param_grid)

     def fit_old(self, X, y=None):
@@ -193,14 +193,9 @@ def fit_old(self, X, y=None):
             None for unsupervised learning.

         """
-        # print "Exiting"
-        # sys.exit(0)
         return self._fit(X, y, ParameterGrid(self.param_grid))


-
-
-    #ef _fit(self, X, y, parameter_iterable, groups=None):
     def fit(self, X, y=None, groups=None, **fit_params):

         if self.fit_params is not None:
@@ -223,8 +218,6 @@ def fit(self, X, y=None, groups=None, **fit_params):
         X, y, groups = indexable(X, y, groups)
         n_splits = cv.get_n_splits(X, y, groups)
         # Regenerate parameter iterable for each fit
-        #candidate_params = list(self._get_param_iterator())
-        #candidate_params = parameter_iterable # change later
         candidate_params = ParameterGrid(self.param_grid)
         n_candidates = len(candidate_params)
         if self.verbose > 0:
@@ -235,8 +228,7 @@ def fit(self, X, y=None, groups=None, **fit_params):
         base_estimator = clone(self.estimator)

         param_grid = [(parameters, train, test) for parameters, (train, test) in product(candidate_params, cv.split(X, y, groups))]
-        #print "PARAM GRID:",param_grid,"\n"
-        #sys.exit(0)
+
         # Because the original python code expects a certain order for the elements, we need to
         # respect it.
         indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
@@ -246,7 +238,6 @@ def fit(self, X, y=None, groups=None, **fit_params):

         scorer = self.scorer_
         verbose = self.verbose
-        #fit_params = self.fit_params # DEPRECIATED: remove later
         error_score = self.error_score
         return_train_score = self.return_train_score
         fas = _fit_and_score
@@ -263,47 +254,14 @@ def fun(tup):
                       return_parameters=False, error_score=error_score)
             return (index, res)
         indexed_out0 = dict(par_param_grid.map(fun).collect())
-        #print "Indexed out:",indexed_out0,"\n"
         out = [indexed_out0[idx] for idx in range(len(param_grid))]
         if return_train_score:
             (train_scores, test_scores, test_sample_counts, fit_time,
              score_time) = zip(*out)
         else:
             (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)
-        #print "TRAIN SCORES:",train_scores
-        #print "SCORE TIME:",score_time
-
-        # print "OUT:",out,"\n"
-        # print "OUT[0]:",out[0]
-        # print "OUT[0].keys:",out[0].keys()
-        # sys.exit(0)
         X_bc.unpersist()
         y_bc.unpersist()
-        #print "GOT HERE?!?!?!? - shouldn't happen"
-
-
-
-        # pre_dispatch = self.pre_dispatch
-
-        # out = Parallel(
-        #     n_jobs=self.n_jobs, verbose=self.verbose,
-        #     pre_dispatch=pre_dispatch
-        # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
-        #                           train, test, self.verbose, parameters,
-        #                           fit_params=fit_params,
-        #                           return_train_score=self.return_train_score,
-        #                           return_n_test_samples=True,
-        #                           return_times=True, return_parameters=False,
-        #                           error_score=self.error_score)
-        #   for parameters, (train, test) in product(candidate_params,
-        #                                            cv.split(X, y, groups)))
-
-        # # if one choose to see train score, "out" will contain train score info
-        # if self.return_train_score:
-        #     (train_scores, test_scores, test_sample_counts, fit_time,
-        #      score_time) = zip(*out)
-        # else:
-        #     (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)

         results = dict()

@@ -379,6 +337,150 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             self.best_estimator_ = best_estimator
         return self

+    # def _get_param_iterator(self):
+    #     """Return ParameterGrid instance for the given param_grid"""
+    #     return ParameterGrid(self.param_grid)
+
+
+    # def fit(self, X, y=None, groups=None, **fit_params):
+    #     """Run fit with all sets of parameters.
+    #     Parameters
+    #     ----------
+    #     X : array-like, shape = [n_samples, n_features]
+    #         Training vector, where n_samples is the number of samples and
+    #         n_features is the number of features.
+    #     y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+    #         Target relative to X for classification or regression;
+    #         None for unsupervised learning.
+    #     groups : array-like, with shape (n_samples,), optional
+    #         Group labels for the samples used while splitting the dataset into
+    #         train/test set.
+    #     **fit_params : dict of string -> object
+    #         Parameters passed to the ``fit`` method of the estimator
+    #     """
+    #     if self.fit_params is not None:
+    #         warnings.warn('"fit_params" as a constructor argument was '
+    #                       'deprecated in version 0.19 and will be removed '
+    #                       'in version 0.21. Pass fit parameters to the '
+    #                       '"fit" method instead.', DeprecationWarning)
+    #         if fit_params:
+    #             warnings.warn('Ignoring fit_params passed as a constructor '
+    #                           'argument in favor of keyword arguments to '
+    #                           'the "fit" method.', RuntimeWarning)
+    #         else:
+    #             fit_params = self.fit_params
+    #     estimator = self.estimator
+    #     cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
+    #     self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+
+    #     X, y, groups = indexable(X, y, groups)
+    #     n_splits = cv.get_n_splits(X, y, groups)
+    #     # Regenerate parameter iterable for each fit
+    #     candidate_params = list(self._get_param_iterator())
+    #     #candidate_params = ParameterGrid(self.param_grid)
+    #     n_candidates = len(candidate_params)
+    #     if self.verbose > 0:
+    #         print("Fitting {0} folds for each of {1} candidates, totalling"
+    #               " {2} fits".format(n_splits, n_candidates,
+    #                                  n_candidates * n_splits))
+
+    #     base_estimator = clone(self.estimator)
+    #     pre_dispatch = self.pre_dispatch
+
+    #     out = Parallel(
+    #         n_jobs=self.n_jobs, verbose=self.verbose,
+    #         pre_dispatch=pre_dispatch
+    #     )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
+    #                               train, test, self.verbose, parameters,
+    #                               fit_params=fit_params,
+    #                               return_train_score=self.return_train_score,
+    #                               return_n_test_samples=True,
+    #                               return_times=True, return_parameters=False,
+    #                               error_score=self.error_score)
+    #       for parameters, (train, test) in product(candidate_params,
+    #                                                cv.split(X, y, groups)))
+
+    #     # if one choose to see train score, "out" will contain train score info
+    #     if self.return_train_score:
+    #         (train_scores, test_scores, test_sample_counts, fit_time,
+    #          score_time) = zip(*out)
+    #     else:
+    #         (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)
+
+    #     results = dict()
+
+    #     def _store(key_name, array, weights=None, splits=False, rank=False):
+    #         """A small helper to store the scores/times to the cv_results_"""
+    #         # When iterated first by splits, then by parameters
+    #         array = np.array(array, dtype=np.float64).reshape(n_candidates,
+    #                                                           n_splits)
+    #         if splits:
+    #             for split_i in range(n_splits):
+    #                 results["split%d_%s"
+    #                         % (split_i, key_name)] = array[:, split_i]
+
+    #         array_means = np.average(array, axis=1, weights=weights)
+    #         results['mean_%s' % key_name] = array_means
+    #         # Weighted std is not directly available in numpy
+    #         array_stds = np.sqrt(np.average((array -
+    #                                          array_means[:, np.newaxis]) ** 2,
+    #                                         axis=1, weights=weights))
+    #         results['std_%s' % key_name] = array_stds
+
+    #         if rank:
+    #             results["rank_%s" % key_name] = np.asarray(
+    #                 rankdata(-array_means, method='min'), dtype=np.int32)
+
+    #     # Computed the (weighted) mean and std for test scores alone
+    #     # NOTE test_sample counts (weights) remain the same for all candidates
+    #     test_sample_counts = np.array(test_sample_counts[:n_splits],
+    #                                   dtype=np.int)
+
+    #     _store('test_score', test_scores, splits=True, rank=True,
+    #            weights=test_sample_counts if self.iid else None)
+    #     if self.return_train_score:
+    #         _store('train_score', train_scores, splits=True)
+    #     _store('fit_time', fit_time)
+    #     _store('score_time', score_time)
+
+    #     best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
+    #     best_parameters = candidate_params[best_index]
+
+    #     # Use one MaskedArray and mask all the places where the param is not
+    #     # applicable for that candidate. Use defaultdict as each candidate may
+    #     # not contain all the params
+    #     param_results = defaultdict(partial(MaskedArray,
+    #                                         np.empty(n_candidates,),
+    #                                         mask=True,
+    #                                         dtype=object))
+    #     for cand_i, params in enumerate(candidate_params):
+    #         for name, value in params.items():
+    #             # An all masked empty array gets created for the key
+    #             # `"param_%s" % name` at the first occurence of `name`.
+    #             # Setting the value at an index also unmasks that index
+    #             param_results["param_%s" % name][cand_i] = value
+
+    #     results.update(param_results)
+
+    #     # Store a list of param dicts at the key 'params'
+    #     results['params'] = candidate_params
+
+    #     self.cv_results_ = results
+    #     self.best_index_ = best_index
+    #     self.n_splits_ = n_splits
+
+    #     if self.refit:
+    #         # fit the best estimator using the entire dataset
+    #         # clone first to work around broken estimators
+    #         best_estimator = clone(base_estimator).set_params(
+    #             **best_parameters)
+    #         if y is not None:
+    #             best_estimator.fit(X, y, **fit_params)
+    #         else:
+    #             best_estimator.fit(X, **fit_params)
+    #         self.best_estimator_ = best_estimator
+    #     return self
+
 
 
 
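For readers skimming this diff, here is a minimal usage sketch of the Spark-distributed GridSearchCV it modifies. This is not part of the change: the spark_sklearn.grid_search import path and the toy dataset are assumptions made for illustration, while passing a live SparkContext as the first constructor argument follows the __init__ signature shown above.

from pyspark import SparkContext
from sklearn.datasets import load_digits
from sklearn.svm import SVC

# Assumed import path for the class patched in this diff.
from spark_sklearn.grid_search import GridSearchCV

sc = SparkContext(appName="grid-search-demo")
digits = load_digits()

param_grid = {"C": [0.1, 1.0, 10.0], "gamma": [0.001, 0.01]}

# The SparkContext comes first; each (parameters, train, test) combination is
# evaluated on the cluster by _fit_and_score inside fit(), as shown above.
search = GridSearchCV(sc, SVC(), param_grid)
search.fit(digits.data, digits.target)

print(search.best_params_)
print(search.cv_results_["mean_test_score"])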
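The comment in fit() about respecting the original element order is implemented by tagging every task with its index before parallelizing, then reassembling the collected results by that index. Below is a standalone sketch of the same pattern, assuming only pyspark; the names are illustrative and not taken from the diff.

from pyspark import SparkContext

sc = SparkContext(appName="ordered-map-demo")

# Stand-ins for the (parameters, train, test) tuples built in fit().
tasks = ["a", "b", "c", "d"]
indexed_tasks = list(zip(range(len(tasks)), tasks))
rdd = sc.parallelize(indexed_tasks, len(indexed_tasks))

def evaluate(tup):
    index, task = tup
    return (index, task.upper())  # stand-in for the _fit_and_score call

# Results come back keyed by index and are read out in the original order,
# which is what the downstream zip(*out) unpacking relies on.
results_by_index = dict(rdd.map(evaluate).collect())
out = [results_by_index[i] for i in range(len(tasks))]
print(out)  # ['A', 'B', 'C', 'D']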