 import sklearn.tree
 import sklearn.cluster
 
-if LooseVersion(sklearn.__version__) < "0.20":
-    from sklearn.preprocessing import Imputer
-else:
-    from sklearn.impute import SimpleImputer as Imputer
 
 import openml
 from openml.extensions.sklearn import SklearnExtension
 from openml.exceptions import PyOpenMLError
 from openml.flows import OpenMLFlow
 from openml.flows.functions import assert_flows_equal
 from openml.runs.trace import OpenMLRunTrace
-from openml.testing import TestBase
+from openml.testing import TestBase, SimpleImputer
+
 
 this_directory = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(this_directory)
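
Context for the import change above: the version-dependent Imputer shim removed from this module is assumed to now live in openml.testing, which re-exports SimpleImputer for the whole test suite. A minimal sketch of such a shim (the actual contents of openml.testing are not shown in this diff):

    import sklearn
    from distutils.version import LooseVersion

    if LooseVersion(sklearn.__version__) < "0.20":
        # older releases only ship the deprecated Imputer; alias it under the new name
        from sklearn.preprocessing import Imputer as SimpleImputer  # noqa: F401
    else:
        from sklearn.impute import SimpleImputer  # noqa: F401
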
@@ -285,11 +282,14 @@ def test_serialize_pipeline(self):
         # Comparing the pipeline
         # The parameters only have the name of base objects(not the whole flow)
         # as value
-        # memory parameter has been added in 0.19
+        # memory parameter has been added in 0.19, verbose in 0.21
         if LooseVersion(sklearn.__version__) < "0.19":
             self.assertEqual(len(serialization.parameters), 1)
-        else:
+        elif LooseVersion(sklearn.__version__) < "0.21":
             self.assertEqual(len(serialization.parameters), 2)
+        else:
+            self.assertEqual(len(serialization.parameters), 3)
+
         # Hard to compare two representations of a dict due to possibly
         # different sorting. Making a json makes it easier
         self.assertEqual(
@@ -374,8 +374,10 @@ def test_serialize_pipeline_clustering(self):
         # memory parameter has been added in 0.19
         if LooseVersion(sklearn.__version__) < "0.19":
             self.assertEqual(len(serialization.parameters), 1)
-        else:
+        elif LooseVersion(sklearn.__version__) < "0.21":
             self.assertEqual(len(serialization.parameters), 2)
+        else:
+            self.assertEqual(len(serialization.parameters), 3)
         # Hard to compare two representations of a dict due to possibly
         # different sorting. Making a json makes it easier
         self.assertEqual(
@@ -624,7 +626,7 @@ def test_serialize_feature_union_switched_names(self):
                          .format(module_name_encoder))
 
     def test_serialize_complex_flow(self):
-        ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
+        ohe = sklearn.preprocessing.OneHotEncoder()
         scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
         boosting = sklearn.ensemble.AdaBoostClassifier(
             base_estimator=sklearn.tree.DecisionTreeClassifier())
@@ -747,25 +749,26 @@ def test_serialize_simple_parameter_grid(self):
         # Examples from the scikit-learn documentation
         models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
         grids = \
-            [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
-              {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
-               'kernel': ['rbf']}],
-             {"max_depth": [3, None],
-              "max_features": [1, 3, 10],
-              "min_samples_split": [1, 3, 10],
-              "min_samples_leaf": [1, 3, 10],
-              "bootstrap": [True, False],
-              "criterion": ["gini", "entropy"]}]
+            [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]),
+              OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]),
+                           ('kernel', ['rbf'])])],
+             OrderedDict([("bootstrap", [True, False]),
+                          ("criterion", ["gini", "entropy"]),
+                          ("max_depth", [3, None]),
+                          ("max_features", [1, 3, 10]),
+                          ("min_samples_leaf", [1, 3, 10]),
+                          ("min_samples_split", [1, 3, 10])
+                          ])]
 
         for grid, model in zip(grids, models):
             serialized = self.extension.model_to_flow(grid)
             deserialized = self.extension.flow_to_model(serialized)
 
             self.assertEqual(deserialized, grid)
             self.assertIsNot(deserialized, grid)
-
+            # providing error_score because nan != nan
             hpo = sklearn.model_selection.GridSearchCV(
-                param_grid=grid, estimator=model)
+                param_grid=grid, estimator=model, error_score=-1000)
 
             serialized = self.extension.model_to_flow(hpo)
             deserialized = self.extension.flow_to_model(serialized)
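
On the error_score comment added above: the round-trip equality assertion compares the deserialized GridSearchCV to the original, and a NaN default for error_score would break it because NaN never compares equal to itself; a fixed numeric value such as -1000 avoids that. A quick illustration of the underlying behaviour:

    import math

    nan = float("nan")
    print(nan == nan)       # False: NaN is never equal to itself
    print(math.isnan(nan))  # True: detect NaN with math.isnan instead
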
@@ -943,7 +946,7 @@ def test_illegal_parameter_names(self):
     def test_illegal_parameter_names_pipeline(self):
         # illegal name: steps
         steps = [
-            ('Imputer', Imputer(strategy='median')),
+            ('Imputer', SimpleImputer(strategy='median')),
             ('OneHotEncoder',
              sklearn.preprocessing.OneHotEncoder(sparse=False,
                                                  handle_unknown='ignore')),
@@ -956,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self):
         # illegal name: transformer_list
         transformer_list = [
             ('transformer_list',
-             Imputer(strategy='median')),
+             SimpleImputer(strategy='median')),
             ('OneHotEncoder',
              sklearn.preprocessing.OneHotEncoder(sparse=False,
                                                  handle_unknown='ignore'))
@@ -1015,18 +1018,25 @@ def test_paralizable_check(self):
         self.extension._prevent_optimize_n_jobs(model)
 
     def test__get_fn_arguments_with_defaults(self):
-        if LooseVersion(sklearn.__version__) < "0.19":
+        sklearn_version = LooseVersion(sklearn.__version__)
+        if sklearn_version < "0.19":
             fns = [
                 (sklearn.ensemble.RandomForestRegressor.__init__, 15),
                 (sklearn.tree.DecisionTreeClassifier.__init__, 12),
                 (sklearn.pipeline.Pipeline.__init__, 0)
             ]
-        else:
+        elif sklearn_version < "0.21":
             fns = [
                 (sklearn.ensemble.RandomForestRegressor.__init__, 16),
                 (sklearn.tree.DecisionTreeClassifier.__init__, 13),
                 (sklearn.pipeline.Pipeline.__init__, 1)
             ]
+        else:
+            fns = [
+                (sklearn.ensemble.RandomForestRegressor.__init__, 16),
+                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
+                (sklearn.pipeline.Pipeline.__init__, 2)
+            ]
 
         for fn, num_params_with_defaults in fns:
             defaults, defaultless = (
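
The Pipeline counts above track constructor defaults added across scikit-learn releases: memory arrived in 0.19 and, per the new branch, verbose is assumed to arrive in 0.21. A rough sketch for checking the count on whatever version is installed:

    import inspect
    import sklearn.pipeline

    sig = inspect.signature(sklearn.pipeline.Pipeline.__init__)
    with_defaults = [name for name, param in sig.parameters.items()
                     if param.default is not inspect.Parameter.empty]
    print(with_defaults)  # e.g. ['memory', 'verbose'] on scikit-learn >= 0.21
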
@@ -1047,7 +1057,7 @@ def test_deserialize_with_defaults(self):
         # used the 'initialize_with_defaults' flag of the deserialization
         # method to return a flow that contains default hyperparameter
         # settings.
-        steps = [('Imputer', Imputer()),
+        steps = [('Imputer', SimpleImputer()),
                  ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
                  ('Estimator', sklearn.tree.DecisionTreeClassifier())]
         pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
@@ -1071,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self):
         # used the 'initialize_with_defaults' flag of the deserialization
         # method to return a flow that contains default hyperparameter
         # settings.
-        steps = [('Imputer', Imputer()),
+        steps = [('Imputer', SimpleImputer()),
                  ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
                  ('Estimator', sklearn.ensemble.AdaBoostClassifier(
                      sklearn.tree.DecisionTreeClassifier()))]
@@ -1097,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self):
         # method to return a flow that contains default hyperparameter
         # settings.
         steps = [
-            ('Imputer', Imputer()),
+            ('Imputer', SimpleImputer()),
             ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
             (
                 'Estimator',
@@ -1237,7 +1247,7 @@ def test_run_model_on_task(self):
         class MyPipe(sklearn.pipeline.Pipeline):
             pass
         task = openml.tasks.get_task(1)
-        pipe = MyPipe([('imp', Imputer()),
+        pipe = MyPipe([('imp', SimpleImputer()),
                        ('dummy', sklearn.dummy.DummyClassifier())])
         openml.runs.run_model_on_task(pipe, task)
 
@@ -1309,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self):
         y_test = y[test_indices]
 
         pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', sklearn.preprocessing.Imputer()),
+            ('imp', SimpleImputer()),
             ('clf', sklearn.tree.DecisionTreeClassifier()),
         ])
         # TODO add some mocking here to actually test the innards of this function, too!
@@ -1435,11 +1445,11 @@ def predict_proba(*args, **kwargs):
         y_train = y[train_indices]
         X_test = X[test_indices]
         clf1 = sklearn.pipeline.Pipeline(steps=[
-            ('imputer', sklearn.preprocessing.Imputer()),
+            ('imputer', SimpleImputer()),
             ('estimator', sklearn.naive_bayes.GaussianNB())
         ])
         clf2 = sklearn.pipeline.Pipeline(steps=[
-            ('imputer', sklearn.preprocessing.Imputer()),
+            ('imputer', SimpleImputer()),
             ('estimator', HardNaiveBayes())
         ])
 
@@ -1492,7 +1502,7 @@ def test_run_model_on_fold_regression(self):
         y_test = y[test_indices]
 
         pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', sklearn.preprocessing.Imputer()),
+            ('imp', SimpleImputer()),
             ('clf', sklearn.tree.DecisionTreeRegressor()),
         ])
         # TODO add some mocking here to actually test the innards of this function, too!
@@ -1537,7 +1547,7 @@ def test_run_model_on_fold_clustering(self):
         X = task.get_X(dataset_format='array')
 
         pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', sklearn.preprocessing.Imputer()),
+            ('imp', SimpleImputer()),
             ('clf', sklearn.cluster.KMeans()),
         ])
         # TODO add some mocking here to actually test the innards of this function, too!
@@ -1626,7 +1636,7 @@ def test_trim_flow_name(self):
         long = """sklearn.pipeline.Pipeline(
             columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
                 numeric=sklearn.pipeline.Pipeline(
-                    imputer=sklearn.preprocessing.imputation.Imputer,
+                    SimpleImputer=sklearn.preprocessing.imputation.Imputer,
                     standardscaler=sklearn.preprocessing.data.StandardScaler),
                 nominal=sklearn.pipeline.Pipeline(
                     simpleimputer=sklearn.impute.SimpleImputer,
@@ -1650,7 +1660,7 @@ def test_trim_flow_name(self):
         self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
 
         long = """sklearn.pipeline.Pipeline(
-            Imputer=sklearn.preprocessing.imputation.Imputer,
+            SimpleImputer=sklearn.preprocessing.imputation.Imputer,
             VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,  # noqa: E501
             Estimator=sklearn.model_selection._search.RandomizedSearchCV(
                 estimator=sklearn.tree.tree.DecisionTreeClassifier))"""
@@ -1660,7 +1670,7 @@ def test_trim_flow_name(self):
 
         long = """sklearn.model_selection._search.RandomizedSearchCV(
             estimator=sklearn.pipeline.Pipeline(
-                Imputer=sklearn.preprocessing.imputation.Imputer,
+                SimpleImputer=sklearn.preprocessing.imputation.Imputer,
                 classifier=sklearn.ensemble.forest.RandomForestClassifier))"""
         short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))"
         long_stripped, _ = re.subn(r'\s', '', long)