@@ -975,17 +975,20 @@ def load_diabetes(raw: bool = False, test_set: float = 0.3) -> DATASET_TYPE:
     return (x_train, y_train), (x_test, y_test), min_, max_


-def load_nursery(raw: bool = False, test_set: float = 0.2, transform_social: bool = False) -> DATASET_TYPE:
+def load_nursery(
+    raw: bool = False, scaled: bool = True, test_set: float = 0.2, transform_social: bool = False
+) -> DATASET_TYPE:
     """
     Loads the UCI Nursery dataset from `config.ART_DATA_PATH` or downloads it if necessary.

     :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
-                encoded and data is scaled using sklearn's StandardScaler.
+                encoded and data is scaled using sklearn's StandardScaler according to the value of `scaled`.
+    :param scaled: `True` if data should be scaled.
     :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
     :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                              inference. This is done by assigning the original value 'problematic' the new value 1, and
                              the other original values are assigned the new value 0.
-    :return: Entire dataset and labels.
+    :return: Entire dataset and labels as numpy arrays.
     """
     import pandas as pd
     import sklearn.preprocessing
@@ -1050,16 +1053,20 @@ def modify_social(value):
     data = data.drop(features_to_remove, axis=1)

     # normalize data
-    label = data.loc[:, "label"]
-    features = data.drop(["label"], axis=1)
-    scaler = sklearn.preprocessing.StandardScaler()
-    scaler.fit(features)
-    scaled_features = pd.DataFrame(scaler.transform(features), columns=features.columns)
-
-    data = pd.concat([label, scaled_features], axis=1, join="inner")
+    if scaled:
+        label = data.loc[:, "label"]
+        features = data.drop(["label"], axis=1)
+        scaler = sklearn.preprocessing.StandardScaler()
+        scaler.fit(features)
+        scaled_features = pd.DataFrame(scaler.transform(features), columns=features.columns)
+        data = pd.concat([label, scaled_features], axis=1, join="inner")

     features = data.drop(["label"], axis=1)
-    min_, max_ = np.amin(features.to_numpy()), np.amax(features.to_numpy())
+    if raw:
+        numeric_features = features.drop(categorical_features, axis=1).to_numpy().astype(np.int32)
+        min_, max_ = np.amin(numeric_features), np.amax(numeric_features)
+    else:
+        min_, max_ = np.amin(features.to_numpy().astype(np.float64)), np.amax(features.to_numpy().astype(np.float64))

     # Split training and test sets
     stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
@@ -1071,6 +1078,10 @@ def modify_social(value):
     x_test = test.drop(["label"], axis=1).to_numpy()
     y_test = test.loc[:, "label"].to_numpy()

+    if not raw and not scaled:
+        x_train = x_train.astype(np.float64)
+        x_test = x_test.astype(np.float64)
+
     return (x_train, y_train), (x_test, y_test), min_, max_

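A minimal usage sketch of the updated loader, assuming it is exposed as art.utils.load_nursery as in ART's public utilities; the import path and the flag combinations shown below are illustrative assumptions, not part of this commit:

from art.utils import load_nursery

# Default behaviour: categorical features are one-hot encoded and all features
# are scaled with sklearn's StandardScaler.
(x_train, y_train), (x_test, y_test), min_, max_ = load_nursery(test_set=0.2)

# New in this change: keep the one-hot encoding but skip scaling. Features are
# returned as float64 and min_/max_ reflect the unscaled values.
(x_train, y_train), (x_test, y_test), min_, max_ = load_nursery(scaled=False)

# Raw mode: no encoding or scaling; min_/max_ are computed from the numeric
# (non-categorical) columns only.
(x_train_raw, y_train_raw), (x_test_raw, y_test_raw), min_raw, max_raw = load_nursery(raw=True)

Keeping scaled separate from raw allows the one-hot encoded but unscaled case, which is the combination handled by the new float64 cast at the end of the function.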