Skip to content

Commit ff2a91c

Browse files
teresaconc authored and mfeurer committed
Fix floating point issues (Issue #538) (#589)
* Pass train_size and test_size as integers (number of samples) instead of floats (ratio of samples)
* Changed assignment of train samples for general use in every cv case
* Added Unit Tests
* Changed for simpler solution. Remove 'raveling' for multilabel cases
* Fix PEP8 errors
* Addition Fix PEP8 errors
* Addition Fix PEP8 errors
* Delete competition_c_functions.c
1 parent 5f598a6 commit ff2a91c

File tree

2 files changed

+119
-8
lines changed

2 files changed

+119
-8
lines changed

autosklearn/evaluation/train_evaluator.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -540,30 +540,32 @@ def get_splitter(self, D):
540540

541541
return cv
542542

543-
y = D.data['Y_train'].ravel()
543+
y = D.data['Y_train']
544544
shuffle = self.resampling_strategy_args.get('shuffle', True)
545545
train_size = 0.67
546546
if self.resampling_strategy_args:
547547
train_size = self.resampling_strategy_args.get('train_size',
548548
train_size)
549-
test_size = 1 - train_size
549+
test_size = float("%.4f" % (1 - train_size))
550+
550551
if D.info['task'] in CLASSIFICATION_TASKS and \
551552
D.info['task'] != MULTILABEL_CLASSIFICATION:
552553

554+
y = y.ravel()
553555
if self.resampling_strategy in ['holdout',
554556
'holdout-iterative-fit']:
557+
555558
if shuffle:
556559
try:
557560
cv = StratifiedShuffleSplit(n_splits=1,
558-
train_size=train_size,
559561
test_size=test_size,
560562
random_state=1)
561563
test_cv = copy.deepcopy(cv)
562564
next(test_cv.split(y, y))
563565
except ValueError as e:
564566
if 'The least populated class in y has only' in e.args[0]:
565-
cv = ShuffleSplit(n_splits=1, train_size=train_size,
566-
test_size=test_size, random_state=1)
567+
cv = ShuffleSplit(n_splits=1, test_size=test_size,
568+
random_state=1)
567569
else:
568570
raise e
569571
else:
@@ -588,8 +590,8 @@ def get_splitter(self, D):
588590
'holdout-iterative-fit']:
589591
# TODO shuffle not taken into account for this
590592
if shuffle:
591-
cv = ShuffleSplit(n_splits=1, train_size=train_size,
592-
test_size=test_size, random_state=1)
593+
cv = ShuffleSplit(n_splits=1, test_size=test_size,
594+
random_state=1)
593595
else:
594596
tmp_train_size = int(np.floor(train_size * y.shape[0]))
595597
test_fold = np.zeros(y.shape[0])

test/test_evaluation/test_train_evaluator.py

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323
eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv
2424
from autosklearn.util import backend
2525
from autosklearn.util.pipeline import get_configuration_space
26-
from autosklearn.constants import *
26+
from autosklearn.constants import BINARY_CLASSIFICATION, \
27+
MULTILABEL_CLASSIFICATION,\
28+
MULTICLASS_CLASSIFICATION,\
29+
REGRESSION
2730
from autosklearn.metrics import accuracy, r2, f1_macro
2831

2932
this_directory = os.path.dirname(__file__)
@@ -1226,6 +1229,112 @@ def test_get_splitter_cv_object(self, te_mock):
12261229
next(cv.split(D.data['Y_train'], D.data['Y_train']
12271230
, groups=evaluator.resampling_strategy_args['groups']))
12281231

1232+
@unittest.mock.patch.object(TrainEvaluator, "__init__")
1233+
def test_holdout_split_size(self, te_mock):
1234+
te_mock.return_value = None
1235+
D = unittest.mock.Mock(spec=AbstractDataManager)
1236+
D.feat_type = []
1237+
1238+
evaluator = TrainEvaluator()
1239+
evaluator.resampling_strategy = 'holdout'
1240+
1241+
# Exact Ratio
1242+
D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]))
1243+
D.info = dict(task=BINARY_CLASSIFICATION)
1244+
evaluator.resampling_strategy_args = {'shuffle': True,
1245+
'train_size': 0.7}
1246+
cv = evaluator.get_splitter(D)
1247+
1248+
self.assertEqual(cv.get_n_splits(), 1)
1249+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1250+
D.data['Y_train']))
1251+
self.assertEqual(len(train_samples), 7)
1252+
self.assertEqual(len(test_samples), 3)
1253+
1254+
# No Shuffle
1255+
evaluator.resampling_strategy_args = {'shuffle': False,
1256+
'train_size': 0.7}
1257+
cv = evaluator.get_splitter(D)
1258+
1259+
self.assertEqual(cv.get_n_splits(), 1)
1260+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1261+
D.data['Y_train']))
1262+
self.assertEqual(len(train_samples), 7)
1263+
self.assertEqual(len(test_samples), 3)
1264+
1265+
# Rounded Ratio
1266+
D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1]))
1267+
1268+
evaluator.resampling_strategy_args = {'shuffle': True,
1269+
'train_size': 0.7}
1270+
cv = evaluator.get_splitter(D)
1271+
1272+
self.assertEqual(cv.get_n_splits(), 1)
1273+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1274+
D.data['Y_train']))
1275+
self.assertEqual(len(train_samples), 6)
1276+
self.assertEqual(len(test_samples), 3)
1277+
1278+
# Rounded Ratio No Shuffle
1279+
evaluator.resampling_strategy_args = {'shuffle': False,
1280+
'train_size': 0.7}
1281+
cv = evaluator.get_splitter(D)
1282+
1283+
self.assertEqual(cv.get_n_splits(), 1)
1284+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1285+
D.data['Y_train']))
1286+
self.assertEqual(len(train_samples), 6)
1287+
self.assertEqual(len(test_samples), 3)
1288+
1289+
# More data
1290+
evaluator.resampling_strategy_args = {'shuffle': True,
1291+
'train_size': 0.7}
1292+
1293+
D.data = dict(Y_train=np.zeros((900, 1)))
1294+
cv = evaluator.get_splitter(D)
1295+
self.assertEqual(cv.get_n_splits(), 1)
1296+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1297+
D.data['Y_train']))
1298+
self.assertEqual(len(train_samples), 630)
1299+
self.assertEqual(len(test_samples), 270)
1300+
1301+
evaluator.resampling_strategy_args = {'train_size': 0.752}
1302+
D.data = dict(Y_train=np.zeros((900, 1)))
1303+
cv = evaluator.get_splitter(D)
1304+
self.assertEqual(cv.get_n_splits(), 1)
1305+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1306+
D.data['Y_train']))
1307+
self.assertEqual(len(train_samples), 676)
1308+
self.assertEqual(len(test_samples), 224)
1309+
1310+
# Multilabel Exact Ratio
1311+
D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
1312+
[1, 1], [1, 1], [1, 0], [1, 1], [1, 1]]
1313+
))
1314+
D.info = dict(task=MULTILABEL_CLASSIFICATION)
1315+
evaluator.resampling_strategy_args = {'shuffle': True,
1316+
'train_size': 0.7}
1317+
cv = evaluator.get_splitter(D)
1318+
1319+
self.assertEqual(cv.get_n_splits(), 1)
1320+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1321+
D.data['Y_train']))
1322+
self.assertEqual(len(train_samples), 7)
1323+
self.assertEqual(len(test_samples), 3)
1324+
1325+
# Multilabel No Shuffle
1326+
D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
1327+
[1, 1], [1, 1], [1, 0], [1, 1]]))
1328+
evaluator.resampling_strategy_args = {'shuffle': False,
1329+
'train_size': 0.7}
1330+
cv = evaluator.get_splitter(D)
1331+
1332+
self.assertEqual(cv.get_n_splits(), 1)
1333+
train_samples, test_samples = next(cv.split(D.data['Y_train'],
1334+
D.data['Y_train']))
1335+
self.assertEqual(len(train_samples), 6)
1336+
self.assertEqual(len(test_samples), 3)
1337+
12291338

12301339
class FunctionsTest(unittest.TestCase):
12311340
def setUp(self):

0 commit comments

Comments
 (0)