Skip to content

Commit 362b61e

Browse files
authored
Merge pull request #25 from Alexsandruss/train_test_split
Train test split benchmarks
2 parents c35b007 + f0e6032 commit 362b61e

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed

config_example.json

100755 → 100644
File mode changed.

cuml/train_test_split.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (C) 2020 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
import argparse
6+
from bench import measure_function_time, parse_args, load_data, print_output
7+
from cuml import train_test_split
8+
9+
# Benchmark script: measures the time of cuml's train_test_split on the data
# returned by load_data and reports it through print_output.
parser = argparse.ArgumentParser(
    description='cuml train_test_split benchmark')
parser.add_argument('--train-size', type=float, default=0.75,
                    help='Size of training subset')
parser.add_argument('--test-size', type=float, default=0.25,
                    help='Size of testing subset')
parser.add_argument('--do-not-shuffle', default=False, action='store_true',
                    help='Do not perform data shuffle before splitting')
# NOTE(review): params.seed is read below but is not declared by this parser;
# presumably parse_args adds the common benchmark options -- confirm in bench.
params = parse_args(parser)

# Load generated data
X, y, _, _ = load_data(params)

# Keyword arguments for train_test_split, derived from the command line.
tts_params = {
    'train_size': params.train_size,
    'test_size': params.test_size,
    'shuffle': not params.do_not_shuffle,
    'random_state': params.seed
}

# Forward tts_params to the timed call. Previously they were only reported
# via alg_params and never reached train_test_split, so the command-line
# options had no effect on the split being measured.
time, _ = measure_function_time(train_test_split, X=X, y=y, params=params,
                                **tts_params)

columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'time')

print_output(library='cuml', algorithm='train_test_split',
             stages=['training'], columns=columns, params=params,
             functions=['train_test_split'], times=[time], accuracies=[None],
             accuracy_type=None, data=[X], alg_params=tts_params)

sklearn/train_test_split.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (C) 2020 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
import argparse
6+
from bench import measure_function_time, parse_args, load_data, print_output
7+
from sklearn.model_selection import train_test_split
8+
9+
# Benchmark script: measures the time of scikit-learn's train_test_split on
# the data returned by load_data and reports it through print_output.

# Accepted values for --rng; None is kept as a valid choice because it is
# also the default.
_RNG_CHOICES = ('MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH',
                'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10',
                'NONDETERM', None)

parser = argparse.ArgumentParser(
    description='scikit-learn train_test_split benchmark')
parser.add_argument(
    '--train-size', type=float, default=0.75,
    help='Size of training subset')
parser.add_argument(
    '--test-size', type=float, default=0.25,
    help='Size of testing subset')
parser.add_argument(
    '--do-not-shuffle', default=False, action='store_true',
    help='Do not perform data shuffle before splitting')
parser.add_argument(
    '--include-y', default=False, action='store_true',
    help='Include label (Y) in splitting')
parser.add_argument(
    '--rng', default=None, choices=_RNG_CHOICES,
    help='Random numbers generator for shuffling '
         '(only for IDP scikit-learn)')
params = parse_args(parser)

# Load generated data
X, y, _, _ = load_data(params)

# Positional inputs for the split: labels are included only on request.
data_args = (X, y) if params.include_y else (X, )

# Keyword arguments for train_test_split, derived from the command line.
tts_params = dict(
    train_size=params.train_size,
    test_size=params.test_size,
    shuffle=not params.do_not_shuffle,
    random_state=params.seed,
)

# The rng keyword is passed only when explicitly selected, so the call is
# unchanged when --rng is not given.
if params.rng is not None:
    tts_params.update(rng=params.rng)

time, _ = measure_function_time(
    train_test_split, *data_args, params=params, **tts_params)

columns = (
    'batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
    'time',
)

print_output(
    library='sklearn', algorithm='train_test_split',
    stages=['training'], columns=columns, params=params,
    functions=['train_test_split'], times=[time], accuracies=[None],
    accuracy_type=None, data=[X], alg_params=tts_params)

0 commit comments

Comments
 (0)