Skip to content

Commit 9d28e8a

Browse files
committed
Synthetic datasets config change, OMP env. fix
1 parent 9698e00 commit 9d28e8a

File tree

2 files changed

+126
-42
lines changed

2 files changed

+126
-42
lines changed

config_example.json

Lines changed: 101 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,27 @@
1010
"algorithm": "distances",
1111
"dataset": [
1212
{
13-
"training": "synth_clsf_1000_15000_2"
13+
"source": "synthetic",
14+
"type": "classification",
15+
"n_classes": 2,
16+
"n_features": 15000,
17+
"training": {
18+
"n_samples": 1000
19+
}
1420
}
1521
]
1622
},
1723
{
1824
"algorithm": "kmeans",
1925
"dataset": [
2026
{
21-
"training": "synth_kmeans_1000000_50_10"
27+
"source": "synthetic",
28+
"type": "kmeans",
29+
"n_clusters": 10,
30+
"n_features": 50,
31+
"training": {
32+
"n_samples": 1000000
33+
}
2234
}
2335
],
2436
"n-clusters": [10]
@@ -27,7 +39,13 @@
2739
"algorithm": "dbscan",
2840
"dataset": [
2941
{
30-
"training": "synth_blobs_100000_50_10"
42+
"source": "synthetic",
43+
"type": "blobs",
44+
"n_clusters": 10,
45+
"n_features": 50,
46+
"training": {
47+
"n_samples": 100000
48+
}
3149
}
3250
],
3351
"min-samples": [5000],
@@ -37,53 +55,98 @@
3755
"algorithm": "linear",
3856
"dataset": [
3957
{
40-
"training": "synth_reg_1000000_50"
58+
"source": "synthetic",
59+
"type": "regression",
60+
"n_features": 50,
61+
"training": {
62+
"n_samples": 1000000
63+
}
4164
}
4265
]
4366
},
4467
{
4568
"algorithm": "ridge",
4669
"dataset": [
4770
{
48-
"training": "synth_reg_1000000_50"
71+
"source": "synthetic",
72+
"type": "regression",
73+
"n_features": 50,
74+
"training": {
75+
"n_samples": 1000000
76+
}
4977
}
5078
]
5179
},
5280
{
5381
"algorithm": "df_clsf",
5482
"dataset": [
5583
{
56-
"training": "synth_clsf_10000_100_2"
84+
"source": "synthetic",
85+
"type": "classification",
86+
"n_classes": 2,
87+
"n_features": 100,
88+
"training": {
89+
"n_samples": 10000
90+
}
5791
},
5892
{
59-
"training": "synth_clsf_10000_100_5"
93+
"source": "synthetic",
94+
"type": "classification",
95+
"n_classes": 5,
96+
"n_features": 100,
97+
"training": {
98+
"n_samples": 10000
99+
}
60100
}
61101
]
62102
},
63103
{
64104
"algorithm": "df_regr",
65105
"dataset": [
66106
{
67-
"training": "synth_reg_10000_100"
107+
"source": "synthetic",
108+
"type": "regression",
109+
"n_features": 100,
110+
"training": {
111+
"n_samples": 10000
112+
}
68113
}
69114
]
70115
},
71116
{
72117
"algorithm": "log_reg",
73118
"dataset": [
74119
{
75-
"training": "synth_clsf_100000_100_2"
120+
"source": "synthetic",
121+
"type": "classification",
122+
"n_classes": 2,
123+
"n_features": 100,
124+
"training": {
125+
"n_samples": 100000
126+
}
76127
},
77128
{
78-
"training": "synth_clsf_100000_100_5"
129+
"source": "synthetic",
130+
"type": "classification",
131+
"n_classes": 5,
132+
"n_features": 100,
133+
"training": {
134+
"n_samples": 100000
135+
}
79136
}
80137
]
81138
},
82139
{
83140
"algorithm": "pca",
84141
"dataset": [
85142
{
86-
"training": "synth_clsf_10000_100_2"
143+
"source": "synthetic",
144+
"type": "classification",
145+
"n_classes": 2,
146+
"n_features": 100,
147+
"training": {
148+
"n_samples": 10000
149+
}
87150
}
88151
],
89152
"svd-solver": ["daal", "full"]
@@ -92,10 +155,22 @@
92155
"algorithm": "svm",
93156
"dataset": [
94157
{
95-
"training": "synth_clsf_20000_100_2"
158+
"source": "synthetic",
159+
"type": "classification",
160+
"n_classes": 2,
161+
"n_features": 100,
162+
"training": {
163+
"n_samples": 20000
164+
}
96165
},
97166
{
98-
"training": "synth_clsf_20000_100_5"
167+
"source": "synthetic",
168+
"type": "classification",
169+
"n_classes": 5,
170+
"n_features": 100,
171+
"training": {
172+
"n_samples": 20000
173+
}
99174
}
100175
],
101176
"max-cache-size": [4],
@@ -106,7 +181,13 @@
106181
"algorithm": "gbt",
107182
"dataset": [
108183
{
109-
"training": "synth_clsf_10000_100_2"
184+
"source": "synthetic",
185+
"type": "classification",
186+
"n_classes": 2,
187+
"n_features": 100,
188+
"training": {
189+
"n_samples": 10000
190+
}
110191
}
111192
],
112193
"tree-method": ["hist"],
@@ -117,7 +198,12 @@
117198
"algorithm": "gbt",
118199
"dataset": [
119200
{
120-
"training": "synth_reg_10000_100"
201+
"source": "synthetic",
202+
"type": "regression",
203+
"n_features": 100,
204+
"training": {
205+
"n_samples": 10000
206+
}
121207
}
122208
],
123209
"tree-method": ["hist"],

runner.py

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def is_ht_enabled():
7474
return False
7575
return False
7676
except FileNotFoundError:
77-
print('Impossible to check hyperthreading via lscpu')
77+
verbose_print('Impossible to check hyperthreading via lscpu')
7878
return False
7979

8080

@@ -186,7 +186,7 @@ def is_ht_enabled():
186186
verbose_print(f'{algorithm} algorithm: {len(libs) * len(cases)} case(s),'
187187
f' {len(params_set["dataset"])} dataset(s)\n')
188188
for dataset in params_set['dataset']:
189-
if isinstance(dataset['training'], dict):
189+
if dataset['source'] in ['csv', 'npy']:
190190
paths = f'--file-X-train {dataset["training"]["x"]}'
191191
if 'y' in dataset['training'].keys():
192192
paths += f' --file-y-train {dataset["training"]["y"]}'
@@ -198,7 +198,7 @@ def is_ht_enabled():
198198
dataset_name = dataset['name']
199199
else:
200200
dataset_name = 'unknown'
201-
elif dataset['training'].startswith('synth'):
201+
elif dataset['source'] == 'synthetic':
202202
class GenerationArgs:
203203
pass
204204
gen_args = GenerationArgs()
@@ -209,67 +209,65 @@ class GenerationArgs:
209209
else:
210210
gen_args.seed = 777
211211

212-
dataset_params = dataset['training'].split('_')
213-
gen_args.task = dataset_params[1]
214-
gen_args.samples = int(dataset_params[2])
215-
gen_args.features = int(dataset_params[3])
216-
if gen_args.task in ['clsf', 'kmeans', 'blobs']:
217-
cls_num_for_file = '-' + dataset_params[4]
218-
gen_args.classes = int(dataset_params[4])
219-
gen_args.clusters = gen_args.classes
212+
gen_args.type = dataset['type']
213+
gen_args.samples = dataset['training']['n_samples']
214+
gen_args.features = dataset['n_features']
215+
if 'n_classes' in dataset.keys():
216+
gen_args.classes = dataset['n_classes']
217+
cls_num_for_file = f'-{dataset["n_classes"]}'
218+
elif 'n_clusters' in dataset.keys():
219+
gen_args.clusters = dataset['n_clusters']
220+
cls_num_for_file = f'-{dataset["n_clusters"]}'
220221
else:
221222
cls_num_for_file = ''
222223

223-
file_prefix = f'data/synth-{gen_args.task}{cls_num_for_file}-'
224+
file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-'
224225
file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy'
225226

226-
if gen_args.task == 'kmeans':
227+
if gen_args.type == 'kmeans':
227228
gen_args.node_id = 0
228229
gen_args.filei = f'{file_prefix}init{file_postfix}'
229230
paths += f'--filei {gen_args.filei}'
230231
gen_args.filet = f'{file_prefix}threshold{file_postfix}'
231232

232233
gen_args.filex = f'{file_prefix}X-train{file_postfix}'
233234
paths += f' --file-X-train {gen_args.filex}'
234-
if gen_args.task not in ['kmeans', 'blobs']:
235+
if gen_args.type not in ['kmeans', 'blobs']:
235236
gen_args.filey = f'{file_prefix}y-train{file_postfix}'
236237
paths += f' --file-y-train {gen_args.filey}'
237238

238239
if 'testing' in dataset.keys():
239-
dataset_params = dataset['testing'].split('_')
240-
gen_args.test_samples = int(dataset_params[2])
240+
gen_args.test_samples = dataset['testing']['n_samples']
241241
gen_args.filextest = f'{file_prefix}X-test{file_postfix}'
242242
paths += f' --file-X-test {gen_args.filextest}'
243-
if gen_args.task not in ['kmeans', 'blobs']:
243+
if gen_args.type not in ['kmeans', 'blobs']:
244244
gen_args.fileytest = f'{file_prefix}y-test{file_postfix}'
245245
paths += f' --file-y-test {gen_args.fileytest}'
246246
else:
247247
gen_args.test_samples = 0
248248
gen_args.filextest = gen_args.filex
249-
if gen_args.task not in ['kmeans', 'blobs']:
249+
if gen_args.type not in ['kmeans', 'blobs']:
250250
gen_args.fileytest = gen_args.filey
251251

252252
if not args.dummy_run and not os.path.isfile(gen_args.filex):
253-
if gen_args.task == 'reg':
253+
if gen_args.type == 'regression':
254254
gen_regression(gen_args)
255-
elif gen_args.task == 'clsf':
255+
elif gen_args.type == 'classification':
256256
gen_classification(gen_args)
257-
elif gen_args.task == 'kmeans':
257+
elif gen_args.type == 'kmeans':
258258
gen_kmeans(gen_args)
259-
elif gen_args.task == 'blobs':
259+
elif gen_args.type == 'blobs':
260260
gen_blobs(gen_args)
261-
dataset_name = f'synthetic_{gen_args.task}'
261+
dataset_name = f'synthetic_{gen_args.type}'
262262
else:
263263
raise ValueError(
264-
'Unknown dataset. Only synthetics datasets '
264+
'Unknown dataset source. Only synthetics datasets '
265265
'and csv/npy files are supported now')
266266
for lib in libs:
267+
env = os.environ.copy()
267268
if lib == 'xgboost':
268269
env['OMP_NUM_THREADS'] = omp_num_threads
269270
env['OMP_PLACES'] = omp_places
270-
else:
271-
env['OMP_NUM_THREADS'] = ''
272-
env['OMP_PLACES'] = ''
273271

274272
for i, case in enumerate(cases):
275273
command = f'python {lib}/{algorithm}.py --batch {batch} ' \

0 commit comments

Comments (0)