Commit 3a0d7b9
Small extensions & add new dataset (#49)
1 parent: 8c9e528

4 files changed, +87 / -22 lines

configs/cuml_config.json

Lines changed: 21 additions & 0 deletions
@@ -307,6 +307,27 @@
             "C": [1.0],
             "kernel": ["rbf"]
         },
+        {
+            "algorithm": "svm",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "skin_segmentation",
+                    "training":
+                    {
+                        "x": "data/skin_segmentation_x_train.csv",
+                        "y": "data/skin_segmentation_y_train.csv"
+                    },
+                    "testing":
+                    {
+                        "x": "data/skin_segmentation_x_test.csv",
+                        "y": "data/skin_segmentation_y_test.csv"
+                    }
+                }
+            ],
+            "C": [1.0],
+            "kernel": ["rbf"]
+        },
         {
             "algorithm": "dbscan",
             "dataset": [

datasets/load_datasets.py

Lines changed: 3 additions & 1 deletion
@@ -20,7 +20,8 @@
 import logging
 
 from .loader import (a9a, gisette, ijcnn, skin_segmentation,
-                     klaverjas, connect, mnist, sensit, covertype)
+                     klaverjas, connect, mnist, sensit,
+                     covertype, codrnanorm)
 
 dataset_loaders = {
     "a9a": a9a,
@@ -32,6 +33,7 @@
     "mnist": mnist,
     "sensit": sensit,
     "covertype": covertype,
+    "codrnanorm": codrnanorm,
 }
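
For context (illustration only): dataset_loaders maps the dataset name used in configs to its loader function, which is how a missing dataset can be fetched by name. A minimal lookup sketch, assuming the package is importable as datasets:

# Hypothetical dispatch helper (not the repository's code).
from datasets.load_datasets import dataset_loaders

def load_by_name(name, output_directory):
    loader = dataset_loaders.get(name)   # e.g. "codrnanorm" -> codrnanorm loader
    if loader is None:
        return False                     # no loader registered for this name
    return loader(dataset_dir=output_directory)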
datasets/loader.py

Lines changed: 53 additions & 14 deletions
@@ -37,7 +37,7 @@ def a9a(dataset_dir=None):
     a9a X train dataset (39073, 123)
     a9a y train dataset (39073, 1)
     a9a X test dataset (9769, 123)
-    a9a y train dataset (9769, 1)
+    a9a y test dataset (9769, 1)
     """
     dataset_name = 'a9a'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -75,7 +75,7 @@ def ijcnn(dataset_dir=None):
     ijcnn X train dataset (153344, 22)
     ijcnn y train dataset (153344, 1)
     ijcnn X test dataset (38337, 22)
-    ijcnn y train dataset (38337, 1)
+    ijcnn y test dataset (38337, 1)
     """
     dataset_name = 'ijcnn'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -113,7 +113,7 @@ def skin_segmentation(dataset_dir=None):
     skin_segmentation X train dataset (196045, 3)
     skin_segmentation y train dataset (196045, 1)
     skin_segmentation X test dataset (49012, 3)
-    skin_segmentation y train dataset (49012, 1)
+    skin_segmentation y test dataset (49012, 1)
     """
     dataset_name = 'skin_segmentation'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -151,7 +151,7 @@ def klaverjas(dataset_dir=None):
     klaverjas X train dataset (196045, 3)
     klaverjas y train dataset (196045, 1)
     klaverjas X test dataset (49012, 3)
-    klaverjas y train dataset (49012, 1)
+    klaverjas y test dataset (49012, 1)
     """
     dataset_name = 'klaverjas'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -184,7 +184,7 @@ def connect(dataset_dir=None):
     connect X train dataset (196045, 127)
     connect y train dataset (196045, 1)
     connect X test dataset (49012, 127)
-    connect y train dataset (49012, 1)
+    connect y test dataset (49012, 1)
     """
     dataset_name = 'connect'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -223,7 +223,7 @@ def mnist(dataset_dir=None):
     mnist X train dataset (60000, 784)
     mnist y train dataset (60000, 1)
     mnist X test dataset (10000, 784)
-    mnist y train dataset (10000, 1)
+    mnist y test dataset (10000, 1)
     """
     dataset_name = 'mnist'
 
@@ -258,7 +258,7 @@ def sensit(dataset_dir=None):
     sensit X train dataset (196045, 3)
     sensit y train dataset (196045, 1)
     sensit X test dataset (49012, 3)
-    sensit y train dataset (49012, 1)
+    sensit y test dataset (49012, 1)
     """
     dataset_name = 'sensit'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -285,12 +285,16 @@ def sensit(dataset_dir=None):
 
 def covertype(dataset_dir=None):
     """
-
-    covertype X train dataset (196045, 3)
-    covertype y train dataset (196045, 1)
-    covertype X test dataset (49012, 3)
-    covertype y train dataset (49012, 1)
-
+    Abstract: This is the original version of the famous
+    covertype dataset in ARFF format.
+    Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson
+    Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype)
+
+    Classification task. n_classes = 7.
+    covertype X train dataset (390852, 54)
+    covertype y train dataset (390852, 1)
+    covertype X test dataset (97713, 54)
+    covertype y test dataset (97713, 1)
     """
     dataset_name = 'covertype'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -313,6 +317,41 @@ def covertype(dataset_dir=None):
     return True
 
 
+def codrnanorm(dataset_dir=None):
+    """
+    Abstract: Detection of non-coding RNAs on the basis of predicted secondary
+    structure formation free energy change.
+    Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews.
+    Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets)
+
+    Classification task. n_classes = 2.
+    codrnanorm X train dataset (390852, 8)
+    codrnanorm y train dataset (390852, 1)
+    codrnanorm X test dataset (97713, 8)
+    codrnanorm y test dataset (97713, 1)
+    """
+    dataset_name = 'codrnanorm'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    X, y = fetch_openml(name='codrnaNorm', return_X_y=True,
+                        as_frame=False, data_home=dataset_dir)
+    X = pd.DataFrame(X.todense())
+    y = pd.DataFrame(y)
+
+    logging.info(f'{dataset_name} dataset is downloaded')
+    logging.info('reading CSV file...')
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42)
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.csv'
+        data.to_csv(os.path.join(dataset_dir, filename),
+                    header=False, index=False)
+    logging.info(f'dataset {dataset_name} ready.')
+    return True
+
+
 def gisette(dataset_dir=None):
     """
     GISETTE is a handwritten digit recognition problem.
@@ -323,7 +362,7 @@ def gisette(dataset_dir=None):
     gisette X train dataset (6000, 5000)
     gisette y train dataset (6000, 1)
     gisette X test dataset (1000, 5000)
-    gisette y train dataset (1000, 1)
+    gisette y test dataset (1000, 1)
     """
    dataset_name = 'gisette'
    os.makedirs(dataset_dir, exist_ok=True)
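
For context (illustration only, not part of the commit): a minimal sketch of exercising the new codrnanorm loader on its own, assuming scikit-learn, pandas, and network access to OpenML are available, and assuming the module is importable as datasets.loader.

# Usage sketch (hypothetical import path).
import logging
from datasets.loader import codrnanorm

logging.basicConfig(level=logging.INFO)

# Downloads codrnaNorm from OpenML and writes codrnanorm_{x,y}_{train,test}.csv
# into ./data using an 80/20 split with random_state=42, as in the code above.
codrnanorm(dataset_dir='data')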

runner.py

Lines changed: 10 additions & 7 deletions
@@ -80,7 +80,11 @@ def generate_cases(params):
     # make directory for data if it doesn't exist
     os.makedirs('data', exist_ok=True)
 
-    json_result = {'hardware': {}, 'software': {}, 'results': []}
+    json_result = {
+        'hardware': utils.get_hw_parameters(),
+        'software': utils.get_sw_parameters(),
+        'results': []
+    }
     is_successful = True
 
     for config_name in args.configs.split(','):
@@ -106,25 +110,24 @@ def generate_cases(params):
         for dataset in params_set['dataset']:
             if dataset['source'] in ['csv', 'npy']:
                 train_data = dataset["training"]
-                test_data = dataset["testing"]
-
                 file_train_data_x = train_data["x"]
-                file_train_data_y = train_data["y"]
-                file_test_data_x = test_data["x"]
-                file_test_data_y = test_data["y"]
                 paths = f'--file-X-train {file_train_data_x}'
                 if 'y' in dataset['training'].keys():
+                    file_train_data_y = train_data["y"]
                     paths += f' --file-y-train {file_train_data_y}'
                 if 'testing' in dataset.keys():
+                    test_data = dataset["testing"]
+                    file_test_data_x = test_data["x"]
                     paths += f' --file-X-test {file_test_data_x}'
                     if 'y' in dataset['testing'].keys():
+                        file_test_data_y = test_data["y"]
                        paths += f' --file-y-test {file_test_data_y}'
                 if 'name' in dataset.keys():
                     dataset_name = dataset['name']
                 else:
                     dataset_name = 'unknown'
 
-                if not utils.is_exists_files([file_train_data_x, file_train_data_y]):
+                if not utils.is_exists_files([file_train_data_x]):
                     directory_dataset = pathlib.Path(file_train_data_x).parent
                     if not try_load_dataset(dataset_name=dataset_name,
                                             output_directory=directory_dataset):
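
For illustration (not part of the diff): with the reordered logic above, the training "y" file and the whole "testing" block become optional, and only the training "x" file is checked for existence. A hypothetical minimal entry and the arguments it would yield:

# Hypothetical dataset entry with no labels and no testing split.
dataset = {
    "source": "csv",
    "name": "some_unlabeled_dataset",            # illustrative name
    "training": {"x": "data/some_x_train.csv"}   # illustrative path
}
# The runner would build only: '--file-X-train data/some_x_train.csv'
# and call utils.is_exists_files(['data/some_x_train.csv']) before loading.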
