@@ -37,7 +37,7 @@ def a9a(dataset_dir=None):
37
37
a9a X train dataset (39073, 123)
38
38
a9a y train dataset (39073, 1)
39
39
a9a X test dataset (9769, 123)
40
- a9a y train dataset (9769, 1)
40
+ a9a y test dataset (9769, 1)
41
41
"""
42
42
dataset_name = 'a9a'
43
43
os .makedirs (dataset_dir , exist_ok = True )
@@ -75,7 +75,7 @@ def ijcnn(dataset_dir=None):
75
75
ijcnn X train dataset (153344, 22)
76
76
ijcnn y train dataset (153344, 1)
77
77
ijcnn X test dataset (38337, 22)
78
- ijcnn y train dataset (38337, 1)
78
+ ijcnn y test dataset (38337, 1)
79
79
"""
80
80
dataset_name = 'ijcnn'
81
81
os .makedirs (dataset_dir , exist_ok = True )
@@ -113,7 +113,7 @@ def skin_segmentation(dataset_dir=None):
113
113
skin_segmentation X train dataset (196045, 3)
114
114
skin_segmentation y train dataset (196045, 1)
115
115
skin_segmentation X test dataset (49012, 3)
116
- skin_segmentation y train dataset (49012, 1)
116
+ skin_segmentation y test dataset (49012, 1)
117
117
"""
118
118
dataset_name = 'skin_segmentation'
119
119
os .makedirs (dataset_dir , exist_ok = True )
@@ -151,7 +151,7 @@ def klaverjas(dataset_dir=None):
151
151
klaverjas X train dataset (196045, 3)
152
152
klaverjas y train dataset (196045, 1)
153
153
klaverjas X test dataset (49012, 3)
154
- klaverjas y train dataset (49012, 1)
154
+ klaverjas y test dataset (49012, 1)
155
155
"""
156
156
dataset_name = 'klaverjas'
157
157
os .makedirs (dataset_dir , exist_ok = True )
@@ -184,7 +184,7 @@ def connect(dataset_dir=None):
184
184
connect X train dataset (196045, 127)
185
185
connect y train dataset (196045, 1)
186
186
connect X test dataset (49012, 127)
187
- connect y train dataset (49012, 1)
187
+ connect y test dataset (49012, 1)
188
188
"""
189
189
dataset_name = 'connect'
190
190
os .makedirs (dataset_dir , exist_ok = True )
@@ -223,7 +223,7 @@ def mnist(dataset_dir=None):
223
223
mnist X train dataset (60000, 784)
224
224
mnist y train dataset (60000, 1)
225
225
mnist X test dataset (10000, 784)
226
- mnist y train dataset (10000, 1)
226
+ mnist y test dataset (10000, 1)
227
227
"""
228
228
dataset_name = 'mnist'
229
229
@@ -258,7 +258,7 @@ def sensit(dataset_dir=None):
258
258
sensit X train dataset (196045, 3)
259
259
sensit y train dataset (196045, 1)
260
260
sensit X test dataset (49012, 3)
261
- sensit y train dataset (49012, 1)
261
+ sensit y test dataset (49012, 1)
262
262
"""
263
263
dataset_name = 'sensit'
264
264
os .makedirs (dataset_dir , exist_ok = True )
@@ -285,12 +285,16 @@ def sensit(dataset_dir=None):
285
285
286
286
def covertype (dataset_dir = None ):
287
287
"""
288
-
289
- covertype X train dataset (196045, 3)
290
- covertype y train dataset (196045, 1)
291
- covertype X test dataset (49012, 3)
292
- covertype y train dataset (49012, 1)
293
-
288
+ Abstract: This is the original version of the famous
289
+ covertype dataset in ARFF format.
290
+ Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson
291
+ Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype)
292
+
293
+ Classification task. n_classes = 7.
294
+ covertype X train dataset (390852, 54)
295
+ covertype y train dataset (390852, 1)
296
+ covertype X test dataset (97713, 54)
297
+ covertype y test dataset (97713, 1)
294
298
"""
295
299
dataset_name = 'covertype'
296
300
os .makedirs (dataset_dir , exist_ok = True )
@@ -313,6 +317,41 @@ def covertype(dataset_dir=None):
313
317
return True
314
318
315
319
320
def codrnanorm(dataset_dir=None):
    """
    Abstract: Detection of non-coding RNAs on the basis of predicted secondary
    structure formation free energy change.
    Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews.
    Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets)

    Classification task. n_classes = 2.
    codrnanorm X train dataset (390852, 8)
    codrnanorm y train dataset (390852, 1)
    codrnanorm X test  dataset (97713,  8)
    codrnanorm y test  dataset (97713,  1)

    Fetches the 'codrnaNorm' dataset through ``fetch_openml`` (with
    ``dataset_dir`` as its ``data_home``), splits it 80/20 into train and
    test parts with ``random_state=42``, and writes every part into
    ``dataset_dir`` as a CSV file without header or index, named
    ``codrnanorm_<part>.csv`` where ``<part>`` is one of ``x_train``,
    ``x_test``, ``y_train``, ``y_test``.

    Args:
        dataset_dir: Directory used both as the download cache and as the
            output location of the CSV files. It is created when missing.
            The value is handed straight to ``os.makedirs``, so an actual
            path has to be supplied by the caller.

    Returns:
        True once all four CSV files have been written.
    """
    dataset_name = 'codrnanorm'
    os.makedirs(dataset_dir, exist_ok=True)

    X, y = fetch_openml(name='codrnaNorm', return_X_y=True,
                        as_frame=False, data_home=dataset_dir)
    # Densify with toarray() so pandas receives a plain ndarray rather than
    # the discouraged np.matrix that todense() produces; an already dense
    # result has no toarray() and is passed through untouched.
    if hasattr(X, 'toarray'):
        X = X.toarray()
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)

    logging.info(f'{dataset_name} dataset is downloaded')
    logging.info('reading CSV file...')

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    for data, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        # No stray whitespace in the file name: '<dataset>_<part>.csv'.
        filename = f'{dataset_name}_{name}.csv'
        data.to_csv(os.path.join(dataset_dir, filename),
                    header=False, index=False)
    logging.info(f'dataset {dataset_name} ready.')
    return True
353
+
354
+
316
355
def gisette (dataset_dir = None ):
317
356
"""
318
357
GISETTE is a handwritten digit recognition problem.
@@ -323,7 +362,7 @@ def gisette(dataset_dir=None):
323
362
gisette X train dataset (6000, 5000)
324
363
gisette y train dataset (6000, 1)
325
364
gisette X test dataset (1000, 5000)
326
- gisette y train dataset (1000, 1)
365
+ gisette y test dataset (1000, 1)
327
366
"""
328
367
dataset_name = 'gisette'
329
368
os .makedirs (dataset_dir , exist_ok = True )
0 commit comments