@@ -334,6 +334,40 @@ def epsilon(dataset_dir: Path) -> bool:
334
334
return True
335
335
336
336
337
def epsilon_30K(dataset_dir: Path) -> bool:
    """
    Epsilon dataset, limited to the first 30000 training samples
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

    Classification task. n_classes = 2.
    epsilon_30K x train dataset (30000, 2000)
    epsilon_30K y train dataset (30000,)

    The compressed training file is downloaded into ``dataset_dir`` unless
    it is already present there, parsed as float32, truncated to the first
    30000 rows and saved as ``epsilon_30K_x_train.npy`` and
    ``epsilon_30K_y_train.npy``. Labels are stored exactly as parsed from
    the file; no remapping is applied here.

    Parameters
    ----------
    dataset_dir : Path
        Directory for the downloaded archive and the resulting ``.npy``
        files. It is created if it does not exist.

    Returns
    -------
    bool
        Always ``True`` once the arrays have been written.
    """
    dataset_name = 'epsilon_30K'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
                '/epsilon_normalized.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))

    num_train, dtype = 30000, np.float32
    # Reuse a previously downloaded archive instead of fetching it again.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    X_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=dtype)
    # Slice the sparse matrix BEFORE densifying: only the rows that are kept
    # get converted, so the full dense matrix is never materialized.
    X_train = X_train[:num_train].toarray()
    y_train = y_train[:num_train]

    for data, name in zip((X_train, y_train),
                          ('x_train', 'y_train')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
369
+
370
+
337
371
def fraud (dataset_dir : Path ) -> bool :
338
372
"""
339
373
Credit Card Fraud Detection contest
@@ -688,6 +722,51 @@ def skin_segmentation(dataset_dir: Path) -> bool:
688
722
return True
689
723
690
724
725
def cifar_binary(dataset_dir: Path) -> bool:
    """
    Cifar dataset from LIBSVM Datasets (
    https://www.cs.toronto.edu/~kriz/cifar.html#cifar)
    TaskType: Classification
    cifar_binary x train dataset (50000, 3072)
    cifar_binary y train dataset (50000,)
    cifar_binary x test dataset (10000, 3072)
    cifar_binary y test dataset (10000,)

    The multiclass labels are collapsed into a binary target: every label
    greater than 0 becomes 1, everything else becomes 0.

    The train and test archives are downloaded into ``dataset_dir`` unless
    they are already present there, parsed as float32, converted to dense
    arrays and saved as ``cifar_binary_{x,y}_{train,test}.npy``.

    Parameters
    ----------
    dataset_dir : Path
        Directory for the downloaded archives and the resulting ``.npy``
        files. It is created if it does not exist.

    Returns
    -------
    bool
        Always ``True`` once the arrays have been written.
    """
    dataset_name = 'cifar_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))

    # Reuse previously downloaded archives instead of fetching them again.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=np.float32)

    if not os.path.isfile(local_url_test):
        logging.info(f'Started loading {dataset_name}, test')
        retrieve(url_test, local_url_test)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_test, y_test = load_svmlight_file(local_url_test,
                                        dtype=np.float32)

    # Densify the features and binarize the labels (label > 0 -> 1, else 0).
    x_train = x_train.toarray()
    y_train = (y_train > 0).astype(int)

    x_test = x_test.toarray()
    y_test = (y_test > 0).astype(int)

    for data, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
768
+
769
+
691
770
def susy (dataset_dir : Path ) -> bool :
692
771
"""
693
772
SUSY dataset from UCI machine learning repository (
0 commit comments