1010from numpy .testing import assert_array_equal
1111from numpy .testing import assert_allclose
1212from numpy .testing import assert_equal
13+ from numpy .testing import assert_raises
14+ from numpy .testing import assert_warns
1315
1416import os
1517import glob
1618import numpy as np
19+ from scipy .sparse import csr_matrix
20+ from scipy .sparse import csc_matrix
21+ from scipy .sparse import coo_matrix
1722from sklearn .model_selection import cross_val_predict
1823from sklearn .model_selection import cross_val_score
1924from sklearn .model_selection import train_test_split
2227from sklearn .metrics import mean_absolute_error
2328from sklearn .metrics import make_scorer
2429from sklearn .linear_model import LinearRegression
25- from sklearn .linear_model import SGDRegressor
30+ from sklearn .linear_model import Ridge
2631from vecstack import stacking
32+ from vecstack .core import model_action
2733
2834n_folds = 5
2935
3440#-------------------------------------------------------------------------------
3541#-------------------------------------------------------------------------------
3642
class MinimalEstimator:
    """Bare-bones estimator stub that deliberately lacks ``get_params``.

    It only stores ``random_state`` and exposes ``fit``, ``predict`` and
    ``predict_proba`` returning constant arrays, so tests can exercise code
    paths for models without the ``get_params`` attribute.
    """

    def __init__(self, random_state=0):
        # Stored as-is; none of the methods below read it.
        self.random_state = random_state

    def fit(self, X, y):
        """Do nothing and return the estimator itself."""
        return self

    def predict(self, X):
        """Return a 1-D array of ones, one value per row of ``X``."""
        n_rows = X.shape[0]
        return np.ones(n_rows)

    def predict_proba(self, X):
        """Return a 1-D array of zeros, one value per row of ``X``."""
        n_rows = X.shape[0]
        return np.zeros(n_rows)
53+
54+ #-------------------------------------------------------------------------------
55+ #-------------------------------------------------------------------------------
56+
3757class TestRegression (unittest .TestCase ):
3858
3959 def tearDown (self ):
@@ -410,7 +430,7 @@ def test_oof_pred_mode_2_models(self):
410430 _ = model .fit (X_train , y_train )
411431 S_test_1_a = model .predict (X_test ).reshape (- 1 , 1 )
412432
413- model = SGDRegressor (random_state = 0 )
433+ model = Ridge (random_state = 0 )
414434 S_train_1_b = cross_val_predict (model , X_train , y = y_train , cv = n_folds ,
415435 n_jobs = 1 , verbose = 0 , method = 'predict' ).reshape (- 1 , 1 )
416436 _ = model .fit (X_train , y_train )
@@ -420,7 +440,7 @@ def test_oof_pred_mode_2_models(self):
420440 S_test_1 = np .c_ [S_test_1_a , S_test_1_b ]
421441
422442 models = [LinearRegression (),
423- SGDRegressor (random_state = 0 )]
443+ Ridge (random_state = 0 )]
424444 S_train_2 , S_test_2 = stacking (models , X_train , y_train , X_test ,
425445 regression = True , n_folds = n_folds , shuffle = False , save_dir = '.' ,
426446 mode = 'oof_pred' , random_state = 0 , verbose = 0 )
@@ -468,12 +488,12 @@ def test_oof_pred_bag_mode_2_models(self):
468488 y_tr = y_train [tr_index ]
469489 X_te = X_train [te_index ]
470490 y_te = y_train [te_index ]
471- model = SGDRegressor (random_state = 0 )
491+ model = Ridge (random_state = 0 )
472492 _ = model .fit (X_tr , y_tr )
473493 S_test_temp [:, fold_counter ] = model .predict (X_test )
474494 S_test_1_b = np .mean (S_test_temp , axis = 1 ).reshape (- 1 , 1 )
475495
476- model = SGDRegressor (random_state = 0 )
496+ model = Ridge (random_state = 0 )
477497 S_train_1_b = cross_val_predict (model , X_train , y = y_train , cv = n_folds ,
478498 n_jobs = 1 , verbose = 0 , method = 'predict' ).reshape (- 1 , 1 )
479499
@@ -482,7 +502,7 @@ def test_oof_pred_bag_mode_2_models(self):
482502
483503
484504 models = [LinearRegression (),
485- SGDRegressor (random_state = 0 )]
505+ Ridge (random_state = 0 )]
486506 S_train_2 , S_test_2 = stacking (models , X_train , y_train , X_test ,
487507 regression = True , n_folds = n_folds , shuffle = False , save_dir = '.' ,
488508 mode = 'oof_pred_bag' , random_state = 0 , verbose = 0 )
@@ -501,7 +521,271 @@ def test_oof_pred_bag_mode_2_models(self):
501521 assert_array_equal (S_train_1 , S_train_3 )
502522 assert_array_equal (S_test_1 , S_test_3 )
503523
524+ #---------------------------------------------------------------------------
525+ # Testing sparse types CSR, CSC, COO
526+ #---------------------------------------------------------------------------
527+
def test_oof_pred_mode_sparse_csr(self):
    """Check 'oof_pred' mode when X_train and X_test are both CSR matrices.

    The reference result is built with ``cross_val_predict`` (train part)
    and a plain fit/predict (test part). It is compared against what
    ``stacking`` returns and against what it saved to the ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, csr_matrix(X_train), y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    _ = model.fit(csr_matrix(X_train), y_train)
    S_test_1 = model.predict(csr_matrix(X_test)).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, csr_matrix(X_test),
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
554+
def test_oof_pred_mode_sparse_csc(self):
    """Check 'oof_pred' mode when X_train and X_test are both CSC matrices.

    The reference result is built with ``cross_val_predict`` (train part)
    and a plain fit/predict (test part). It is compared against what
    ``stacking`` returns and against what it saved to the ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, csc_matrix(X_train), y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    _ = model.fit(csc_matrix(X_train), y_train)
    S_test_1 = model.predict(csc_matrix(X_test)).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, csc_matrix(X_train), y_train, csc_matrix(X_test),
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
581+
def test_oof_pred_mode_sparse_coo(self):
    """Check 'oof_pred' mode when X_train and X_test are both COO matrices.

    The reference result is built with ``cross_val_predict`` (train part)
    and a plain fit/predict (test part). It is compared against what
    ``stacking`` returns and against what it saved to the ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, coo_matrix(X_train), y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    _ = model.fit(coo_matrix(X_train), y_train)
    S_test_1 = model.predict(coo_matrix(X_test)).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, coo_matrix(X_train), y_train, coo_matrix(X_test),
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
608+
609+ #---------------------------------------------------------------------------
610+ # Testing X_train -> CSR, X_test -> COO
611+ #---------------------------------------------------------------------------
612+
def test_oof_pred_mode_sparse_csr_coo(self):
    """Check 'oof_pred' mode with X_train as CSR and X_test as COO.

    The reference result is built with ``cross_val_predict`` (train part)
    and a plain fit/predict (test part). It is compared against what
    ``stacking`` returns and against what it saved to the ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, csr_matrix(X_train), y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    _ = model.fit(csr_matrix(X_train), y_train)
    S_test_1 = model.predict(coo_matrix(X_test)).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, coo_matrix(X_test),
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
639+
640+ #---------------------------------------------------------------------------
641+ # Testing X_train -> CSR, X_test -> Dense
642+ #---------------------------------------------------------------------------
643+
def test_oof_pred_mode_sparse_csr_dense(self):
    """Check 'oof_pred' mode with X_train as CSR and X_test left dense.

    The reference result is built with ``cross_val_predict`` (train part)
    and a plain fit/predict (test part). It is compared against what
    ``stacking`` returns and against what it saved to the ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, csr_matrix(X_train), y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    _ = model.fit(csr_matrix(X_train), y_train)
    S_test_1 = model.predict(X_test).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, X_test,
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
670+
671+ #---------------------------------------------------------------------------
672+ # Testing X_test=None
673+ #---------------------------------------------------------------------------
def test_oof_mode_xtest_is_none(self):
    """Check 'oof' mode when no test set is supplied (``X_test=None``).

    The train part is compared against ``cross_val_predict``; the test part
    is expected to be ``None`` both in the returned value and in the saved
    ``.npy`` file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    S_test_1 = None

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, None,
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # S[1] is expected to be None here, so the saved array must hold Python
    # objects; NumPy >= 1.16.3 refuses to load such files unless pickling is
    # explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
699+
700+ #---------------------------------------------------------------------------
701+ # Testing parameter exceptions
702+ #---------------------------------------------------------------------------
def test_exceptions(self):
    """Verify that invalid arguments make ``stacking`` raise ``ValueError``.

    Also checks that the internal ``model_action`` helper raises
    ``ValueError`` for an unknown ``action`` value.
    """
    # Empty model list
    assert_raises(ValueError, stacking, [], X_train, y_train, X_test)

    # Each entry is a single invalid keyword argument; order matches the
    # sequence of individual checks.
    invalid_kwargs = [
        {'mode': 'abc'},             # Wrong mode
        {'save_dir': './As26bV85'},  # Path does not exist
        {'n_folds': 'A'},            # n_folds is not int
        {'n_folds': 1},              # n_folds is less than 2
        {'verbose': 25},             # Wrong verbose value
    ]
    for bad_kwargs in invalid_kwargs:
        # A fresh estimator is created for every call
        assert_raises(ValueError, stacking, [LinearRegression()],
                      X_train, y_train, X_test, **bad_kwargs)

    # Internal function model_action
    assert_raises(ValueError, model_action, LinearRegression(),
                  X_train, y_train, X_test, sample_weight=None,
                  action='abc', transform=None)
726+
727+ #---------------------------------------------------------------------------
728+ # Testing parameter warnings
729+ #---------------------------------------------------------------------------
def test_warnings(self):
    """Verify that ``stacking`` emits ``UserWarning`` for ignored parameters.

    Parameters specific for classification (``needs_proba``, ``stratified``)
    are ignored if ``regression=True``; each combination must warn.
    """
    classification_only_kwargs = [
        {'needs_proba': True},
        {'stratified': True},
        {'needs_proba': True, 'stratified': True},
    ]
    for ignored_kwargs in classification_only_kwargs:
        # A fresh estimator is created for every call
        assert_warns(UserWarning, stacking, [LinearRegression()],
                     X_train, y_train, X_test, regression=True,
                     **ignored_kwargs)
743+
744+ #---------------------------------------------------------------------------
745+ # Test if model has no 'get_params'
746+ #---------------------------------------------------------------------------
def test_oof_pred_mode_no_get_params(self):
    """Check 'oof_pred' mode with a model that has no ``get_params``.

    ``MinimalEstimator.predict`` always returns ones, so both the train and
    the test parts are expected to be single-column arrays of ones.
    """
    S_train_1 = np.ones(X_train.shape[0]).reshape(-1, 1)
    S_test_1 = np.ones(X_test.shape[0]).reshape(-1, 1)

    models = [MinimalEstimator()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=True, n_folds=n_folds, shuffle=False, save_dir='.',
                                   mode='oof_pred', random_state=0, verbose=0)

    # Load OOF from file
    # Normally if cleaning is performed there is only one .npy file at given moment
    # But if we have no cleaning there may be more than one file so we take the latest
    file_name = sorted(glob.glob('*.npy'))[-1]  # take the latest file
    # The file stores the train and test parts as a pair (read back as S[0] and S[1]).
    # NOTE(review): presumably an object array, which NumPy >= 1.16.3 refuses to
    # load unless pickling is explicitly allowed.
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
770+
771+ #-------------------------------------------------------------------------------
772+ # Test inconsistent data shape or type
773+ #-------------------------------------------------------------------------------
def test_inconsistent_data(self):
    """Verify that ``stacking`` raises ``ValueError`` for malformed targets."""
    # Work on a copy so the module-level y_train is left untouched
    y_with_nan = y_train.copy()
    y_with_nan[0] = np.nan

    bad_targets = [
        y_with_nan,                 # nan or inf in y
        np.c_[y_train, y_train],    # y has two or more columns
        y_train[:10],               # X_train and y_train shape mismatch
    ]
    for bad_y in bad_targets:
        # A fresh estimator is created for every call
        assert_raises(ValueError, stacking, [LinearRegression()],
                      X_train, bad_y, X_test)
788+
505789#-------------------------------------------------------------------------------
506790#-------------------------------------------------------------------------------
507791
0 commit comments