Skip to content

Commit b8bf538

Browse files
authored
Add new tests
1 parent 56adaa2 commit b8bf538

File tree

1 file changed

+290
-6
lines changed

1 file changed

+290
-6
lines changed

tests/test_vecstack_regression.py

Lines changed: 290 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,15 @@
1010
from numpy.testing import assert_array_equal
1111
from numpy.testing import assert_allclose
1212
from numpy.testing import assert_equal
13+
from numpy.testing import assert_raises
14+
from numpy.testing import assert_warns
1315

1416
import os
1517
import glob
1618
import numpy as np
19+
from scipy.sparse import csr_matrix
20+
from scipy.sparse import csc_matrix
21+
from scipy.sparse import coo_matrix
1722
from sklearn.model_selection import cross_val_predict
1823
from sklearn.model_selection import cross_val_score
1924
from sklearn.model_selection import train_test_split
@@ -22,8 +27,9 @@
2227
from sklearn.metrics import mean_absolute_error
2328
from sklearn.metrics import make_scorer
2429
from sklearn.linear_model import LinearRegression
25-
from sklearn.linear_model import SGDRegressor
30+
from sklearn.linear_model import Ridge
2631
from vecstack import stacking
32+
from vecstack.core import model_action
2733

2834
n_folds = 5
2935

@@ -34,6 +40,20 @@
3440
#-------------------------------------------------------------------------------
3541
#-------------------------------------------------------------------------------
3642

43+
class MinimalEstimator:
44+
"""Has no get_params attribute"""
45+
def __init__(self, random_state=0):
46+
self.random_state = random_state
47+
def fit(self, X, y):
48+
return self
49+
def predict(self, X):
50+
return np.ones(X.shape[0])
51+
def predict_proba(self, X):
52+
return np.zeros(X.shape[0])
53+
54+
#-------------------------------------------------------------------------------
55+
#-------------------------------------------------------------------------------
56+
3757
class TestRegression(unittest.TestCase):
3858

3959
def tearDown(self):
@@ -410,7 +430,7 @@ def test_oof_pred_mode_2_models(self):
410430
_ = model.fit(X_train, y_train)
411431
S_test_1_a = model.predict(X_test).reshape(-1, 1)
412432

413-
model = SGDRegressor(random_state = 0)
433+
model = Ridge(random_state = 0)
414434
S_train_1_b = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
415435
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
416436
_ = model.fit(X_train, y_train)
@@ -420,7 +440,7 @@ def test_oof_pred_mode_2_models(self):
420440
S_test_1 = np.c_[S_test_1_a, S_test_1_b]
421441

422442
models = [LinearRegression(),
423-
SGDRegressor(random_state = 0)]
443+
Ridge(random_state = 0)]
424444
S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
425445
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
426446
mode = 'oof_pred', random_state = 0, verbose = 0)
@@ -468,12 +488,12 @@ def test_oof_pred_bag_mode_2_models(self):
468488
y_tr = y_train[tr_index]
469489
X_te = X_train[te_index]
470490
y_te = y_train[te_index]
471-
model = SGDRegressor(random_state = 0)
491+
model = Ridge(random_state = 0)
472492
_ = model.fit(X_tr, y_tr)
473493
S_test_temp[:, fold_counter] = model.predict(X_test)
474494
S_test_1_b = np.mean(S_test_temp, axis = 1).reshape(-1, 1)
475495

476-
model = SGDRegressor(random_state = 0)
496+
model = Ridge(random_state = 0)
477497
S_train_1_b = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
478498
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
479499

@@ -482,7 +502,7 @@ def test_oof_pred_bag_mode_2_models(self):
482502

483503

484504
models = [LinearRegression(),
485-
SGDRegressor(random_state = 0)]
505+
Ridge(random_state = 0)]
486506
S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
487507
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
488508
mode = 'oof_pred_bag', random_state = 0, verbose = 0)
@@ -501,7 +521,271 @@ def test_oof_pred_bag_mode_2_models(self):
501521
assert_array_equal(S_train_1, S_train_3)
502522
assert_array_equal(S_test_1, S_test_3)
503523

524+
#---------------------------------------------------------------------------
525+
# Testing sparse types CSR, CSC, COO
526+
#---------------------------------------------------------------------------
527+
528+
def test_oof_pred_mode_sparse_csr(self):
529+
530+
model = LinearRegression()
531+
S_train_1 = cross_val_predict(model, csr_matrix(X_train), y = y_train, cv = n_folds,
532+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
533+
_ = model.fit(csr_matrix(X_train), y_train)
534+
S_test_1 = model.predict(csr_matrix(X_test)).reshape(-1, 1)
535+
536+
models = [LinearRegression()]
537+
S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, csr_matrix(X_test),
538+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
539+
mode = 'oof_pred', random_state = 0, verbose = 0)
540+
541+
# Load OOF from file
542+
# Normally if cleaning is performed there is only one .npy file at given moment
543+
# But if we have no cleaning there may be more than one file so we take the latest
544+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
545+
S = np.load(file_name)
546+
S_train_3 = S[0]
547+
S_test_3 = S[1]
548+
549+
assert_array_equal(S_train_1, S_train_2)
550+
assert_array_equal(S_test_1, S_test_2)
551+
552+
assert_array_equal(S_train_1, S_train_3)
553+
assert_array_equal(S_test_1, S_test_3)
554+
555+
def test_oof_pred_mode_sparse_csc(self):
556+
557+
model = LinearRegression()
558+
S_train_1 = cross_val_predict(model, csc_matrix(X_train), y = y_train, cv = n_folds,
559+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
560+
_ = model.fit(csc_matrix(X_train), y_train)
561+
S_test_1 = model.predict(csc_matrix(X_test)).reshape(-1, 1)
562+
563+
models = [LinearRegression()]
564+
S_train_2, S_test_2 = stacking(models, csc_matrix(X_train), y_train, csc_matrix(X_test),
565+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
566+
mode = 'oof_pred', random_state = 0, verbose = 0)
567+
568+
# Load OOF from file
569+
# Normally if cleaning is performed there is only one .npy file at given moment
570+
# But if we have no cleaning there may be more than one file so we take the latest
571+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
572+
S = np.load(file_name)
573+
S_train_3 = S[0]
574+
S_test_3 = S[1]
575+
576+
assert_array_equal(S_train_1, S_train_2)
577+
assert_array_equal(S_test_1, S_test_2)
578+
579+
assert_array_equal(S_train_1, S_train_3)
580+
assert_array_equal(S_test_1, S_test_3)
581+
582+
def test_oof_pred_mode_sparse_coo(self):
583+
584+
model = LinearRegression()
585+
S_train_1 = cross_val_predict(model, coo_matrix(X_train), y = y_train, cv = n_folds,
586+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
587+
_ = model.fit(coo_matrix(X_train), y_train)
588+
S_test_1 = model.predict(coo_matrix(X_test)).reshape(-1, 1)
589+
590+
models = [LinearRegression()]
591+
S_train_2, S_test_2 = stacking(models, coo_matrix(X_train), y_train, coo_matrix(X_test),
592+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
593+
mode = 'oof_pred', random_state = 0, verbose = 0)
594+
595+
# Load OOF from file
596+
# Normally if cleaning is performed there is only one .npy file at given moment
597+
# But if we have no cleaning there may be more than one file so we take the latest
598+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
599+
S = np.load(file_name)
600+
S_train_3 = S[0]
601+
S_test_3 = S[1]
602+
603+
assert_array_equal(S_train_1, S_train_2)
604+
assert_array_equal(S_test_1, S_test_2)
605+
606+
assert_array_equal(S_train_1, S_train_3)
607+
assert_array_equal(S_test_1, S_test_3)
608+
609+
#---------------------------------------------------------------------------
610+
# Testing X_train -> CSR, X_test -> COO
611+
#---------------------------------------------------------------------------
612+
613+
def test_oof_pred_mode_sparse_csr_coo(self):
614+
615+
model = LinearRegression()
616+
S_train_1 = cross_val_predict(model, csr_matrix(X_train), y = y_train, cv = n_folds,
617+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
618+
_ = model.fit(csr_matrix(X_train), y_train)
619+
S_test_1 = model.predict(coo_matrix(X_test)).reshape(-1, 1)
620+
621+
models = [LinearRegression()]
622+
S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, coo_matrix(X_test),
623+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
624+
mode = 'oof_pred', random_state = 0, verbose = 0)
625+
626+
# Load OOF from file
627+
# Normally if cleaning is performed there is only one .npy file at given moment
628+
# But if we have no cleaning there may be more than one file so we take the latest
629+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
630+
S = np.load(file_name)
631+
S_train_3 = S[0]
632+
S_test_3 = S[1]
633+
634+
assert_array_equal(S_train_1, S_train_2)
635+
assert_array_equal(S_test_1, S_test_2)
636+
637+
assert_array_equal(S_train_1, S_train_3)
638+
assert_array_equal(S_test_1, S_test_3)
639+
640+
#---------------------------------------------------------------------------
641+
# Testing X_train -> CSR, X_test -> Dense
642+
#---------------------------------------------------------------------------
643+
644+
def test_oof_pred_mode_sparse_csr_dense(self):
645+
646+
model = LinearRegression()
647+
S_train_1 = cross_val_predict(model, csr_matrix(X_train), y = y_train, cv = n_folds,
648+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
649+
_ = model.fit(csr_matrix(X_train), y_train)
650+
S_test_1 = model.predict(X_test).reshape(-1, 1)
651+
652+
models = [LinearRegression()]
653+
S_train_2, S_test_2 = stacking(models, csr_matrix(X_train), y_train, X_test,
654+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
655+
mode = 'oof_pred', random_state = 0, verbose = 0)
656+
657+
# Load OOF from file
658+
# Normally if cleaning is performed there is only one .npy file at given moment
659+
# But if we have no cleaning there may be more than one file so we take the latest
660+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
661+
S = np.load(file_name)
662+
S_train_3 = S[0]
663+
S_test_3 = S[1]
664+
665+
assert_array_equal(S_train_1, S_train_2)
666+
assert_array_equal(S_test_1, S_test_2)
504667

668+
assert_array_equal(S_train_1, S_train_3)
669+
assert_array_equal(S_test_1, S_test_3)
670+
671+
#---------------------------------------------------------------------------
672+
# Testing X_test=None
673+
#---------------------------------------------------------------------------
674+
def test_oof_mode_xtest_is_none(self):
675+
676+
model = LinearRegression()
677+
S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
678+
n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
679+
S_test_1 = None
680+
681+
models = [LinearRegression()]
682+
S_train_2, S_test_2 = stacking(models, X_train, y_train, None,
683+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
684+
mode = 'oof', random_state = 0, verbose = 0)
685+
686+
# Load OOF from file
687+
# Normally if cleaning is performed there is only one .npy file at given moment
688+
# But if we have no cleaning there may be more than one file so we take the latest
689+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
690+
S = np.load(file_name)
691+
S_train_3 = S[0]
692+
S_test_3 = S[1]
693+
694+
assert_array_equal(S_train_1, S_train_2)
695+
assert_array_equal(S_test_1, S_test_2)
696+
697+
assert_array_equal(S_train_1, S_train_3)
698+
assert_array_equal(S_test_1, S_test_3)
699+
700+
#---------------------------------------------------------------------------
701+
# Testing parameter exceptions
702+
#---------------------------------------------------------------------------
703+
def test_exceptions(self):
704+
# Empty model list
705+
assert_raises(ValueError, stacking, [], X_train, y_train, X_test)
706+
# Wrong mode
707+
assert_raises(ValueError, stacking, [LinearRegression()],
708+
X_train, y_train, X_test, mode='abc')
709+
# Path does not exist
710+
assert_raises(ValueError, stacking, [LinearRegression()],
711+
X_train, y_train, X_test, save_dir='./As26bV85')
712+
# n_folds is not int
713+
assert_raises(ValueError, stacking, [LinearRegression()],
714+
X_train, y_train, X_test, n_folds='A')
715+
# n_folds is less than 2
716+
assert_raises(ValueError, stacking, [LinearRegression()],
717+
X_train, y_train, X_test, n_folds=1)
718+
# Wrong verbose value
719+
assert_raises(ValueError, stacking, [LinearRegression()],
720+
X_train, y_train, X_test, verbose=25)
721+
722+
# Internal function model_action
723+
assert_raises(ValueError, model_action, LinearRegression(),
724+
X_train, y_train, X_test, sample_weight=None,
725+
action='abc', transform=None)
726+
727+
#---------------------------------------------------------------------------
728+
# Testing parameter warnings
729+
#---------------------------------------------------------------------------
730+
def test_warnings(self):
731+
# Parameters specific for classification are ignored if regression=True
732+
assert_warns(UserWarning, stacking, [LinearRegression()],
733+
X_train, y_train, X_test, regression=True,
734+
needs_proba=True)
735+
736+
assert_warns(UserWarning, stacking, [LinearRegression()],
737+
X_train, y_train, X_test, regression=True,
738+
stratified=True)
739+
740+
assert_warns(UserWarning, stacking, [LinearRegression()],
741+
X_train, y_train, X_test, regression=True,
742+
needs_proba=True, stratified=True)
743+
744+
#---------------------------------------------------------------------------
745+
# Test if model has no 'get_params'
746+
#---------------------------------------------------------------------------
747+
def test_oof_pred_mode_no_get_params(self):
748+
749+
S_train_1 = np.ones(X_train.shape[0]).reshape(-1, 1)
750+
S_test_1 = np.ones(X_test.shape[0]).reshape(-1, 1)
751+
752+
models = [MinimalEstimator()]
753+
S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
754+
regression = True, n_folds = n_folds, shuffle = False, save_dir = '.',
755+
mode = 'oof_pred', random_state = 0, verbose = 0)
756+
757+
# Load OOF from file
758+
# Normally if cleaning is performed there is only one .npy file at given moment
759+
# But if we have no cleaning there may be more than one file so we take the latest
760+
file_name = sorted(glob.glob('*.npy'))[-1] # take the latest file
761+
S = np.load(file_name)
762+
S_train_3 = S[0]
763+
S_test_3 = S[1]
764+
765+
assert_array_equal(S_train_1, S_train_2)
766+
assert_array_equal(S_test_1, S_test_2)
767+
768+
assert_array_equal(S_train_1, S_train_3)
769+
assert_array_equal(S_test_1, S_test_3)
770+
771+
#-------------------------------------------------------------------------------
772+
# Test inconsistent data shape or type
773+
#-------------------------------------------------------------------------------
774+
def test_inconsistent_data(self):
775+
# nan or inf in y
776+
y_train_nan = y_train.copy()
777+
y_train_nan[0] = np.nan
778+
assert_raises(ValueError, stacking, [LinearRegression()],
779+
X_train, y_train_nan, X_test)
780+
781+
# y has two or more columns
782+
assert_raises(ValueError, stacking, [LinearRegression()],
783+
X_train, np.c_[y_train, y_train], X_test)
784+
785+
# X_train and y_train shape mismatch
786+
assert_raises(ValueError, stacking, [LinearRegression()],
787+
X_train, y_train[:10], X_test)
788+
505789
#-------------------------------------------------------------------------------
506790
#-------------------------------------------------------------------------------
507791

0 commit comments

Comments
 (0)