Merge pull request #35 from vecxoz/dev

vecxoz · web-flow · commit 1698f7bd2565 · 2019-10-30T11:27:48.000+02:00
Fill gaps in doc (X_test can be None). Add some tests
diff --git a/tests/test_func_api_regression.py b/tests/test_func_api_regression.py
@@ -828,9 +828,13 @@ def test_exceptions(self):
                       X_train, y_train, X_test, verbose=25)
                       
         # Internal function model_action
-        assert_raises(ValueError, model_action, LinearRegression(), 
-                      X_train, y_train, X_test, sample_weight=None, 
+        assert_raises(ValueError, model_action, LinearRegression(),
+                      X_train, y_train, X_test, sample_weight=None,
                       action='abc', transform=None)
+
+        # X_test is None when mode != 'oof'
+        assert_raises(ValueError, stacking, [LinearRegression()],
+                      X_train, y_train, None, mode='oof_pred_bag')
                       
     #---------------------------------------------------------------------------
     # Testing parameter warnings
@@ -940,6 +944,80 @@ def test_small_input(self):
         assert_array_equal(S_train_1, S_train_3)
         assert_array_equal(S_test_1, S_test_3)
 
+    #---------------------------------------------------------------------------
+    # Mode 'oof', X_test=None
+    #---------------------------------------------------------------------------
+
+    def test_oof_mode_with_none(self):
+        # mode='oof' with X_test=None: OOF must match cross_val_predict and the returned S_test must be None
+        model = LinearRegression()
+        S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
+            n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
+        S_test_1 = None
+
+        models = [LinearRegression()]
+        S_train_2, S_test_2 = stacking(models, X_train, y_train, None,
+            regression = True, n_folds = n_folds, shuffle = False, save_dir=temp_dir,
+            mode = 'oof', random_state = 0, verbose = 0)
+
+        # Load OOF from file
+        # Normally, if cleaning is performed, there is only one .npy file at any given moment
+        # But if we have no cleaning there may be more than one file, so we take the latest
+        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # last name in sorted order; assumes names sort chronologically - TODO confirm
+        S = np.load(file_name, allow_pickle=True)
+        S_train_3 = S[0]
+        S_test_3 = S[1]
+
+        assert_array_equal(S_train_1, S_train_2)
+        assert_array_equal(S_test_1, S_test_2)
+
+        assert_array_equal(S_train_1, S_train_3)
+        assert_array_equal(S_test_1, S_test_3)
+
+    #---------------------------------------------------------------------------
+    # All default values (mode='oof_pred_bag')
+    #---------------------------------------------------------------------------
+
+    def test_all_defaults(self):
+        # stacking() called with only models, data and save_dir must match a manual 4-fold OOF + fold-averaged test prediction
+        # Override global n_folds=5, because default value in stacking function is 4
+        n_folds=4
+
+        S_test_temp = np.zeros((X_test.shape[0], n_folds))
+        kf = KFold(n_splits = n_folds, shuffle = False, random_state = 0) # NOTE(review): newer scikit-learn rejects random_state when shuffle=False - confirm against supported versions
+        for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
+            # Split data and target
+            X_tr = X_train[tr_index]
+            y_tr = y_train[tr_index]
+            X_te = X_train[te_index] # not used below
+            y_te = y_train[te_index] # not used below
+            model = LinearRegression()
+            _ = model.fit(X_tr, y_tr)
+            S_test_temp[:, fold_counter] = model.predict(X_test) # one column of test predictions per fold
+        S_test_1 = np.mean(S_test_temp, axis = 1).reshape(-1, 1) # average the per-fold predictions into one column
+
+        model = LinearRegression()
+        S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
+            n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)
+
+        models = [LinearRegression()]
+        S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test, save_dir=temp_dir)
+
+        # Load OOF from file
+        # Normally, if cleaning is performed, there is only one .npy file at any given moment
+        # But if we have no cleaning there may be more than one file, so we take the latest
+        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # last name in sorted order; assumes names sort chronologically - TODO confirm
+        S = np.load(file_name, allow_pickle=True)
+        S_train_3 = S[0]
+        S_test_3 = S[1]
+
+        assert_array_equal(S_train_1, S_train_2)
+        assert_array_equal(S_test_1, S_test_2)
+
+        assert_array_equal(S_train_1, S_train_3)
+        assert_array_equal(S_test_1, S_test_3)
+
+
 #-------------------------------------------------------------------------------
 #-------------------------------------------------------------------------------
 
diff --git a/vecstack/core.py b/vecstack/core.py
@@ -151,13 +151,14 @@ def stacking(models, X_train, y_train, X_test,
     y_train : numpy 1d array
         Target values
         
-    X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features]
+    X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features], or None
         Test data
+        Note: X_test can be set to None when mode='oof'
         
-    sample_weight : numpy array of shape [n_train_samples]
+    sample_weight : numpy array of shape [n_train_samples], default None
         Individual weights for each sample (passed to fit method of the model).
-        Note: sample_weight has length of full training set X_train and it would be
-        split automatically for each fold.
+        Note: sample_weight must have the same length as full training set X_train.
+            It will be split automatically for each fold.
         
     regression : boolean, default True
         If True - perform stacking for regression task, 
@@ -188,7 +189,7 @@ def stacking(models, X_train, y_train, X_test,
         
     mode: str, default 'oof_pred_bag' (alias 'A')
         Note: for detailes see terminology below
-        'oof' - return only oof
+        'oof' - return only oof. X_test can be set to None
         'oof_pred' (alias 'B') - return oof and pred
         'oof_pred_bag' (alias 'A') - return oof and bagged pred
         'pred' - return pred only
@@ -406,6 +407,9 @@ def your_metric(y_true, y_pred):
     # If empty <models> list
     if 0 == len(models):
         raise ValueError('List of models is empty')
+    # X_test can be None only if mode='oof'
+    if X_test is None and mode != 'oof':
+        raise ValueError("X_test can be None only if mode='oof'")
     # Check arrays
     # y_train and sample_weight must be 1d ndarrays (i.e. row, not column)
     X_train, y_train = check_X_y(X_train,