diff --git a/doc/narx.rst b/doc/narx.rst
index e52d445..6fd52a8 100644
--- a/doc/narx.rst
+++ b/doc/narx.rst
@@ -54,7 +54,8 @@ No matter how the discontinuity is caused, :class:`NARX` treats the discontinuou
     >>> import numpy as np
     >>> x0 = np.zeros((3, 2)) # First measurement session has 3 samples with 2 features
     >>> x1 = np.ones((5, 2)) # Second measurement session has 5 samples with 2 features
-    >>> u = np.r_[x0, [[np.nan, np.nan]], x1] # Insert np.nan to break the two measurement sessions
+    >>> max_delay = 2 # Assume the maximum delay for NARX model is 2
+    >>> u = np.r_[x0, [[np.nan, np.nan]]*max_delay, x1] # Insert (at least max_delay number of) np.nan to break the two measurement sessions
 
     It is important to break the different measurement sessions by np.nan, because otherwise,
     the model will assume the time interval between the two measurement sessions is the same as the time interval within a session.
diff --git a/examples/plot_narx.py b/examples/plot_narx.py
index e581172..4fab121 100644
--- a/examples/plot_narx.py
+++ b/examples/plot_narx.py
@@ -108,10 +108,14 @@
 # whose :math:`X` is the nonlinear terms and :math:`y` is the output signal.
 
 from fastcan import FastCan
+from fastcan.utils import mask_missing_values
+
+# Mask out missing values caused by time-shifting
+poly_terms_masked, y_masked = mask_missing_values(poly_terms, y)
 
 selector = FastCan(
     n_features_to_select=4,  # 4 terms should be selected
-).fit(poly_terms, y)
+).fit(poly_terms_masked, y_masked)
 
 support = selector.get_support()
 selected_poly_ids = poly_ids[support]
diff --git a/examples/plot_narx_msa.py b/examples/plot_narx_msa.py
index 7d7466a..737de11 100644
--- a/examples/plot_narx_msa.py
+++ b/examples/plot_narx_msa.py
@@ -83,8 +83,8 @@ def auto_duffing_equation(y, t):
 n_samples = 1000
 
 rng = np.random.default_rng(12345)
-e_train = rng.normal(0, 0.0002, n_samples)
-e_test = rng.normal(0, 0.0002, n_samples)
+e_train = rng.normal(0, 0.0004, n_samples)
+e_test = rng.normal(0, 0.0004, n_samples)
 t = np.linspace(0, dur, n_samples)
 
 sol = odeint(duffing_equation, [0.6, 0.8], t)
@@ -153,12 +153,13 @@ def plot_prediction(ax, t, y_true, y_pred, title):
 # The plot above shows that the NARX model cannot capture the dynamics at
 # the left equilibrium shown in the phase portraits. To improve the performance, let us
 # combine the training and test data for model training to include the dynamics of both
-# equilibria. Here, we need to insert `np.nan` to indicate the model that training data
+# equilibria. Here, we need to insert (at least max_delay number of) `np.nan` to
+# indicate the model that training data
 # and test data are from different measurement sessions. The plot shows that the
 # prediction performance of the NARX on test data has been largely improved.
 
-u_all = np.r_[u_train, [[np.nan]], u_test]
-y_all = np.r_[y_train, [np.nan], y_test]
+u_all = np.r_[u_train, [[np.nan]]*max_delay, u_test]
+y_all = np.r_[y_train, [np.nan]*max_delay, y_test]
 narx_model = make_narx(
     X=u_all,
     y=y_all,
diff --git a/fastcan/narx.py b/fastcan/narx.py
index d541adb..2883c7a 100644
--- a/fastcan/narx.py
+++ b/fastcan/narx.py
@@ -62,10 +62,10 @@ def make_time_shift_features(X, ids):
     >>> X = [[1, 2], [3, 4], [5, 6], [7, 8]]
     >>> ids = [[0, 0], [0, 1], [1, 1]]
     >>> make_time_shift_features(X, ids)
-    array([[1., 1., 2.],
-           [3., 1., 2.],
-           [5., 3., 4.],
-           [7., 5., 6.]])
+    array([[ 1., nan, nan],
+           [ 3.,  1.,  2.],
+           [ 5.,  3.,  4.],
+           [ 7.,  5.,  6.]])
     """
     X = check_array(X, ensure_2d=True, dtype=float, ensure_all_finite="allow-nan")
     ids = check_array(ids, ensure_2d=True, dtype=int)
@@ -74,7 +74,7 @@ def make_time_shift_features(X, ids):
     out = np.zeros([n_samples, n_outputs])
     for i, id_temp in enumerate(ids):
         out[:, i] = np.r_[
-            np.full(id_temp[1], X[0, id_temp[0]]),
+            np.full(id_temp[1], np.nan),
             X[: -id_temp[1] or None, id_temp[0]],
         ]
 
@@ -1413,11 +1413,11 @@ def make_narx(
     >>> print_narx(narx)
     | yid |        Term        |   Coef   |
     =======================================
-    |  0  |     Intercept      |  1.054   |
-    |  0  |    y_hat[k-1,0]    |  0.483   |
-    |  0  |   X[k,0]*X[k,0]    |  0.307   |
-    |  0  | X[k-1,0]*X[k-3,0]  |  1.999   |
-    |  0  |  X[k-2,0]*X[k,1]   |  1.527   |
+    |  0  |     Intercept      |  1.050   |
+    |  0  |    y_hat[k-1,0]    |  0.484   |
+    |  0  |   X[k,0]*X[k,0]    |  0.306   |
+    |  0  | X[k-1,0]*X[k-3,0]  |  2.000   |
+    |  0  |  X[k-2,0]*X[k,1]   |  1.528   |
     """
     X = check_array(X, dtype=float, ensure_2d=True, ensure_all_finite="allow-nan")
     y = check_array(y, dtype=float, ensure_2d=False, ensure_all_finite="allow-nan")
diff --git a/fastcan/utils.py b/fastcan/utils.py
index 47b9c0f..160d53e 100644
--- a/fastcan/utils.py
+++ b/fastcan/utils.py
@@ -168,4 +168,7 @@ def mask_missing_values(*arrays, return_mask=False):
     mask_valid = np.all(np.isfinite(np.c_[arrays]), axis=1)
     if return_mask:
         return mask_valid
+    masked_arrays = [_safe_indexing(x, mask_valid) for x in arrays]
+    if len(masked_arrays) == 1:
+        return masked_arrays[0]
     return [_safe_indexing(x, mask_valid) for x in arrays]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8d57c73..0a634e1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -66,6 +66,10 @@ def test_mask_missing():
     b = rng.random((100, 2))
     c = rng.random(100)
     a[10, 0] = np.nan
+    a_masked = mask_missing_values(a)
+    mask_valid = mask_missing_values(a, return_mask=True)
+    assert a_masked.shape == (99, 10)
+    assert_array_equal(actual=a_masked, desired=a[mask_valid])
     b[20, 1] = np.nan
     c[30] = np.nan
     a_masked, b_masked, c_mask = mask_missing_values(a, b, c)