diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
index d231b72..3cb6852 100644
--- a/.github/workflows/emscripten.yml
+++ b/.github/workflows/emscripten.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5
       - name: Build WASM wheel
-        uses: pypa/cibuildwheel@v3.2.1
+        uses: pypa/cibuildwheel@v3.3.0
         env:
           CIBW_PLATFORM: pyodide
       - name: Upload package
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
index 01ea75e..457d38d 100644
--- a/.github/workflows/wheel.yml
+++ b/.github/workflows/wheel.yml
@@ -33,7 +33,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5
       - name: Build wheels
-        uses: pypa/cibuildwheel@v3.2.1
+        uses: pypa/cibuildwheel@v3.3.0
         env:
           CIBW_SKIP: "*_i686 *_ppc64le *_s390x *_universal2 *-musllinux_* cp314t*"
           CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9"
diff --git a/doc/narx.rst b/doc/narx.rst
index db4d33d..dc379d2 100644
--- a/doc/narx.rst
+++ b/doc/narx.rst
@@ -46,10 +46,12 @@ The discontinuity can be caused by
    lasts for one hour at 1 Hz, and another measurement is carried out at Tuesday and lasts for two hours at 1 Hz
 #. Missing values
 
-No matter how the discontinuity is caused, :class:`NARX` treats the discontinuous time series as multiple pieces of continuous time series in the same way.
+As a result, there are two ways to inform :class:`NARX` the training data from different measurement sessions.
+From version 0.5, the difference measurement sessions can be set by `session_sizes` in :meth:`NARX.fit` and :func:`make_narx`.
+Alternatively, different measurement sessions can also be notified by inserting `np.nan` to break the data.
+The following examples show how to notify :class:`NARX` the training data from different measurement sessions.
 
-.. note::
-    It is easy to notify :class:`NARX` that the time series data are from different measurement sessions by inserting np.nan to break the data. For example,
+.. code-block:: python
 
     >>> import numpy as np
     >>> x0 = np.zeros((3, 2)) # First measurement session
@@ -57,9 +59,18 @@ No matter how the discontinuity is caused, :class:`NARX` treats the discontinuou
     >>> max_delay = 2 # Assume the maximum delay for NARX model is 2
     >>> # Insert (at least max_delay number of) np.nan to break the two measurement sessions
     >>> u = np.r_[x0, [[np.nan, np.nan]]*max_delay, x1]
+    >>> # Or use session_sizes to break the two measurement sessions
+    >>> session_sizes = [3, 5]
 
-    It is important to break the different measurement sessions by np.nan, because otherwise,
-    the model will assume the time interval between the two measurement sessions is the same as the time interval within a session.
+.. note::
+    It is important to separate different measurement sessions using either method.
+    Otherwise, the model will incorrectly assume that the time interval between two measurement sessions
+    is the same as the sampling interval within a session.
+
+    Actually, the two methods will result in the exact same models when `X` is not None or multiple-step-ahead prediction is not used for training.
+    If `X` is None, i.e. Auto-Regression models, and multiple-step-ahead prediction is used for training, only `session_sizes` can
+    separate the two measurement sessions.
+    Because when using multiple-step-ahead prediction, the optimiser relies on the missing values in inputs `X` rather than output `y`, while Auto-Regression models do not have input `X`.
 
 
 One-step-ahead and multiple-step-ahead prediction
diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst
index 68c9d73..1f64ed1 100644
--- a/doc/unsupervised.rst
+++ b/doc/unsupervised.rst
@@ -17,6 +17,8 @@ features, which maximize the sum of the squared canonical correlation (SSC) with
 the principal components (PCs) acquired from PCA of the feature matrix :math:`X` [1]_.
 See the example below.
 
+.. code-block:: python
+
     >>> from sklearn.decomposition import PCA
     >>> from sklearn import datasets
     >>> from fastcan import FastCan
diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py
index f32abe1..71a7e14 100644
--- a/fastcan/narx/_base.py
+++ b/fastcan/narx/_base.py
@@ -162,11 +162,14 @@ def __init__(
         {
             "X": [None, "array-like"],
             "coef_init": [None, StrOptions({"one_step_ahead"}), "array-like"],
-            "sample_weight": ["array-like", None],
+            "sample_weight": [None, "array-like"],
+            "session_sizes": [None, "array-like"],
         },
         prefer_skip_nested_validation=True,
     )
-    def fit(self, X, y, sample_weight=None, coef_init=None, **params):
+    def fit(
+        self, X, y, sample_weight=None, coef_init=None, session_sizes=None, **params
+    ):
         """
         Fit narx model.
 
@@ -194,6 +197,13 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
                 When coef_init is `one_step_ahead`, the model will be trained as a
                 Multi-Step-Ahead NARX, rather than a One-Step-Ahead NARX.
 
+        session_sizes : array-like of shape (n_sessions,), default=None
+            The sizes of measurement sessions for time-series.
+            The sum of session_sizes should be equal to n_samples.
+            If None, the whole data is treated as one session.
+
+            .. versionadded:: 0.5
+
         **params : dict
             Keyword arguments passed to
             `scipy.optimize.least_squares`.
@@ -220,9 +230,6 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
             X = np.empty((len(y), 0), dtype=float, order="C")  # Create 0 feature input
         else:
             check_consistent_length(X, y)
-        sample_weight = _check_sample_weight(
-            sample_weight, X, dtype=X.dtype, ensure_non_negative=True
-        )
         # store the number of dimension of the target to predict an array of
         # similar shape at predict
         self._y_ndim = y.ndim
@@ -231,6 +238,12 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
         self.n_outputs_ = y.shape[1]
         n_samples, n_features = X.shape
 
+        sample_weight = _check_sample_weight(
+            sample_weight, X, dtype=X.dtype, ensure_non_negative=True
+        )
+
+        session_sizes_cumsum = _validate_session_sizes(session_sizes, n_samples)
+
         if self.feat_ids is None:
             if n_features == 0:
                 feat_ids_ = make_poly_ids(self.n_outputs_, 1) - 1
@@ -295,8 +308,13 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
             time_shift_ids, poly_ids = fd2tp(self.feat_ids_, self.delay_ids_)
             xy_hstack = np.c_[X, y]
             osa_narx = LinearRegression(fit_intercept=self.fit_intercept)
-            time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids)
-            poly_terms = make_poly_features(time_shift_vars, poly_ids)
+            poly_terms = _prepare_poly_terms(
+                xy_hstack,
+                time_shift_ids,
+                poly_ids,
+                session_sizes_cumsum,
+                self.max_delay_,
+            )
             # Remove missing values
             poly_terms_masked, y_masked, sample_weight_masked = mask_missing_values(
                 poly_terms, y, sample_weight
@@ -359,6 +377,7 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
                 self.output_ids_,
                 self.fit_intercept,
                 sample_weight_sqrt,
+                session_sizes_cumsum,
                 grad_yyd_ids,
                 grad_delay_ids,
                 grad_coef_ids,
@@ -422,6 +441,7 @@ def _loss(
         output_ids,
         fit_intercept,
         sample_weight_sqrt,
+        session_sizes_cumsum,
         *args,
     ):
         # Sum of squared errors
@@ -442,6 +462,7 @@ def _loss(
             feat_ids,
             delay_ids,
             output_ids,
+            session_sizes_cumsum,
             y_hat,
         )
 
@@ -461,6 +482,7 @@ def _grad(
         output_ids,
         fit_intercept,
         sample_weight_sqrt,
+        session_sizes_cumsum,
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -484,6 +506,7 @@ def _grad(
             feat_ids,
             delay_ids,
             output_ids,
+            session_sizes_cumsum,
             y_hat,
         )
 
@@ -510,6 +533,7 @@ def _grad(
             grad_delay_ids,
             grad_coef_ids,
             grad_feat_ids,
+            session_sizes_cumsum,
             dydx,
             dcf,
         )
@@ -587,7 +611,9 @@ def predict(self, X, y_init=None):
                     f"`y_init` should have {self.n_outputs_} outputs "
                     f"but got {y_init.shape[1]}."
                 )
-        y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=float)
+        n_samples = X.shape[0]
+        y_hat = np.zeros((n_samples, self.n_outputs_), dtype=float)
+        session_sizes_cumsum = np.array([n_samples], dtype=np.int32)
         _predict(
             X,
             y_init,
@@ -596,6 +622,7 @@ def predict(self, X, y_init=None):
             self.feat_ids_,
             self.delay_ids_,
             self.output_ids_,
+            session_sizes_cumsum,
             y_hat,
         )
         if self._y_ndim == 1:
@@ -606,3 +633,34 @@ def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.allow_nan = True
         return tags
+
+
+def _validate_session_sizes(session_sizes, n_samples):
+    if session_sizes is None:
+        return np.array([n_samples], dtype=np.int32)
+    session_sizes = column_or_1d(
+        session_sizes,
+        dtype=np.int32,
+        warn=True,
+    )
+    if (session_sizes <= 0).any():
+        raise ValueError(
+            "All elements of session_sizes should be positive, "
+            f"but got {session_sizes}."
+        )
+    if session_sizes.sum() != n_samples:
+        raise ValueError(
+            "The sum of session_sizes should be equal to n_samples, "
+            f"but got {session_sizes.sum()} != {n_samples}."
+        )
+    return np.cumsum(session_sizes, dtype=np.int32)
+
+
+def _prepare_poly_terms(
+    xy_hstack, time_shift_ids, poly_ids, session_sizes_cumsum, max_delay
+):
+    time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids)
+    for start in session_sizes_cumsum[:-1]:
+        time_shift_vars[start : start + max_delay] = np.nan
+    poly_terms = make_poly_features(time_shift_vars, poly_ids)
+    return poly_terms
diff --git a/fastcan/narx/_narx_fast.pyx b/fastcan/narx/_narx_fast.pyx
index a1e72e1..4b49187 100644
--- a/fastcan/narx/_narx_fast.pyx
+++ b/fastcan/narx/_narx_fast.pyx
@@ -122,14 +122,15 @@ cdef inline void _update_dcf(
 
 @final
 cpdef void _predict(
-    const double[:, ::1] X,             # IN
-    const double[:, ::1] y_ref,         # IN
-    const double[::1] coef,             # IN
-    const double[::1] intercept,        # IN
-    const int[:, ::1] feat_ids,         # IN
-    const int[:, ::1] delay_ids,        # IN
-    const int[::1] output_ids,          # IN
-    double[:, ::1] y_hat,               # OUT
+    const double[:, ::1] X,                 # IN
+    const double[:, ::1] y_ref,             # IN
+    const double[::1] coef,                 # IN
+    const double[::1] intercept,            # IN
+    const int[:, ::1] feat_ids,             # IN
+    const int[:, ::1] delay_ids,            # IN
+    const int[::1] output_ids,              # IN
+    const int[::1] session_sizes_cumsum,    # IN
+    double[:, ::1] y_hat,                   # OUT
 ) noexcept nogil:
     """
     Vectorized (Cython) variant of Python NARX._predict.
@@ -142,11 +143,15 @@ cpdef void _predict(
     cdef:
         Py_ssize_t n_samples = X.shape[0]
         Py_ssize_t n_ref = y_ref.shape[0]
-        Py_ssize_t k
+        Py_ssize_t k, s = 0
         Py_ssize_t init_k = 0
         bint at_init = True
 
     for k in range(n_samples):
+        if k == session_sizes_cumsum[s]:
+            s += 1
+            at_init = True
+            init_k = k
         with gil:
             if not np.all(np.isfinite(X[k])):
                 at_init = True
@@ -192,8 +197,9 @@ cpdef void _update_dydx(
     const int[:, ::1] grad_delay_ids,
     const int[::1] grad_coef_ids,
     const int[:, ::1] grad_feat_ids,
-    double[:, :, ::1] dydx,       # OUT
-    double[:, :, ::1] dcf,        # OUT
+    const int[::1] session_sizes_cumsum,
+    double[:, :, ::1] dydx,                 # OUT
+    double[:, :, ::1] dcf,                  # OUT
 ) noexcept nogil:
     """
     Computation of the Jacobian matrix dydx.
@@ -211,7 +217,7 @@ cpdef void _update_dydx(
     cdef:
         Py_ssize_t n_samples = y_hat.shape[0]
         Py_ssize_t n_coefs = feat_ids.shape[0]
-        Py_ssize_t k, i, d
+        Py_ssize_t k, i, d, s = 0
         Py_ssize_t M = dcf.shape[1]      # n_outputs
         Py_ssize_t N = dydx.shape[2]     # n_x
         Py_ssize_t init_k = 0
@@ -224,6 +230,11 @@ cpdef void _update_dydx(
         terms_intercepts[i] = 1.0
 
     for k in range(n_samples):
+        if k == session_sizes_cumsum[s]:
+            s += 1
+            at_init = True
+            init_k = k
+            continue
         with gil:
             is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k]))
         if not is_finite:
diff --git a/fastcan/narx/_utils.py b/fastcan/narx/_utils.py
index 96def3b..9f86ba9 100644
--- a/fastcan/narx/_utils.py
+++ b/fastcan/narx/_utils.py
@@ -18,11 +18,9 @@
 from .._fastcan import FastCan
 from .._refine import refine
 from ..utils import mask_missing_values
-from ._base import NARX
+from ._base import NARX, _prepare_poly_terms, _validate_session_sizes
 from ._feature import (
-    make_poly_features,
     make_poly_ids,
-    make_time_shift_features,
     make_time_shift_ids,
     tp2fd,
 )
@@ -136,6 +134,7 @@ def _get_term_str(term_feat_ids, term_delay_ids):
             Interval(Integral, 1, None, closed="left"),
         ],
         "fit_intercept": ["boolean"],
+        "session_sizes": [None, "array-like"],
         "max_candidates": [None, Interval(Integral, 1, None, closed="left")],
         "random_state": ["random_state"],
         "include_zero_delay": [None, "array-like"],
@@ -161,6 +160,7 @@ def make_narx(
     poly_degree=1,
     *,
     fit_intercept=True,
+    session_sizes=None,
     max_candidates=None,
     random_state=None,
     include_zero_delay=None,
@@ -194,6 +194,13 @@ def make_narx(
     fit_intercept : bool, default=True
         Whether to fit the intercept. If set to False, intercept will be zeros.
 
+    session_sizes : array-like of shape (n_sessions,), default=None
+        The sizes of measurement sessions for time-series.
+        The sum of session_sizes should be equal to n_samples.
+        If None, the whole data is treated as one session.
+
+        .. versionadded:: 0.5
+
     max_candidates : int, default=None
         Maximum number of candidate polynomial terms retained before selection.
         Randomly selected by reservoir sampling.
@@ -284,7 +291,7 @@ def make_narx(
         check_consistent_length(X, y)
     if y.ndim == 1:
         y = y.reshape(-1, 1)
-    n_outputs = y.shape[1]
+    n_samples, n_outputs = y.shape
     if isinstance(n_terms_to_select, Integral):
         n_terms_to_select = np.full(n_outputs, n_terms_to_select, dtype=int)
     else:
@@ -295,6 +302,7 @@ def make_narx(
                 f"the number of outputs, {n_outputs}, but got "
                 f"{len(n_terms_to_select)}."
             )
+    session_sizes_cumsum = _validate_session_sizes(session_sizes, n_samples)
 
     xy_hstack = np.c_[X, y]
     n_features = X.shape[1]
@@ -318,16 +326,20 @@ def make_narx(
         ),
         0,
     )
-    time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids_all)
-
     poly_ids_all = make_poly_ids(
         time_shift_ids_all.shape[0],
         poly_degree,
         max_poly=max_candidates,
         random_state=random_state,
     )
-    poly_terms = make_poly_features(time_shift_vars, poly_ids_all)
 
+    poly_terms = _prepare_poly_terms(
+        xy_hstack,
+        time_shift_ids_all,
+        poly_ids_all,
+        session_sizes_cumsum,
+        max_delay,
+    )
     # Remove missing values
     poly_terms_masked, y_masked = mask_missing_values(poly_terms, y)
 
diff --git a/fastcan/narx/tests/test_narx.py b/fastcan/narx/tests/test_narx.py
index ef3aa2a..5547c9e 100644
--- a/fastcan/narx/tests/test_narx.py
+++ b/fastcan/narx/tests/test_narx.py
@@ -727,3 +727,88 @@ def test_predict_ndim():
     model.fit(X, y.reshape(-1, 1))
     y_hat = model.predict(X, y_init=y[: model.max_delay_])
     assert y_hat.ndim == 2
+
+
+def test_session_sizes():
+    """Test that session sizes in make_narx and NARX.fit and error messages."""
+    X = np.random.rand(100, 2)
+    y = np.random.rand(100, 1)
+    with pytest.raises(ValueError, match=r"The sum of session_sizes should be.*"):
+        model = make_narx(
+            X,
+            y,
+            n_terms_to_select=5,
+            max_delay=3,
+            poly_degree=2,
+            verbose=0,
+            session_sizes=[10, 20],
+        )
+    with pytest.raises(
+        ValueError, match=r"All elements of session_sizes should be positive.*"
+    ):
+        model = make_narx(
+            X,
+            y,
+            n_terms_to_select=5,
+            max_delay=3,
+            poly_degree=2,
+            verbose=0,
+        )
+        model.fit(X, y, session_sizes=[-10, 110])
+    model = make_narx(
+        X,
+        y,
+        n_terms_to_select=5,
+        max_delay=3,
+        poly_degree=2,
+        verbose=0,
+    )
+    session_sizes = [20, 30, 50]
+    session_sizes_cumsum = np.cumsum(session_sizes)
+    model.fit(X, y, session_sizes=session_sizes)
+    coef_session = model.coef_
+    X_missing = X[: session_sizes_cumsum[0]]
+    y_missing = y[: session_sizes_cumsum[0]]
+    for i in range(len(session_sizes) - 1):
+        X_missing = np.r_[
+            X_missing,
+            [[np.nan, np.nan]] * model.max_delay_,
+            X[session_sizes_cumsum[i] : session_sizes_cumsum[i + 1]],
+        ]
+        y_missing = np.r_[
+            y_missing,
+            [[np.nan]] * model.max_delay_,
+            y[session_sizes_cumsum[i] : session_sizes_cumsum[i + 1]],
+        ]
+    coef_missing = model.fit(X_missing, y_missing).coef_
+    # Exactly same for one-step-ahead fitting
+    assert np.sum(np.abs(coef_session - coef_missing)) == 0
+    coef_session_msa = model.fit(
+        X, y, session_sizes=session_sizes, coef_init="one_step_ahead"
+    ).coef_
+    coef_missing_msa = model.fit(X_missing, y_missing, coef_init="one_step_ahead").coef_
+    # Exactly same for multi-step-ahead fitting
+    assert np.sum(np.abs(coef_session_msa - coef_missing_msa)) == 0
+
+    # Auto-regression with session sizes
+    arm = make_narx(
+        X=None,
+        y=y,
+        n_terms_to_select=5,
+        max_delay=3,
+        poly_degree=2,
+        verbose=0,
+    )
+    arm.fit(X=None, y=y, session_sizes=session_sizes)
+    ar_coef_session = arm.coef_
+    arm.fit(X=None, y=y_missing)
+    ar_coef_missing = arm.coef_
+    # Exactly same for one-step-ahead fitting
+    assert np.sum(np.abs(ar_coef_session - ar_coef_missing)) == 0
+
+    arm.fit(X=None, y=y, session_sizes=session_sizes, coef_init="one_step_ahead")
+    ar_coef_session_msa = arm.coef_
+    arm.fit(X=None, y=y_missing, coef_init="one_step_ahead")
+    ar_coef_missing_msa = arm.coef_
+    # Not same for multi-step-ahead fitting
+    assert np.sum(np.abs(ar_coef_session_msa - ar_coef_missing_msa)) != 0
diff --git a/fastcan/narx/tests/test_narx_jac.py b/fastcan/narx/tests/test_narx_jac.py
index 87bc873..2c2ca1e 100644
--- a/fastcan/narx/tests/test_narx_jac.py
+++ b/fastcan/narx/tests/test_narx_jac.py
@@ -36,6 +36,7 @@ def test_simple():
         delay_ids=delay_ids,
         output_ids=output_ids,
         y_hat=y_hat,
+        session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
     )
 
     assert_array_equal(y_hat, y)
@@ -53,6 +54,7 @@ def test_simple():
         delay_ids=delay_ids,
         output_ids=output_ids,
         y_hat=y_hat_1,
+        session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
     )
 
     grad_truth = np.array(
@@ -79,6 +81,7 @@ def test_simple():
         output_ids,
         True,
         np.sqrt(sample_weight),
+        np.array([len(y)], dtype=np.int32),
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -95,6 +98,7 @@ def test_simple():
         output_ids,
         True,
         np.sqrt(sample_weight),
+        np.array([len(y)], dtype=np.int32),
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -109,6 +113,7 @@ def test_simple():
         output_ids,
         False,
         np.sqrt(sample_weight),
+        np.array([len(y)], dtype=np.int32),
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -209,6 +214,7 @@ def test_complex():
         output_ids,
         True,
         np.sqrt(np.ones((y.shape[0], 1))),
+        np.array([len(y)], dtype=np.int32),
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -226,6 +232,7 @@ def test_complex():
         delay_ids=delay_ids,
         output_ids=output_ids,
         y_hat=y_hat_0,
+        session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
     )
     e_0 = y_hat_0 - y
 
@@ -250,6 +257,7 @@ def test_complex():
             delay_ids=delay_ids,
             output_ids=output_ids,
             y_hat=y_hat_1,
+            session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
         )
 
         e_1 = y_hat_1 - y
@@ -266,6 +274,7 @@ def test_complex():
         output_ids,
         False,
         np.sqrt(np.ones((y.shape[0], 1))),
+        np.array([len(y)], dtype=np.int32),
         grad_yyd_ids,
         grad_delay_ids,
         grad_coef_ids,
@@ -281,6 +290,7 @@ def test_complex():
         delay_ids=delay_ids,
         output_ids=output_ids,
         y_hat=y_hat_0,
+        session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
     )
     e_0 = y_hat_0 - y
 
@@ -298,6 +308,7 @@ def test_complex():
             delay_ids=delay_ids,
             output_ids=output_ids,
             y_hat=y_hat_1,
+            session_sizes_cumsum=np.array([len(y)], dtype=np.int32),
         )
 
         e_1 = y_hat_1 - y