diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py
index 59d4080..fe95b83 100644
--- a/fastcan/narx/_base.py
+++ b/fastcan/narx/_base.py
@@ -13,6 +13,7 @@
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.linear_model import LinearRegression
 from sklearn.utils import check_array, check_consistent_length, column_or_1d
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._param_validation import Interval, StrOptions, validate_params
 from sklearn.utils.validation import (
     _check_sample_weight,
@@ -29,9 +30,8 @@
     make_time_shift_features,
 )
 from ._narx_fast import (  # type: ignore[attr-defined]
-    _predict_step,
-    _update_cfd,
-    _update_terms,
+    _predict,
+    _update_dydx,
 )
 
 
@@ -341,7 +341,7 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
                     f"({(n_coef_intercept,)}), but got {coef_init.shape}."
                 )
 
-        grad_yyd_ids, grad_coef_ids, grad_feat_ids, grad_delay_ids = NARX._get_cfd_ids(
+        grad_yyd_ids, grad_delay_ids, grad_coef_ids, grad_feat_ids = NARX._get_dcf_ids(
             self.feat_ids_, self.delay_ids_, self.output_ids_, X.shape[1]
         )
         sample_weight_sqrt = np.sqrt(sample_weight).reshape(-1, 1)
@@ -350,7 +350,6 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
             x0=coef_init,
             jac=NARX._grad,
             args=(
-                _predict_step,
                 X,
                 y,
                 self.feat_ids_,
@@ -359,9 +358,9 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
                 self.fit_intercept,
                 sample_weight_sqrt,
                 grad_yyd_ids,
+                grad_delay_ids,
                 grad_coef_ids,
                 grad_feat_ids,
-                grad_delay_ids,
             ),
             **params,
         )
@@ -374,64 +373,17 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
         return self
 
     @staticmethod
-    def _predict(
-        expression,
-        X,
-        y_ref,
-        coef,
-        intercept,
-        feat_ids,
-        delay_ids,
-        output_ids,
-    ):
-        n_samples = X.shape[0]
-        n_ref, n_outputs = y_ref.shape
-        max_delay = np.max(delay_ids)
-        y_hat = np.zeros((n_samples, n_outputs), dtype=float)
-        at_init = True
-        init_k = 0
-        for k in range(n_samples):
-            if not np.all(np.isfinite(X[k])):
-                at_init = True
-                init_k = k + 1
-                y_hat[k] = np.nan
-                continue
-            if k - init_k == max_delay:
-                at_init = False
-
-            if at_init:
-                y_hat[k] = y_ref[k % n_ref]
-                if not np.all(np.isfinite(y_hat[k])):
-                    at_init = True
-                    init_k = k + 1
-            else:
-                y_hat[k] = intercept
-                expression(
-                    X,
-                    y_hat,
-                    y_hat[k],
-                    coef,
-                    feat_ids,
-                    delay_ids,
-                    output_ids,
-                    k,
-                )
-            if np.any(y_hat[k] > 1e20):
-                y_hat[k:] = 1e20
-                return y_hat
-        return y_hat
-
-    @staticmethod
-    def _get_cfd_ids(feat_ids, delay_ids, output_ids, n_features_in):
+    def _get_dcf_ids(feat_ids, delay_ids, output_ids, n_features_in):
         """
-        Get ids of CFD (Coef, Feature, and Delay) matrix to update dyn(k)/dx.
+        Get ids of DCF (Delay, Coef, and Feature) matrix to update dyn(k)/dx.
         Maps coefficients to their corresponding features and delays.
         """
 
-        # Initialize cfd_ids as a list of lists n_outputs *  n_outputs * max_delay
-        # axis-0 (i): [dy0(k)/dx, dy1(k)/dx, ..., dyn(k)/dx]
-        # axis-1 (j): [dy0(k-d)/dx, dy1(k-d)/dx, ..., dyn(k-d)/dx]
-        # axis-2 (d): [dyj(k-1)/dx, dyj(k-2)/dx, ..., dyj(k-max_delay)/dx]
+        # Initialize dcf_ids as a list of lists max_delay * n_outputs * n_outputs
+        # axis-0 (d): [dyj(k-1)/dx, dyj(k-2)/dx, ..., dyj(k-max_delay)/dx]
+        # axis-1 (i): [dy0(k)/dx, dy1(k)/dx, ..., dyn(k)/dx]
+        # axis-2 (j): [dy0(k-d)/dx, dy1(k-d)/dx, ..., dyn(k-d)/dx]
+
         grad_yyd_ids = []
         grad_coef_ids = []
         grad_feat_ids = []
@@ -447,106 +399,20 @@ def _get_cfd_ids(feat_ids, delay_ids, output_ids, n_features_in):
                 if feat_id >= n_features_in and delay_id > 0:
                     col_y_id = feat_id - n_features_in  # y(k-1, id)
                     grad_yyd_ids.append([row_y_id, col_y_id, delay_id - 1])
+                    grad_delay_ids.append(np.delete(term_delay_ids, var_id))
                     grad_coef_ids.append(coef_id)
                     grad_feat_ids.append(np.delete(term_feat_ids, var_id))
-                    grad_delay_ids.append(np.delete(term_delay_ids, var_id))
 
         return (
-            np.array(grad_yyd_ids, dtype=np.int32),
+            np.array(grad_yyd_ids, dtype=np.int32, ndmin=2),
+            np.array(grad_delay_ids, dtype=np.int32, ndmin=2),
             np.array(grad_coef_ids, dtype=np.int32),
-            np.array(grad_feat_ids, dtype=np.int32),
-            np.array(grad_delay_ids, dtype=np.int32),
+            np.array(grad_feat_ids, dtype=np.int32, ndmin=2),
         )
 
-    @staticmethod
-    def _update_dydx(
-        X,
-        y_hat,
-        coef,
-        feat_ids,
-        delay_ids,
-        output_ids,
-        fit_intercept,
-        grad_yyd_ids,
-        grad_coef_ids,
-        grad_feat_ids,
-        grad_delay_ids,
-    ):
-        """
-        Computation of the Jacobian matrix dydx.
-
-        Returns
-        -------
-        dydx : ndarray of shape (n_samples, n_outputs, n_x)
-            Jacobian matrix of the outputs with respect to coefficients and intercepts.
-        """
-        n_samples, n_y = y_hat.shape
-        max_delay = np.max(delay_ids)
-        n_coefs = feat_ids.shape[0]
-        if fit_intercept:
-            n_x = n_coefs + n_y  # Total number of coefficients and intercepts
-            y_ids = np.r_[output_ids, np.arange(n_y)]
-        else:
-            n_x = n_coefs
-            y_ids = output_ids
-
-        x_ids = np.arange(n_x)
-
-        dydx = np.zeros((n_samples, n_y, n_x), dtype=float)
-        at_init = True
-        init_k = 0
-        for k in range(n_samples):
-            if not (np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k]))):
-                at_init = True
-                init_k = k + 1
-                continue
-            if k - init_k == max_delay:
-                at_init = False
-
-            if at_init:
-                continue
-            # Compute terms for time step k
-            terms = np.ones(n_x, dtype=float)
-            _update_terms(
-                X,
-                y_hat,
-                terms,
-                feat_ids,
-                delay_ids,
-                k,
-            )
-
-            # Update constant terms of Jacobian
-            dydx[k, y_ids, x_ids] = terms
-
-            # Update dynamic terms of Jacobian
-            if max_delay > 0 and grad_yyd_ids.size > 0:
-                cfd = np.zeros((n_y, n_y, max_delay), dtype=float)
-                _update_cfd(
-                    X,
-                    y_hat,
-                    cfd,
-                    coef,
-                    grad_yyd_ids,
-                    grad_coef_ids,
-                    grad_feat_ids,
-                    grad_delay_ids,
-                    k,
-                )
-                for d in range(max_delay):
-                    dydx[k] += cfd[:, :, d] @ dydx[k - d - 1]
-
-            # Handle divergence
-            if np.any(dydx[k] > 1e20):
-                dydx[k:] = 1e20
-                break
-
-        return dydx
-
     @staticmethod
     def _loss(
         coef_intercept,
-        expression,
         X,
         y,
         feat_ids,
@@ -557,16 +423,16 @@ def _loss(
         *args,
     ):
         # Sum of squared errors
-        n_outputs = y.shape[1]
+        n_samples, n_y = y.shape
         if fit_intercept:
-            coef = coef_intercept[:-n_outputs]
-            intercept = coef_intercept[-n_outputs:]
+            coef = coef_intercept[:-n_y]
+            intercept = coef_intercept[-n_y:]
         else:
             coef = coef_intercept
-            intercept = np.zeros(n_outputs, dtype=float)
+            intercept = np.zeros(n_y, dtype=float)
 
-        y_hat = NARX._predict(
-            expression,
+        y_hat = np.zeros((n_samples, n_y), dtype=float)
+        _predict(
             X,
             y,
             coef,
@@ -574,6 +440,7 @@ def _loss(
             feat_ids,
             delay_ids,
             output_ids,
+            y_hat,
         )
 
         y_masked, y_hat_masked, sample_weight_sqrt_masked = mask_missing_values(
@@ -585,7 +452,6 @@ def _loss(
     @staticmethod
     def _grad(
         coef_intercept,
-        expression,
         X,
         y,
         feat_ids,
@@ -594,12 +460,12 @@ def _grad(
         fit_intercept,
         sample_weight_sqrt,
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     ):
         # Sum of squared errors
-        n_outputs = y.shape[1]
+        n_samples, n_outputs = y.shape
         if fit_intercept:
             coef = coef_intercept[:-n_outputs]
             intercept = coef_intercept[-n_outputs:]
@@ -607,8 +473,8 @@ def _grad(
             coef = coef_intercept
             intercept = np.zeros(n_outputs, dtype=float)
 
-        y_hat = NARX._predict(
-            expression,
+        y_hat = np.zeros((n_samples, n_outputs), dtype=float)
+        _predict(
             X,
             y,
             coef,
@@ -616,19 +482,35 @@ def _grad(
             feat_ids,
             delay_ids,
             output_ids,
+            y_hat,
         )
-        dydx = NARX._update_dydx(
+
+        max_delay = np.max(delay_ids)
+        n_coefs = feat_ids.shape[0]
+        if fit_intercept:
+            n_x = n_coefs + n_outputs  # Total number of coefficients and intercepts
+            y_ids = np.r_[output_ids, np.arange(n_outputs, dtype=np.int32)]
+        else:
+            n_x = n_coefs
+            y_ids = output_ids
+        dydx = np.zeros((n_samples, n_outputs, n_x), dtype=float)
+        dcf = np.zeros((max_delay, n_outputs, n_outputs), dtype=float)
+        n_threads = _openmp_effective_n_threads()
+
+        _update_dydx(
             X,
             y_hat,
             coef,
             feat_ids,
             delay_ids,
-            output_ids,
-            fit_intercept,
+            y_ids,
             grad_yyd_ids,
+            grad_delay_ids,
             grad_coef_ids,
             grad_feat_ids,
-            grad_delay_ids,
+            dydx,
+            dcf,
+            n_threads,
         )
 
         mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)
@@ -702,8 +584,8 @@ def predict(self, X, y_init=None):
                     f"`y_init` should have {self.n_outputs_} outputs "
                     f"but got {y_init.shape[1]}."
                 )
-        y_hat = NARX._predict(
-            _predict_step,
+        y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=float)
+        _predict(
             X,
             y_init,
             self.coef_,
@@ -711,6 +593,7 @@ def predict(self, X, y_init=None):
             self.feat_ids_,
             self.delay_ids_,
             self.output_ids_,
+            y_hat,
         )
         if self.n_outputs_ == 1:
             y_hat = y_hat.flatten()
diff --git a/fastcan/narx/_narx_fast.pyx b/fastcan/narx/_narx_fast.pyx
index 5543f0b..0555ff3 100644
--- a/fastcan/narx/_narx_fast.pyx
+++ b/fastcan/narx/_narx_fast.pyx
@@ -4,24 +4,30 @@ Fast computation of prediction and gradient for narx.
 # Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
-from cython cimport floating, final
+from libc.stdlib cimport malloc, free
+from libc.string cimport memset
+from cython cimport final
+import numpy as np
+from sklearn.utils._cython_blas cimport RowMajor, NoTrans
+from sklearn.utils._cython_blas cimport _gemm
 
 
 @final
-cpdef void _update_terms(
-    const floating[:, ::1] X,       # IN
-    const floating[:, ::1] y_hat,   # IN
-    floating[::1] terms,            # OUT
+cdef inline void _update_terms(
+    const double[:, ::1] X,       # IN
+    const double[:, ::1] y_hat,   # IN
+    double* terms,                # OUT
     const int[:, ::1] feat_ids,     # IN
     const int[:, ::1] delay_ids,    # IN
     const int k,                    # IN
+    const int num_threads,          # IN
 ) noexcept nogil:
     """
     Evaluate all terms for the given features and delays at timestep k.
     """
     cdef:
-        int i
-        int n_coefs = feat_ids.shape[0]
+        Py_ssize_t i
+        Py_ssize_t n_coefs = feat_ids.shape[0]
 
     for i in range(n_coefs):
         terms[i] = _evaluate_term(
@@ -30,11 +36,11 @@ cpdef void _update_terms(
 
 
 @final
-cpdef void _predict_step(
-    const floating[:, ::1] X,       # IN
-    const floating[:, ::1] y_hat,   # IN
-    floating[::1] y_pred,           # OUT
-    const floating[::1] coef,       # IN
+cdef inline void _predict_step(
+    const double[:, ::1] X,       # IN
+    const double[:, ::1] y_hat,   # IN
+    double[::1] y_pred,           # OUT
+    const double[::1] coef,       # IN
     const int[:, ::1] feat_ids,     # IN
     const int[:, ::1] delay_ids,    # IN
     const int[::1] output_ids,      # IN
@@ -44,11 +50,11 @@ cpdef void _predict_step(
     Evaluate the expression for all outputs at timestep k.
     """
     cdef:
-        int n_terms = feat_ids.shape[0]
-        int i, output_i
+        Py_ssize_t n_coefs = feat_ids.shape[0]
+        Py_ssize_t i, output_i
 
     # Add all terms
-    for i in range(n_terms):
+    for i in range(n_coefs):
         output_i = output_ids[i]
         y_pred[output_i] += coef[i] * _evaluate_term(
             X, y_hat, feat_ids[i], delay_ids[i], k
@@ -56,23 +62,23 @@ cpdef void _predict_step(
 
 
 @final
-cdef floating _evaluate_term(
-    const floating[:, ::1] X,       # IN
-    const floating[:, ::1] y_hat,   # IN
-    const int[::1] feat_ids,        # IN
-    const int[::1] delay_ids,       # IN
+cdef inline double _evaluate_term(
+    const double[:, ::1] X,       # IN
+    const double[:, ::1] y_hat,   # IN
+    const int[::1] feat_ids,    # IN
+    const int[::1] delay_ids,   # IN
     const int k,                    # IN
 ) noexcept nogil:
     """
     Evaluate a term based on feature and delay IDs.
     """
     cdef:
-        int n_feats = X.shape[1]
-        int n_vars = feat_ids.shape[0]
-        floating term = 1.0
-        int i, feat_id
+        Py_ssize_t n_feats = X.shape[1]
+        Py_ssize_t n_coefs = feat_ids.shape[0]
+        double term = 1.0
+        Py_ssize_t i, feat_id
 
-    for i in range(n_vars):
+    for i in range(n_coefs):
         feat_id = feat_ids[i]
         if feat_id != -1:
             if feat_id < n_feats:
@@ -84,30 +90,193 @@ cdef floating _evaluate_term(
 
 
 @final
-cpdef void _update_cfd(
-    const floating[:, ::1] X,           # IN
-    const floating[:, ::1] y_hat,       # IN
-    floating[:, :, ::1] cfd,            # OUT
-    const floating[::1] coef,           # IN
+cdef inline void _update_dcf(
+    const double[:, ::1] X,           # IN
+    const double[:, ::1] y_hat,       # IN
+    double[:, :, ::1] dcf,            # OUT
+    const double[::1] coef,           # IN
     const int[:, ::1] grad_yyd_ids,     # IN
+    const int[:, ::1] grad_delay_ids,   # IN
     const int[::1] grad_coef_ids,       # IN
     const int[:, ::1] grad_feat_ids,    # IN
-    const int[:, ::1] grad_delay_ids,   # IN
     const int k,                        # IN
 ) noexcept nogil:
     """
-    Updates CFD matrix based on the current state.
+    Updates DCF matrix based on the current state.
     """
     cdef:
-        int n_grad_terms = grad_yyd_ids.shape[0]
-        int i, row_y_id, col_y_id, delay_id_1
+        Py_ssize_t n_grad_terms = grad_yyd_ids.shape[0]
+        Py_ssize_t i, row_y_id, col_y_id, delay_id_1
+
+    memset(&dcf[0, 0, 0], 0, dcf.shape[0] * dcf.shape[1] * dcf.shape[2] * sizeof(double))
 
     for i in range(n_grad_terms):
         row_y_id = grad_yyd_ids[i, 0]
         col_y_id = grad_yyd_ids[i, 1]
         delay_id_1 = grad_yyd_ids[i, 2]
 
-        cfd[row_y_id, col_y_id, delay_id_1] += coef[grad_coef_ids[i]] * \
+        dcf[delay_id_1, row_y_id, col_y_id] += coef[grad_coef_ids[i]] * \
             _evaluate_term(
                 X, y_hat, grad_feat_ids[i], grad_delay_ids[i], k
             )
+
+
+@final
+cpdef void _predict(
+    const double[:, ::1] X,             # IN
+    const double[:, ::1] y_ref,         # IN
+    const double[::1] coef,             # IN
+    const double[::1] intercept,        # IN
+    const int[:, ::1] feat_ids,         # IN
+    const int[:, ::1] delay_ids,        # IN
+    const int[::1] output_ids,          # IN
+    double[:, ::1] y_hat,               # OUT
+) noexcept nogil:
+    """
+    Vectorized (Cython) variant of Python NARX._predict.
+    Returns y_hat array (n_samples, n_outputs).
+    """
+    cdef:
+        Py_ssize_t max_delay
+    with gil:
+        max_delay = np.max(delay_ids)
+    cdef:
+        Py_ssize_t n_samples = X.shape[0]
+        Py_ssize_t n_ref = y_ref.shape[0]
+        Py_ssize_t k
+        Py_ssize_t init_k = 0
+        bint at_init = True
+
+    for k in range(n_samples):
+        with gil:
+            if not np.all(np.isfinite(X[k])):
+                at_init = True
+                init_k = k + 1
+                y_hat[k] = np.nan
+                continue
+        if k - init_k == max_delay:
+            at_init = False
+
+        if at_init:
+            y_hat[k] = y_ref[k % n_ref]
+            with gil:
+                if not np.all(np.isfinite(y_hat[k])):
+                    at_init = True
+                    init_k = k + 1
+        else:
+            y_hat[k] = intercept
+            _predict_step(
+                X,
+                y_hat,
+                y_hat[k],
+                coef,
+                feat_ids,
+                delay_ids,
+                output_ids,
+                k,
+            )
+        # Handle divergence
+        with gil:
+            if np.max(np.abs(y_hat[k])) > 1e20:
+                break
+
+
+@final
+cpdef void _update_dydx(
+    const double[:, ::1] X,
+    const double[:, ::1] y_hat,
+    const double[::1] coef,
+    const int[:, ::1] feat_ids,
+    const int[:, ::1] delay_ids,
+    const int[::1] y_ids,
+    const int[:, ::1] grad_yyd_ids,
+    const int[:, ::1] grad_delay_ids,
+    const int[::1] grad_coef_ids,
+    const int[:, ::1] grad_feat_ids,
+    double[:, :, ::1] dydx,       # OUT
+    double[:, :, ::1] dcf,        # OUT
+    const int num_threads,          # IN
+) noexcept nogil:
+    """
+    Computation of the Jacobian matrix dydx.
+
+    Returns
+    -------
+    dydx : ndarray of shape (n_samples, n_outputs, n_x)
+        Jacobian matrix of the outputs with respect to coefficients and intercepts.
+    """
+    cdef Py_ssize_t max_delay
+    cdef bint not_empty
+    with gil:
+        max_delay = np.max(delay_ids)
+        not_empty = max_delay > 0 and grad_yyd_ids.size > 0
+    cdef:
+        Py_ssize_t n_samples = y_hat.shape[0]
+        Py_ssize_t n_coefs = feat_ids.shape[0]
+        Py_ssize_t k, i, d
+        Py_ssize_t M = dcf.shape[1]      # n_outputs
+        Py_ssize_t N = dydx.shape[2]     # n_x
+        Py_ssize_t init_k = 0
+        bint at_init = True
+        bint is_finite
+        double* terms_intercepts = <double*> malloc(sizeof(double) * N)
+
+    # Set intercepts
+    for i in range(n_coefs, N):
+        terms_intercepts[i] = 1.0
+
+    for k in range(n_samples):
+        with gil:
+            is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k]))
+        if not is_finite:
+            at_init = True
+            init_k = k + 1
+            continue
+        if k - init_k == max_delay:
+            at_init = False
+
+        if at_init:
+            continue
+        # Compute terms for time step k (no effect on intercepts)
+        _update_terms(
+            X,
+            y_hat,
+            terms_intercepts,
+            feat_ids,
+            delay_ids,
+            k,
+            num_threads,
+        )
+
+        # Update constant terms of Jacobian
+        for i in range(N):
+            dydx[k, y_ids[i], i] = terms_intercepts[i]
+
+        # Update dynamic terms of Jacobian
+        if not_empty:
+            _update_dcf(
+                X,
+                y_hat,
+                dcf,
+                coef,
+                grad_yyd_ids,
+                grad_delay_ids,
+                grad_coef_ids,
+                grad_feat_ids,
+                k,
+            )
+            for d in range(max_delay):
+                # dydx[k] += dcf[d] @ dydx[k-d-1]
+                # dcf[d] (M,M), dydx[k] (M,N)
+                _gemm(
+                    RowMajor, NoTrans, NoTrans,
+                    M, N, M,
+                    1.0, &dcf[d, 0, 0], M, &dydx[k-d-1, 0, 0], N,
+                    1.0, &dydx[k, 0, 0], N,
+                )
+
+        # Handle divergence
+        with gil:
+            if np.max(np.abs(dydx[k])) > 1e20:
+                break
+    free(terms_intercepts)
diff --git a/fastcan/narx/tests/test_narx.py b/fastcan/narx/tests/test_narx.py
index 30be481..a54cea8 100644
--- a/fastcan/narx/tests/test_narx.py
+++ b/fastcan/narx/tests/test_narx.py
@@ -453,7 +453,8 @@ def test_divergence():
     narx = make_narx(X, y, 3, 3, 2)
     narx.fit(X, y, coef_init=[-10, 0, 0, 0])
     y_hat = narx.predict(X, y)
-    assert np.all(y_hat <= 1e20)
+    div_idx = np.where(np.abs(y_hat) > 1e20)[0][0] + 1
+    assert np.all(y_hat[div_idx:] == 0)
 
 
 def test_tp2fd():
diff --git a/fastcan/narx/tests/test_narx_jac.py b/fastcan/narx/tests/test_narx_jac.py
index 91c81d3..87bc873 100644
--- a/fastcan/narx/tests/test_narx_jac.py
+++ b/fastcan/narx/tests/test_narx_jac.py
@@ -6,7 +6,7 @@
 from sklearn.metrics import r2_score
 
 from fastcan.narx import NARX, make_narx
-from fastcan.narx._narx_fast import _predict_step  # type: ignore[attr-defined]
+from fastcan.narx._narx_fast import _predict  # type: ignore[attr-defined]
 
 
 def test_simple():
@@ -26,8 +26,8 @@ def test_simple():
     intercept = np.array([1], dtype=float)
     sample_weight = np.array([1, 1, 1], dtype=float).reshape(-1, 1)
 
-    y_hat = NARX._predict(
-        _predict_step,
+    y_hat = np.zeros_like(y, dtype=float)
+    _predict(
         X=X,
         y_ref=y,
         coef=coef,
@@ -35,6 +35,7 @@ def test_simple():
         feat_ids=feat_ids,
         delay_ids=delay_ids,
         output_ids=output_ids,
+        y_hat=y_hat,
     )
 
     assert_array_equal(y_hat, y)
@@ -42,8 +43,8 @@ def test_simple():
     delta_w = 0.00001
     coef_1 = np.array([0.4 + delta_w, 1])
 
-    y_hat_1 = NARX._predict(
-        _predict_step,
+    y_hat_1 = np.zeros_like(y, dtype=float)
+    _predict(
         X=X,
         y_ref=y,
         coef=coef_1,
@@ -51,6 +52,7 @@ def test_simple():
         feat_ids=feat_ids,
         delay_ids=delay_ids,
         output_ids=output_ids,
+        y_hat=y_hat_1,
     )
 
     grad_truth = np.array(
@@ -65,12 +67,11 @@ def test_simple():
         ]
     )
 
-    grad_yyd_ids, grad_coef_ids, grad_feat_ids, grad_delay_ids = NARX._get_cfd_ids(
+    grad_yyd_ids, grad_delay_ids, grad_coef_ids, grad_feat_ids = NARX._get_dcf_ids(
         feat_ids, delay_ids, output_ids, 1
     )
     grad = NARX._grad(
         np.r_[coef_1, intercept],
-        _predict_step,
         X,
         y,
         feat_ids,
@@ -79,15 +80,14 @@ def test_simple():
         True,
         np.sqrt(sample_weight),
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     )
 
     assert_almost_equal(grad.sum(axis=0), grad_truth, decimal=4)
     grad_0 = NARX._grad(
         np.r_[coef_1, 0],
-        _predict_step,
         X,
         y,
         feat_ids,
@@ -96,13 +96,12 @@ def test_simple():
         True,
         np.sqrt(sample_weight),
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     )
     grad = NARX._grad(
         coef_1,
-        _predict_step,
         X,
         y,
         feat_ids,
@@ -111,9 +110,9 @@ def test_simple():
         False,
         np.sqrt(sample_weight),
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     )
     assert_almost_equal(grad.sum(axis=0), grad_0.sum(axis=0)[:-1])
 
@@ -198,12 +197,11 @@ def test_complex():
     intercept = np.array([1, 0.5])
 
     # NARX Jacobian
-    grad_yyd_ids, grad_coef_ids, grad_feat_ids, grad_delay_ids = NARX._get_cfd_ids(
+    grad_yyd_ids, grad_delay_ids, grad_coef_ids, grad_feat_ids = NARX._get_dcf_ids(
         feat_ids, delay_ids, output_ids, X.shape[1]
     )
     grad = NARX._grad(
         np.r_[coef, intercept],
-        _predict_step,
         X,
         y,
         feat_ids,
@@ -212,14 +210,14 @@ def test_complex():
         True,
         np.sqrt(np.ones((y.shape[0], 1))),
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     )
 
     # Numerical gradient
-    y_hat_0 = NARX._predict(
-        _predict_step,
+    y_hat_0 = np.zeros_like(y, dtype=float)
+    _predict(
         X=X,
         y_ref=y,
         coef=coef,
@@ -227,6 +225,7 @@ def test_complex():
         feat_ids=feat_ids,
         delay_ids=delay_ids,
         output_ids=output_ids,
+        y_hat=y_hat_0,
     )
     e_0 = y_hat_0 - y
 
@@ -241,8 +240,8 @@ def test_complex():
             intercept_1 = np.copy(intercept)
             intercept_1[i - len(coef)] += delta_w
 
-        y_hat_1 = NARX._predict(
-            _predict_step,
+        y_hat_1 = np.zeros_like(y, dtype=float)
+        _predict(
             X=X,
             y_ref=y,
             coef=coef_1,
@@ -250,6 +249,7 @@ def test_complex():
             feat_ids=feat_ids,
             delay_ids=delay_ids,
             output_ids=output_ids,
+            y_hat=y_hat_1,
         )
 
         e_1 = y_hat_1 - y
@@ -259,7 +259,6 @@ def test_complex():
 
     grad = NARX._grad(
         coef,
-        _predict_step,
         X,
         y,
         feat_ids,
@@ -268,19 +267,20 @@ def test_complex():
         False,
         np.sqrt(np.ones((y.shape[0], 1))),
         grad_yyd_ids,
+        grad_delay_ids,
         grad_coef_ids,
         grad_feat_ids,
-        grad_delay_ids,
     )
-    y_hat_0 = NARX._predict(
-        _predict_step,
+    y_hat_0 = np.zeros_like(y, dtype=float)
+    _predict(
         X=X,
         y_ref=y,
         coef=coef,
-        intercept=[0, 0],
+        intercept=np.array([0, 0], dtype=float),
         feat_ids=feat_ids,
         delay_ids=delay_ids,
         output_ids=output_ids,
+        y_hat=y_hat_0,
     )
     e_0 = y_hat_0 - y
 
@@ -288,15 +288,16 @@ def test_complex():
         coef_1 = np.copy(coef)
         coef_1[i] += delta_w
 
-        y_hat_1 = NARX._predict(
-            _predict_step,
+        y_hat_1 = np.zeros_like(y, dtype=float)
+        _predict(
             X=X,
             y_ref=y,
             coef=coef_1,
-            intercept=[0, 0],
+            intercept=np.array([0, 0], dtype=float),
             feat_ids=feat_ids,
             delay_ids=delay_ids,
             output_ids=output_ids,
+            y_hat=y_hat_1,
         )
 
         e_1 = y_hat_1 - y
diff --git a/pixi.lock b/pixi.lock
index 1a4a263..5ae2194 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -6125,7 +6125,7 @@ packages:
 - pypi: ./
   name: fastcan
   version: 0.4.0
-  sha256: 335c7da2d4dbd3de173152f1fbabd574555bd732fce56793455418b723b01054
+  sha256: 2a742d953b335411ce634a3753dbbbb11d5db7c54fee7d51fec056f5b77c9bf1
   requires_dist:
   - scikit-learn>=1.7.0,!=1.7.1
   - furo ; extra == 'docs'
diff --git a/pyproject.toml b/pyproject.toml
index a68e298..b386d6b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,6 +116,7 @@ time-h = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import Fast
 time-eta = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(3000, 100); y = np.random.rand(3000, 20)' 's = FastCan(100, eta=True, verbose=0).fit(X, y)'"
 profile-minibatch = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan import minibatch; X = np.random.rand(100, 3000); y = np.random.rand(100, 20); cProfile.run('minibatch(X, y, 1000, 10, verbose=0)', sort='$SORT')"''', env = { SORT = "cumtime" } }
 time-narx = '''python -m timeit -n 1 -s "import numpy as np; from fastcan.narx import make_narx; rng = np.random.default_rng(5); X = rng.random((1000, 10)); y = rng.random((1000, 2)); m = make_narx(X, y, 10, max_delay=2, poly_degree=2, verbose=0)" "m.fit(X, y, coef_init='one_step_ahead', verbose=1)"'''
+profile-narx = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan.narx import make_narx; X = np.random.rand(3000, 3); y = np.random.rand(3000, 3); m = make_narx(X, y, 10, max_delay=10, poly_degree=2, verbose=0); cProfile.run('m.fit(X, y, coef_init=[0]*33)', sort='$SORT')"''', env = { SORT = "tottime" } }
 
 [tool.pixi.feature.asv.tasks]
 asv-build = { cmd = "python -m asv machine --machine $MACHINE --yes && python -m asv run --show-stderr -v --machine $MACHINE $EXTRA_ARGS", cwd = "asv_benchmarks", env = { MACHINE = "MacOS-M1", EXTRA_ARGS = "" } }