diff --git a/Project.toml b/Project.toml
index 2b6f5d51f..36cf721ef 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ModelPredictiveControl"
 uuid = "61f9bdb8-6ae4-484a-811f-bbf86720c31c"
 authors = ["Francis Gagnon"]
-version = "1.10.0"
+version = "1.11.0"
 
 [deps]
 ControlSystemsBase = "aaaaaaaa-a6ca-5380-bf3e-84a91bcd477e"
diff --git a/benchmark/3_bench_predictive_control.jl b/benchmark/3_bench_predictive_control.jl
index aa14b8591..8a624a654 100644
--- a/benchmark/3_bench_predictive_control.jl
+++ b/benchmark/3_bench_predictive_control.jl
@@ -264,12 +264,24 @@ nmpc_ipopt_ms = NonLinMPC(estim; Hp, Hc, Mwt, Nwt, Cwt, optim, transcription)
 nmpc_ipopt_ms = setconstraint!(nmpc_ipopt_ms; umin, umax)
 JuMP.unset_time_limit_sec(nmpc_ipopt_ms.optim)
 
+optim = JuMP.Model(optimizer_with_attributes(Ipopt.Optimizer,"sb"=>"yes"), add_bridges=false)
+transcription = MultipleShooting(f_threads=true)
+nmpc_ipopt_mst = NonLinMPC(estim; Hp, Hc, Mwt, Nwt, Cwt, optim, transcription)
+nmpc_ipopt_mst = setconstraint!(nmpc_ipopt_mst; umin, umax)
+JuMP.unset_time_limit_sec(nmpc_ipopt_mst.optim)
+
 optim = JuMP.Model(optimizer_with_attributes(Ipopt.Optimizer,"sb"=>"yes"), add_bridges=false)
 transcription = TrapezoidalCollocation()
 nmpc_ipopt_tc = NonLinMPC(estim; Hp, Hc, Mwt, Nwt, Cwt, optim, transcription)
 nmpc_ipopt_tc = setconstraint!(nmpc_ipopt_tc; umin, umax)
 JuMP.unset_time_limit_sec(nmpc_ipopt_tc.optim)
 
+optim = JuMP.Model(optimizer_with_attributes(Ipopt.Optimizer,"sb"=>"yes"), add_bridges=false)
+transcription = TrapezoidalCollocation(f_threads=true)
+nmpc_ipopt_tct = NonLinMPC(estim; Hp, Hc, Mwt, Nwt, Cwt, optim, transcription)
+nmpc_ipopt_tct = setconstraint!(nmpc_ipopt_tct; umin, umax)
+JuMP.unset_time_limit_sec(nmpc_ipopt_tct.optim)
+
 optim = JuMP.Model(MadNLP.Optimizer, add_bridges=false)
 transcription = SingleShooting()
 nmpc_madnlp_ss = NonLinMPC(estim; Hp, Hc, Mwt, Nwt, Cwt, optim, transcription)
@@ -301,11 +313,21 @@ CASE_MPC["Pendulum"]["NonLinMPC"]["Noneconomic"]["Ipopt"]["MultipleShooting"] =
         sim!($nmpc_ipopt_ms, $N, $ry; plant=$plant, x_0=$x_0, x̂_0=$x̂_0),
         samples=samples, evals=evals, seconds=seconds
     )
+CASE_MPC["Pendulum"]["NonLinMPC"]["Noneconomic"]["Ipopt"]["MultipleShooting (threaded)"] =
+    @benchmarkable(
+        sim!($nmpc_ipopt_mst, $N, $ry; plant=$plant, x_0=$x_0, x̂_0=$x̂_0),
+        samples=samples, evals=evals, seconds=seconds
+    )
 CASE_MPC["Pendulum"]["NonLinMPC"]["Noneconomic"]["Ipopt"]["TrapezoidalCollocation"] =
     @benchmarkable(
         sim!($nmpc_ipopt_tc, $N, $ry; plant=$plant, x_0=$x_0, x̂_0=$x̂_0),
         samples=samples, evals=evals, seconds=seconds
     )
+CASE_MPC["Pendulum"]["NonLinMPC"]["Noneconomic"]["Ipopt"]["TrapezoidalCollocation (threaded)"] =
+    @benchmarkable(
+        sim!($nmpc_ipopt_tct, $N, $ry; plant=$plant, x_0=$x_0, x̂_0=$x̂_0),
+        samples=samples, evals=evals, seconds=seconds
+    )
 CASE_MPC["Pendulum"]["NonLinMPC"]["Noneconomic"]["MadNLP"]["SingleShooting"] = 
     @benchmarkable(
         sim!($nmpc_madnlp_ss, $N, $ry; plant=$plant, x_0=$x_0, x̂_0=$x̂_0),
diff --git a/src/controller/transcription.jl b/src/controller/transcription.jl
index efd9055eb..1503dd078 100644
--- a/src/controller/transcription.jl
+++ b/src/controller/transcription.jl
@@ -31,7 +31,7 @@ plant model/constraints.
 struct SingleShooting <: ShootingMethod end
 
 @doc raw"""
-    MultipleShooting()
+    MultipleShooting(; f_threads=false, h_threads=false)
 
 Construct a direct multiple shooting [`TranscriptionMethod`](@ref).
 
@@ -56,24 +56,33 @@ for [`NonLinModel`](@ref).
 This transcription computes the predictions by calling the augmented discrete-time model
 in the equality constraint function recursively over ``H_p``, or by updating the linear
 equality constraint vector for [`LinModel`](@ref). It is generally more efficient for large
-control horizon ``H_c``, unstable or highly nonlinear plant models/constraints. 
+control horizon ``H_c``, unstable or highly nonlinear models/constraints. Multithreading
+with `threads_f` or `threads_h` keyword arguments can be advantageous if ``\mathbf{f}`` or 
+``\mathbf{h}`` in the [`NonLinModel`](@ref) is expensive to evaluate, respectively.
 
 Sparse optimizers like `OSQP` or `Ipopt` and sparse Jacobian computations are recommended
 for this transcription method.
 """
-struct MultipleShooting <: ShootingMethod end
+struct MultipleShooting <: ShootingMethod 
+    f_threads::Bool
+    h_threads::Bool
+    function MultipleShooting(; f_threads=false, h_threads=false)
+        return new(f_threads, h_threads)
+    end
+end
 
 @doc raw"""
-    TrapezoidalCollocation(h::Int=0)
+    TrapezoidalCollocation(h::Int=0; f_threads=false, h_threads=false)
 
 Construct an implicit trapezoidal [`TranscriptionMethod`](@ref) with `h`th order hold.
 
 This is the simplest collocation method. It supports continuous-time [`NonLinModel`](@ref)s
 only. The decision variables are the same as for [`MultipleShooting`](@ref), hence similar
-computational costs. The `h` argument is `0` or `1`, for piecewise constant or linear
-manipulated inputs ``\mathbf{u}`` (`h=1` is slightly less expensive). Note that the various
-[`DiffSolver`](@ref) here assume zero-order hold, so `h=1` will induce a plant-model 
-mismatch if the plant is simulated with these solvers.
+computational costs. See the same docstring for descriptions of `f_threads` and `h_threads`
+keywords. The `h` argument is `0` or `1`, for piecewise constant or linear manipulated 
+inputs ``\mathbf{u}`` (`h=1` is slightly less expensive). Note that the various [`DiffSolver`](@ref) 
+here assume zero-order hold, so `h=1` will induce a plant-model  mismatch if the plant is
+simulated with these solvers. 
 
 This transcription computes the predictions by calling the continuous-time model in the
 equality constraint function and by using the implicit trapezoidal rule. It can handle
@@ -96,12 +105,14 @@ transcription method.
 struct TrapezoidalCollocation <: CollocationMethod
     h::Int
     nc::Int
-    function TrapezoidalCollocation(h::Int=0)
+    f_threads::Bool
+    h_threads::Bool
+    function TrapezoidalCollocation(h::Int=0; f_threads=false, h_threads=false)
         if !(h == 0 || h == 1)
             throw(ArgumentError("h argument must be 0 or 1 for TrapezoidalCollocation."))
         end
         nc = 2 # 2 collocation points per interval for trapezoidal rule
-        return new(h, nc)
+        return new(h, nc, f_threads, h_threads)
     end
 end
 
@@ -1171,17 +1182,17 @@ function predict!(
     nu, nx̂, ny, nd, nk, Hp = model.nu, mpc.estim.nx̂, model.ny, model.nd, model.nk, mpc.Hp
     D̂0 = mpc.D̂0
     x̂0 = @views mpc.estim.x̂0[1:nx̂]
-    d0 = @views mpc.d0[1:nd]
+    d̂0 = @views mpc.d0[1:nd]
     for j=1:Hp
         u0     = @views U0[(1 + nu*(j-1)):(nu*j)]
         û0     = @views Û0[(1 + nu*(j-1)):(nu*j)]
         k0     = @views K0[(1 + nk*(j-1)):(nk*j)]
         x̂0next = @views X̂0[(1 + nx̂*(j-1)):(nx̂*j)]
-        f̂!(x̂0next, û0, k0, mpc.estim, model, x̂0, u0, d0)
+        f̂!(x̂0next, û0, k0, mpc.estim, model, x̂0, u0, d̂0)
         x̂0 = @views X̂0[(1 + nx̂*(j-1)):(nx̂*j)]
-        d0 = @views D̂0[(1 + nd*(j-1)):(nd*j)]
+        d̂0 = @views D̂0[(1 + nd*(j-1)):(nd*j)]
         ŷ0 = @views Ŷ0[(1 + ny*(j-1)):(ny*j)]
-        ĥ!(ŷ0, mpc.estim, model, x̂0, d0)
+        ĥ!(ŷ0, mpc.estim, model, x̂0, d̂0)
     end
     Ŷ0    .+= mpc.F # F = Ŷs if mpc.estim is an InternalModel, else F = 0.
     x̂0end  .= x̂0
@@ -1206,21 +1217,21 @@ in which ``\mathbf{x̂_0}`` is the augmented state extracted from the decision v
 """
 function predict!(
     Ŷ0, x̂0end, _, _, _,
-    mpc::PredictiveController, model::NonLinModel, ::TranscriptionMethod,
+    mpc::PredictiveController, model::NonLinModel, transcription::TranscriptionMethod,
     _ , Z̃
 )
     nu, ny, nd, nx̂, Hp, Hc = model.nu, model.ny, model.nd, mpc.estim.nx̂, mpc.Hp, mpc.Hc
+    h_threads = transcription.h_threads
     X̂0 = @views Z̃[(nu*Hc+1):(nu*Hc+nx̂*Hp)] # Z̃ = [ΔU; X̂0; ϵ]
     D̂0 = mpc.D̂0
-    local x̂0
-    for j=1:Hp
+    @threadsif h_threads for j=1:Hp
         x̂0 = @views X̂0[(1 +  nx̂*(j-1)):(nx̂*j)]
-        d0 = @views D̂0[(1 +  nd*(j-1)):(nd*j)]
+        d̂0 = @views D̂0[(1 +  nd*(j-1)):(nd*j)]
         ŷ0 = @views Ŷ0[(1 +  ny*(j-1)):(ny*j)]
-        ĥ!(ŷ0, mpc.estim, model, x̂0, d0)
+        ĥ!(ŷ0, mpc.estim, model, x̂0, d̂0)
     end
     Ŷ0    .+= mpc.F # F = Ŷs if mpc.estim is an InternalModel, else F = 0.
-    x̂0end  .= x̂0
+    x̂0end  .= @views X̂0[(1+nx̂*(Hp-1)):(nx̂*Hp)]
     return Ŷ0, x̂0end
 end
 
@@ -1329,28 +1340,30 @@ in which the augmented state ``\mathbf{x̂_0}`` are extracted from the decision
 """
 function con_nonlinprogeq!(
     geq, X̂0, Û0, K0, 
-    mpc::PredictiveController, model::NonLinModel, ::MultipleShooting, U0, Z̃
+    mpc::PredictiveController, model::NonLinModel, transcription::MultipleShooting, U0, Z̃
 )
     nu, nx̂, nd, nk = model.nu, mpc.estim.nx̂, model.nd, model.nk
     Hp, Hc = mpc.Hp, mpc.Hc
     nΔU, nX̂ = nu*Hc, nx̂*Hp
+    f_threads = transcription.f_threads
     D̂0 = mpc.D̂0
-    X̂0_Z̃ = @views Z̃[(nΔU+1):(nΔU+nX̂)]
-    x̂0 = @views mpc.estim.x̂0[1:nx̂]
-    d0 = @views mpc.d0[1:nd]
-    #TODO: allow parallel for loop or threads? 
-    for j=1:Hp
+    X̂0_Z̃ = @views Z̃[(nΔU+1):(nΔU+nX̂)] 
+    @threadsif f_threads for j=1:Hp
+        if j < 2
+            x̂0 = @views mpc.estim.x̂0[1:nx̂]
+            d̂0 = @views mpc.d0[1:nd]
+        else
+            x̂0 = @views X̂0_Z̃[(1 + nx̂*(j-2)):(nx̂*(j-1))]
+            d̂0 = @views   D̂0[(1 + nd*(j-2)):(nd*(j-1))]
+        end
         u0       = @views   U0[(1 + nu*(j-1)):(nu*j)]
         û0       = @views   Û0[(1 + nu*(j-1)):(nu*j)]
         k0       = @views   K0[(1 + nk*(j-1)):(nk*j)]
-        d0next   = @views   D̂0[(1 + nd*(j-1)):(nd*j)]
         x̂0next   = @views   X̂0[(1 + nx̂*(j-1)):(nx̂*j)]
         x̂0next_Z̃ = @views X̂0_Z̃[(1 + nx̂*(j-1)):(nx̂*j)]
         ŝnext    = @views  geq[(1 + nx̂*(j-1)):(nx̂*j)]
-        f̂!(x̂0next, û0, k0, mpc.estim, model, x̂0, u0, d0)
+        f̂!(x̂0next, û0, k0, mpc.estim, model, x̂0, u0, d̂0)
         ŝnext .= x̂0next .- x̂0next_Z̃
-        x̂0 = x̂0next_Z̃ # using states in Z̃ for next iteration (allow parallel for)
-        d0 = d0next
     end
     return geq
 end
@@ -1400,25 +1413,23 @@ function con_nonlinprogeq!(
     nu, nx̂, nd, nx, h = model.nu, mpc.estim.nx̂, model.nd, model.nx, transcription.h
     Hp, Hc = mpc.Hp, mpc.Hc
     nΔU, nX̂ = nu*Hc, nx̂*Hp
+    f_threads = transcription.f_threads
     Ts, p = model.Ts, model.p
     As, Cs_u = mpc.estim.As, mpc.estim.Cs_u
     nk = get_nk(model, transcription)
     D̂0 = mpc.D̂0
     X̂0_Z̃ = @views Z̃[(nΔU+1):(nΔU+nX̂)]
-    x̂0 = @views mpc.estim.x̂0[1:nx̂]
-    d0 = @views mpc.d0[1:nd]
-    if !iszero(h)
-        k1, u0, û0 = @views K0[1:nx], U0[1:nu], Û0[1:nu]
-        x0, xs     = @views x̂0[1:nx], x̂0[nx+1:end]
-        mul!(û0, Cs_u, xs)                 
-        û0 .+= u0                          
-        model.f!(k1, x0, û0, d0, p)
-        lastk2 = k1
-    end
     #TODO: allow parallel for loop or threads? 
-    for j=1:Hp
+    @threadsif f_threads for j=1:Hp
+        if j < 2
+            x̂0 = @views mpc.estim.x̂0[1:nx̂]
+            d̂0 = @views mpc.d0[1:nd]
+        else
+            x̂0 = @views X̂0_Z̃[(1 + nx̂*(j-2)):(nx̂*(j-1))] 
+            d̂0 = @views   D̂0[(1 + nd*(j-2)):(nd*(j-1))]
+        end
         k0       = @views   K0[(1 + nk*(j-1)):(nk*j)]
-        d0next   = @views   D̂0[(1 + nd*(j-1)):(nd*j)]
+        d̂0next   = @views   D̂0[(1 + nd*(j-1)):(nd*j)]
         x̂0next   = @views   X̂0[(1 + nx̂*(j-1)):(nx̂*j)]
         x̂0next_Z̃ = @views X̂0_Z̃[(1 + nx̂*(j-1)):(nx̂*j)]  
         ŝnext    = @views  geq[(1 + nx̂*(j-1)):(nx̂*j)]  
@@ -1431,39 +1442,28 @@ function con_nonlinprogeq!(
         mul!(xsnext, As, xs)
         ssnext .= @. xsnext - xsnext_Z̃
         # ----------------- deterministic defects --------------------------------------
-        if iszero(h) # piecewise constant manipulated inputs u:
-            u0 = @views U0[(1 + nu*(j-1)):(nu*j)]
-            û0 = @views Û0[(1 + nu*(j-1)):(nu*j)]
-            mul!(û0, Cs_u, xs)                 # ys_u(k) = Cs_u*xs(k)
-            û0 .+= u0                          #   û0(k) = u0(k) + ys_u(k)
-            model.f!(k1, x0, û0, d0, p)
-            model.f!(k2, x0next_Z̃, û0, d0next, p)
-        else # piecewise linear manipulated inputs u:
-            k1 .= lastk2
-            j == Hp && break # special case, treated after the loop
+        u0 = @views U0[(1 + nu*(j-1)):(nu*j)]
+        û0 = @views Û0[(1 + nu*(j-1)):(nu*j)]
+        mul!(û0, Cs_u, xs)                 # ys_u(k) = Cs_u*xs(k)
+        û0 .+= u0                          #   û0(k) = u0(k) + ys_u(k)
+        if f_threads || h < 1 || j < 2
+            # we need to recompute k1 with multi-threading, even with h==1, since the 
+            # last iteration (j-1) may not be executed (iterations are re-orderable)
+            model.f!(k1, x0, û0, d̂0, p)
+        else
+            k1 .= @views K0[(1 + nk*(j-1)-nx):(nk*(j-1))] # k2 of of the last iter. j-1
+        end
+        if h < 1 || j ≥ Hp
+            # j = Hp special case: u(k+Hp-1) = u(k+Hp) since Hc ≤ Hp implies Δu(k+Hp) = 0
+            û0next = û0
+        else
             u0next = @views U0[(1 + nu*j):(nu*(j+1))]
             û0next = @views Û0[(1 + nu*j):(nu*(j+1))]
             mul!(û0next, Cs_u, xsnext_Z̃)      # ys_u(k+1) = Cs_u*xs(k+1)
             û0next .+= u0next                 #   û0(k+1) = u0(k+1) + ys_u(k+1)
-            model.f!(k2, x0next_Z̃, û0next, d0next, p)
-            lastk2 = k2
         end
+        model.f!(k2, x0next_Z̃, û0next, d̂0next, p)
         sdnext .= @. x0 - x0next_Z̃ + 0.5*Ts*(k1 + k2)
-        x̂0 = x̂0next_Z̃ # using states in Z̃ for next iteration (allow parallel for)
-        d0 = d0next
-    end
-    if !iszero(h)
-        # j = Hp special case: u(k+Hp-1) = u(k+Hp) since Hc ≤ Hp implies Δu(k+Hp)=0
-        x̂0, x̂0next_Z̃   = @views X̂0_Z̃[end-2nx̂+1:end-nx̂], X̂0_Z̃[end-nx̂+1:end]
-        k1, k2         = @views K0[end-2nx+1:end-nx], K0[end-nx+1:end] # k1 already filled
-        d0next         = @views D̂0[end-nd+1:end]
-        û0next, u0next = @views Û0[end-nu+1:end], U0[end-nu+1:end] # correspond to u(k+Hp-1)
-        x0, x0next_Z̃, xsnext_Z̃ = @views x̂0[1:nx], x̂0next_Z̃[1:nx], x̂0next_Z̃[nx+1:end]
-        sdnext = @views geq[end-nx̂+1:end-nx̂+nx] # ssnext already filled
-        mul!(û0next, Cs_u, xsnext_Z̃)                 
-        û0next .+= u0next                          
-        model.f!(k2, x0next_Z̃, û0next, d0next, p)
-        sdnext .= @. x0 - x0next_Z̃ + (Ts/2)*(k1 + k2)
     end
     return geq
 end
diff --git a/src/estimator/internal_model.jl b/src/estimator/internal_model.jl
index 64111944b..a73466cf5 100644
--- a/src/estimator/internal_model.jl
+++ b/src/estimator/internal_model.jl
@@ -177,7 +177,7 @@ It calls [`f!`](@ref) directly since this estimator does not augment the states.
 function f̂!(x̂0next, _ , k0, estim::InternalModel, model::NonLinModel, x̂0, u0, d0)
     f!(x̂0next, k0, model, x̂0, u0, d0, model.p)
     x̂0next .+= estim.f̂op .- estim.x̂op
-    return x̂0next
+    return nothing
 end
 
 @doc raw"""
diff --git a/src/general.jl b/src/general.jl
index cf059257b..127feb24b 100644
--- a/src/general.jl
+++ b/src/general.jl
@@ -122,4 +122,15 @@ function inv!(A::Hermitian{<:Real, <:AbstractMatrix})
     invA = inv(Achol)
     A .= Hermitian(invA, :L)
     return A
+end
+
+"Add `Threads.@threads` to a `for` loop if `flag==true`, else leave the loop as is."
+macro threadsif(flag, expr)
+    quote
+        if $(flag)
+            Threads.@threads $expr
+        else
+            $expr
+        end
+    end |> esc
 end
\ No newline at end of file
diff --git a/test/3_test_predictive_control.jl b/test/3_test_predictive_control.jl
index 53a96b78b..9fb542eff 100644
--- a/test/3_test_predictive_control.jl
+++ b/test/3_test_predictive_control.jl
@@ -814,11 +814,13 @@ end
     f! = (ẋ,x,u,_,_) -> ẋ .= -0.001x .+ u 
     h! = (y,x,_,_) -> y .= x 
     nonlinmodel_c = NonLinModel(f!, h!, 500, 1, 1, 1)
-    nmpc5 = NonLinMPC(nonlinmodel_c, Nwt=[0], Hp=100, Hc=1, transcription=TrapezoidalCollocation())
+    transcription = TrapezoidalCollocation(0, f_threads=true, h_threads=true)
+    nmpc5 = NonLinMPC(nonlinmodel_c; Nwt=[0], Hp=100, Hc=1, transcription)
     preparestate!(nmpc5, [0.0])
     u = moveinput!(nmpc5, [1/0.001])
     @test u ≈ [1.0] atol=5e-2
-    nmpc5_1 = NonLinMPC(nonlinmodel_c, Nwt=[0], Hp=100, Hc=1, transcription=TrapezoidalCollocation(1))
+    transcription = TrapezoidalCollocation(1)
+    nmpc5_1 = NonLinMPC(nonlinmodel_c; Nwt=[0], Hp=100, Hc=1, transcription)
     preparestate!(nmpc5_1, [0.0])
     u = moveinput!(nmpc5_1, [1/0.001])
     @test u ≈ [1.0] atol=5e-2
@@ -831,13 +833,22 @@ end
     ModelPredictiveControl.h!(y, nonlinmodel2, Float32[0,0], Float32[0], nonlinmodel2.p)
     preparestate!(nmpc7, [0], [0])
     @test moveinput!(nmpc7, [0], [0]) ≈ [0.0] atol=5e-2
-    nmpc8 = NonLinMPC(nonlinmodel, Nwt=[0], Hp=100, Hc=1, transcription=MultipleShooting())
+    transcription = MultipleShooting()
+    nmpc8 = NonLinMPC(nonlinmodel; Nwt=[0], Hp=100, Hc=1, transcription)
     preparestate!(nmpc8, [0], [0])
     u = moveinput!(nmpc8, [10], [0])
     @test u ≈ [2] atol=5e-2
     info = getinfo(nmpc8)
     @test info[:u] ≈ u
     @test info[:Ŷ][end] ≈ 10 atol=5e-2
+    transcription = MultipleShooting(f_threads=true, h_threads=true)
+    nmpc8t = NonLinMPC(nonlinmodel; Nwt=[0], Hp=100, Hc=1, transcription)
+    preparestate!(nmpc8t, [0], [0])
+    u = moveinput!(nmpc8t, [10], [0])
+    @test u ≈ [2] atol=5e-2
+    info = getinfo(nmpc8t)
+    @test info[:u] ≈ u
+    @test info[:Ŷ][end] ≈ 10 atol=5e-2
     nmpc9 = NonLinMPC(linmodel, Nwt=[0], Hp=100, Hc=1, transcription=MultipleShooting())
     preparestate!(nmpc9, [10])
     u = moveinput!(nmpc9, [20])