diff --git a/Project.toml b/Project.toml
index fe51ed26b..0cb6a02d9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -42,7 +42,7 @@ Logging = "1.10"
 LoggingExtras = "0.4, 1"
 Lux = "1.12.4"
 MLUtils = "0.4"
-ModelingToolkit = "10"
+ModelingToolkit = "10.23"
 Mooncake = "0.4.138"
 Optim = ">= 1.4.1"
 OptimizationBase = "2"
@@ -57,10 +57,9 @@ Random = "1.10"
 Reexport = "1.2"
 ReverseDiff = "1"
 SafeTestsets = "0.1"
-SciMLBase = "2.82"
+SciMLBase = "2.104"
 SciMLSensitivity = "7"
 SparseArrays = "1.10"
-SparseDiffTools = "2"
 Symbolics = "6"
 TerminalLoggers = "0.1"
 Test = "1.10"
@@ -96,7 +95,6 @@ ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
@@ -106,5 +104,5 @@ Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 [targets]
 test = ["Aqua", "BenchmarkTools", "Boltz", "ComponentArrays", "DiffEqFlux", "Enzyme", "FiniteDiff", "Flux", "ForwardDiff",
     "Ipopt", "IterTools", "Lux", "MLUtils", "ModelingToolkit", "Optim", "OptimizationMOI", "OptimizationOptimJL", "OptimizationOptimisers", 
-    "OrdinaryDiffEqTsit5", "Pkg", "Random", "ReverseDiff", "SafeTestsets", "SciMLSensitivity", "SparseArrays", "SparseDiffTools",
+    "OrdinaryDiffEqTsit5", "Pkg", "Random", "ReverseDiff", "SafeTestsets", "SciMLSensitivity", "SparseArrays",
     "Symbolics",  "Test", "Tracker", "Zygote", "Mooncake"]
diff --git a/docs/Project.toml b/docs/Project.toml
index aefa3caf2..cd460edfc 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -60,7 +60,7 @@ Lux = "1"
 MLUtils = "0.4.4"
 Manifolds = "0.9"
 Manopt = "0.4"
-ModelingToolkit = "10"
+ModelingToolkit = "10.23"
 NLPModels = "0.21"
 NLPModelsTest = "0.10"
 NLopt = "0.6, 1"
diff --git a/lib/OptimizationBBO/src/OptimizationBBO.jl b/lib/OptimizationBBO/src/OptimizationBBO.jl
index e8b1ebef1..57f874356 100644
--- a/lib/OptimizationBBO/src/OptimizationBBO.jl
+++ b/lib/OptimizationBBO/src/OptimizationBBO.jl
@@ -2,6 +2,7 @@ module OptimizationBBO
 
 using Reexport
 import Optimization
+import Optimization: OptimizationBase
 import BlackBoxOptim, Optimization.SciMLBase
 import Optimization.SciMLBase: MultiObjectiveOptimizationFunction
 
diff --git a/lib/OptimizationBase/Project.toml b/lib/OptimizationBase/Project.toml
index 590acaf34..73466270a 100644
--- a/lib/OptimizationBase/Project.toml
+++ b/lib/OptimizationBase/Project.toml
@@ -51,11 +51,11 @@ ForwardDiff = "0.10.26, 1"
 LinearAlgebra = "1.9, 1.10"
 MLDataDevices = "1"
 MLUtils = "0.4"
-ModelingToolkit = "9, 10"
+ModelingToolkit = "10.23"
 PDMats = "0.11"
 Reexport = "1.2"
 ReverseDiff = "1.14"
-SciMLBase = "2"
+SciMLBase = "2.104"
 SparseConnectivityTracer = "0.6, 1"
 SparseMatrixColorings = "0.4"
 SymbolicAnalysis = "0.3"
diff --git a/lib/OptimizationBase/ext/OptimizationMTKExt.jl b/lib/OptimizationBase/ext/OptimizationMTKExt.jl
index a15ba3171..3526cb06f 100644
--- a/lib/OptimizationBase/ext/OptimizationMTKExt.jl
+++ b/lib/OptimizationBase/ext/OptimizationMTKExt.jl
@@ -21,24 +21,25 @@ function OptimizationBase.instantiate_function(
             num_cons))))
     #sys = ModelingToolkit.structural_simplify(sys)
     # don't need to pass `x` or `p` since they're defaults now
-    f = OptimizationProblem(sys, nothing; grad = g, hess = h,
+    mtkprob = OptimizationProblem(sys, nothing; grad = g, hess = h,
         sparse = true, cons_j = cons_j, cons_h = cons_h,
-        cons_sparse = true).f
+        cons_sparse = true)
+    f = mtkprob.f
 
-    grad = (G, θ, args...) -> f.grad(G, θ, p, args...)
+    grad = (G, θ, args...) -> f.grad(G, θ, mtkprob.p, args...)
 
-    hess = (H, θ, args...) -> f.hess(H, θ, p, args...)
+    hess = (H, θ, args...) -> f.hess(H, θ, mtkprob.p, args...)
 
     hv = function (H, θ, v, args...)
-        res = (eltype(θ)).(f.hess_prototype)
+        res = similar(f.hess_prototype, eltype(θ))
         hess(res, θ, args...)
         H .= res * v
     end
 
     if !isnothing(f.cons)
-        cons = (res, θ) -> f.cons(res, θ, p)
-        cons_j = (J, θ) -> f.cons_j(J, θ, p)
-        cons_h = (res, θ) -> f.cons_h(res, θ, p)
+        cons = (res, θ) -> f.cons(res, θ, mtkprob.p)
+        cons_j = (J, θ) -> f.cons_j(J, θ, mtkprob.p)
+        cons_h = (res, θ) -> f.cons_h(res, θ, mtkprob.p)
     else
         cons = nothing
         cons_j = nothing
@@ -72,24 +73,24 @@ function OptimizationBase.instantiate_function(
             num_cons))))
     #sys = ModelingToolkit.structural_simplify(sys)
     # don't need to pass `x` or `p` since they're defaults now
-    f = OptimizationProblem(sys, nothing; grad = g, hess = h,
+    mtkprob = OptimizationProblem(sys, nothing; grad = g, hess = h,
         sparse = true, cons_j = cons_j, cons_h = cons_h,
-        cons_sparse = true).f
+        cons_sparse = true)
+    f = mtkprob.f
 
-    grad = (G, θ, args...) -> f.grad(G, θ, cache.p, args...)
+    grad = (G, θ, args...) -> f.grad(G, θ, mtkprob.p, args...)
 
-    hess = (H, θ, args...) -> f.hess(H, θ, cache.p, args...)
+    hess = (H, θ, args...) -> f.hess(H, θ, mtkprob.p, args...)
 
     hv = function (H, θ, v, args...)
-        res = (eltype(θ)).(f.hess_prototype)
+        res = similar(f.hess_prototype, eltype(θ))
         hess(res, θ, args...)
         H .= res * v
     end
-
     if !isnothing(f.cons)
-        cons = (res, θ) -> f.cons(res, θ, cache.p)
-        cons_j = (J, θ) -> f.cons_j(J, θ, cache.p)
-        cons_h = (res, θ) -> f.cons_h(res, θ, cache.p)
+        cons = (res, θ) -> f.cons(res, θ, mtkprob.p)
+        cons_j = (J, θ) -> f.cons_j(J, θ, mtkprob.p)
+        cons_h = (res, θ) -> f.cons_h(res, θ, mtkprob.p)
     else
         cons = nothing
         cons_j = nothing
@@ -121,13 +122,14 @@ function OptimizationBase.instantiate_function(
             num_cons))))
     #sys = ModelingToolkit.structural_simplify(sys)
     # don't need to pass `x` or `p` since they're defaults now
-    f = OptimizationProblem(sys, nothing; grad = g, hess = h,
+    mtkprob = OptimizationProblem(sys, nothing; grad = g, hess = h,
         sparse = false, cons_j = cons_j, cons_h = cons_h,
-        cons_sparse = false).f
+        cons_sparse = false)
+    f = mtkprob.f
 
-    grad = (G, θ, args...) -> f.grad(G, θ, p, args...)
+    grad = (G, θ, args...) -> f.grad(G, θ, mtkprob.p, args...)
 
-    hess = (H, θ, args...) -> f.hess(H, θ, p, args...)
+    hess = (H, θ, args...) -> f.hess(H, θ, mtkprob.p, args...)
 
     hv = function (H, θ, v, args...)
         res = ArrayInterface.zeromatrix(θ)
@@ -136,9 +138,9 @@ function OptimizationBase.instantiate_function(
     end
 
     if !isnothing(f.cons)
-        cons = (res, θ) -> f.cons(res, θ, p)
-        cons_j = (J, θ) -> f.cons_j(J, θ, p)
-        cons_h = (res, θ) -> f.cons_h(res, θ, p)
+        cons = (res, θ) -> f.cons(res, θ, mtkprob.p)
+        cons_j = (J, θ) -> f.cons_j(J, θ, mtkprob.p)
+        cons_h = (res, θ) -> f.cons_h(res, θ, mtkprob.p)
     else
         cons = nothing
         cons_j = nothing
@@ -172,13 +174,14 @@ function OptimizationBase.instantiate_function(
             num_cons))))
     #sys = ModelingToolkit.structural_simplify(sys)
     # don't need to pass `x` or `p` since they're defaults now
-    f = OptimizationProblem(sys, nothing; grad = g, hess = h,
+    mtkprob = OptimizationProblem(sys, nothing; grad = g, hess = h,
         sparse = false, cons_j = cons_j, cons_h = cons_h,
-        cons_sparse = false).f
+        cons_sparse = false)
+    f = mtkprob.f
 
-    grad = (G, θ, args...) -> f.grad(G, θ, cache.p, args...)
+    grad = (G, θ, args...) -> f.grad(G, θ, mtkprob.p, args...)
 
-    hess = (H, θ, args...) -> f.hess(H, θ, cache.p, args...)
+    hess = (H, θ, args...) -> f.hess(H, θ, mtkprob.p, args...)
 
     hv = function (H, θ, v, args...)
         res = ArrayInterface.zeromatrix(θ)
@@ -187,9 +190,9 @@ function OptimizationBase.instantiate_function(
     end
 
     if !isnothing(f.cons)
-        cons = (res, θ) -> f.cons(res, θ, cache.p)
-        cons_j = (J, θ) -> f.cons_j(J, θ, cache.p)
-        cons_h = (res, θ) -> f.cons_h(res, θ, cache.p)
+        cons = (res, θ) -> f.cons(res, θ, mtkprob.p)
+        cons_j = (J, θ) -> f.cons_j(J, θ, mtkprob.p)
+        cons_h = (res, θ) -> f.cons_h(res, θ, mtkprob.p)
     else
         cons = nothing
         cons_j = nothing
diff --git a/lib/OptimizationBase/src/function.jl b/lib/OptimizationBase/src/function.jl
index f05e035c1..9c07554ce 100644
--- a/lib/OptimizationBase/src/function.jl
+++ b/lib/OptimizationBase/src/function.jl
@@ -56,11 +56,11 @@ function OptimizationBase.instantiate_function(
     cons_vjp = f.cons_vjp === nothing ? nothing : (res, x) -> f.cons_vjp(res, x, p)
     cons_h = f.cons_h === nothing ? nothing : (res, x) -> f.cons_h(res, x, p)
     hess_prototype = f.hess_prototype === nothing ? nothing :
-                     convert.(eltype(x), f.hess_prototype)
+                     similar(f.hess_prototype, eltype(x))
     cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing :
-                         convert.(eltype(x), f.cons_jac_prototype)
+                         similar(f.cons_jac_prototype, eltype(x))
     cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing :
-                          [convert.(eltype(x), f.cons_hess_prototype[i])
+                          [similar(f.cons_hess_prototype[i], eltype(x))
                            for i in 1:num_cons]
     expr = symbolify(f.expr)
     cons_expr = symbolify.(f.cons_expr)
@@ -90,11 +90,11 @@ function OptimizationBase.instantiate_function(
     cons_vjp = f.cons_vjp === nothing ? nothing : (res, x) -> f.cons_vjp(res, x, cache.p)
     cons_h = f.cons_h === nothing ? nothing : (res, x) -> f.cons_h(res, x, cache.p)
     hess_prototype = f.hess_prototype === nothing ? nothing :
-                     convert.(eltype(cache.u0), f.hess_prototype)
+                     similar(f.hess_prototype, eltype(cache.u0))
     cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing :
-                         convert.(eltype(cache.u0), f.cons_jac_prototype)
+                         similar(f.cons_jac_prototype, eltype(cache.u0))
     cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing :
-                          [convert.(eltype(cache.u0), f.cons_hess_prototype[i])
+                          [similar(f.cons_hess_prototype[i], eltype(cache.u0))
                            for i in 1:num_cons]
     expr = symbolify(f.expr)
     cons_expr = symbolify.(f.cons_expr)
@@ -196,11 +196,11 @@ function OptimizationBase.instantiate_function(
         end
     end
     hess_prototype = f.hess_prototype === nothing ? nothing :
-                     convert.(eltype(x), f.hess_prototype)
+                     similar(f.hess_prototype, eltype(x))
     cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing :
-                         convert.(eltype(x), f.cons_jac_prototype)
+                         similar(f.cons_jac_prototype, eltype(x))
     cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing :
-                          [convert.(eltype(x), f.cons_hess_prototype[i])
+                          [similar(f.cons_hess_prototype[i], eltype(x))
                            for i in 1:num_cons]
     expr = symbolify(f.expr)
     cons_expr = symbolify.(f.cons_expr)
diff --git a/lib/OptimizationIpopt/Project.toml b/lib/OptimizationIpopt/Project.toml
index 4b2d8a1fe..f61d79e99 100644
--- a/lib/OptimizationIpopt/Project.toml
+++ b/lib/OptimizationIpopt/Project.toml
@@ -14,7 +14,7 @@ SymbolicIndexingInterface = "2efcf032-c050-4f8e-a9bb-153293bab1f5"
 [compat]
 Ipopt = "1.10.3"
 LinearAlgebra = "1.10.0"
-ModelingToolkit = "10.20"
+ModelingToolkit = "10.23"
 Optimization = "4.3.0"
 SciMLBase = "2.90.0"
 SparseArrays = "1.10.0"
diff --git a/lib/OptimizationIpopt/src/cache.jl b/lib/OptimizationIpopt/src/cache.jl
index f95b9981b..27ee75c37 100644
--- a/lib/OptimizationIpopt/src/cache.jl
+++ b/lib/OptimizationIpopt/src/cache.jl
@@ -94,13 +94,13 @@ function IpoptCache(prob, opt;
     J = if isnothing(f.cons_jac_prototype)
         zeros(T, num_cons, n)
     else
-        convert.(T, f.cons_jac_prototype)
+        similar(f.cons_jac_prototype, T)
     end
     lagh = !isnothing(f.lag_hess_prototype)
     H = if lagh # lag hessian takes precedence
-        convert.(T, f.lag_hess_prototype)
+        similar(f.lag_hess_prototype, T)
     elseif !isnothing(f.hess_prototype)
-        convert.(T, f.hess_prototype)
+        similar(f.hess_prototype, T)
     else
         zeros(T, n, n)
     end
@@ -109,7 +109,7 @@ function IpoptCache(prob, opt;
     elseif isnothing(f.cons_hess_prototype)
         Matrix{T}[zeros(T, n, n) for i in 1:num_cons]
     else
-        [convert.(T, f.cons_hess_prototype[i]) for i in 1:num_cons]
+        [similar(f.cons_hess_prototype[i], T) for i in 1:num_cons]
     end
     lcons = prob.lcons === nothing ? fill(T(-Inf), num_cons) : prob.lcons
     ucons = prob.ucons === nothing ? fill(T(Inf), num_cons) : prob.ucons
diff --git a/lib/OptimizationMOI/Project.toml b/lib/OptimizationMOI/Project.toml
index 019cacd31..f02007f6f 100644
--- a/lib/OptimizationMOI/Project.toml
+++ b/lib/OptimizationMOI/Project.toml
@@ -22,7 +22,7 @@ Ipopt_jll = "300.1400"
 Juniper = "0.9"
 LinearAlgebra = "1"
 MathOptInterface = "1"
-ModelingToolkit = "10"
+ModelingToolkit = "10.23"
 NLopt = "1"
 Optimization = "4.4"
 Reexport = "1.2"
@@ -31,7 +31,7 @@ SparseArrays = "1.6"
 SymbolicIndexingInterface = "0.3"
 Symbolics = "6"
 Test = "1.6"
-Zygote = "0.6"
+Zygote = "0.6, 0.7"
 julia = "1.10"
 
 [extras]
diff --git a/lib/OptimizationMOI/src/nlp.jl b/lib/OptimizationMOI/src/nlp.jl
index 97a5d7d29..5c8b9be00 100644
--- a/lib/OptimizationMOI/src/nlp.jl
+++ b/lib/OptimizationMOI/src/nlp.jl
@@ -123,17 +123,17 @@ function MOIOptimizationNLPCache(prob::OptimizationProblem,
     end
     T = eltype(prob.u0)
     n = length(prob.u0)
-
     J = if isnothing(f.cons_jac_prototype)
         zeros(T, num_cons, n)
     else
-        convert.(T, f.cons_jac_prototype)
+        similar(f.cons_jac_prototype, T)
     end
     lagh = !isnothing(f.lag_hess_prototype)
+
     H = if lagh # lag hessian takes precedence
-        convert.(T, f.lag_hess_prototype)
+        similar(f.lag_hess_prototype, T)
     elseif !isnothing(f.hess_prototype)
-        convert.(T, f.hess_prototype)
+        similar(f.hess_prototype, T)
     else
         zeros(T, n, n)
     end
@@ -142,7 +142,7 @@ function MOIOptimizationNLPCache(prob::OptimizationProblem,
     elseif isnothing(f.cons_hess_prototype)
         Matrix{T}[zeros(T, n, n) for i in 1:num_cons]
     else
-        [convert.(T, f.cons_hess_prototype[i]) for i in 1:num_cons]
+        [similar(f.cons_hess_prototype[i], T) for i in 1:num_cons]
     end
     lcons = prob.lcons === nothing ? fill(T(-Inf), num_cons) : prob.lcons
     ucons = prob.ucons === nothing ? fill(T(Inf), num_cons) : prob.ucons
diff --git a/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl b/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl
index b292858d7..ca51d29b2 100644
--- a/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl
+++ b/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl
@@ -213,7 +213,7 @@ function SciMLBase.__solve(cache::OptimizationCache{
             isnothing(cache.f.hess_prototype) ?
             Optim.NLSolversBase.alloc_H(cache.u0,
                 real(zero(u0_type))) :
-            convert.(u0_type, cache.f.hess_prototype))
+            similar(cache.f.hess_prototype, u0_type))
     end
 
     opt_args = __map_optimizer_args(cache, cache.opt, callback = _cb,
@@ -414,7 +414,7 @@ function SciMLBase.__solve(cache::OptimizationCache{
             isnothing(cache.f.hess_prototype) ?
             Optim.NLSolversBase.alloc_H(cache.u0,
                 real(zero(u0_type))) :
-            convert.(u0_type, cache.f.hess_prototype))
+            similar(cache.f.hess_prototype, u0_type))
     else
         Optim.OnceDifferentiable(_loss, gg, fg!, cache.u0,
             real(zero(u0_type)),
diff --git a/src/auglag.jl b/src/auglag.jl
index 68226f0fb..f7c006768 100644
--- a/src/auglag.jl
+++ b/src/auglag.jl
@@ -128,7 +128,7 @@ function SciMLBase.__solve(cache::OptimizationCache{
         function aug_grad(G, θ, p)
             cache.f.grad(G, θ, p)
             if !isnothing(cache.f.cons_jac_prototype)
-                J = Float64.(cache.f.cons_jac_prototype)
+                J = similar(cache.f.cons_jac_prototype, Float64)
             else
                 J = zeros((length(cache.lcons), length(θ)))
             end
diff --git a/src/lbfgsb.jl b/src/lbfgsb.jl
index 2c6d5f83c..c04be7986 100644
--- a/src/lbfgsb.jl
+++ b/src/lbfgsb.jl
@@ -148,7 +148,7 @@ function SciMLBase.__solve(cache::OptimizationCache{
         function aug_grad(G, θ)
             cache.f.grad(G, θ)
             if !isnothing(cache.f.cons_jac_prototype)
-                J = Float64.(cache.f.cons_jac_prototype)
+                J = similar(cache.f.cons_jac_prototype, Float64)
             else
                 J = zeros((length(cache.lcons), length(θ)))
             end
diff --git a/test/ADtests.jl b/test/ADtests.jl
index 90b08477b..fecc72630 100644
--- a/test/ADtests.jl
+++ b/test/ADtests.jl
@@ -33,7 +33,7 @@ end
 end
 
 @testset "No constraint" begin
-    for adtype in [AutoEnzyme(), AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
+    @testset "$adtype" for adtype in [AutoEnzyme(), AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
         AutoFiniteDiff(), AutoModelingToolkit(), AutoSparseForwardDiff(),
         AutoSparseReverseDiff(), AutoSparse(AutoZygote()), AutoModelingToolkit(true, true), AutoMooncake()]
         optf = OptimizationFunction(rosenbrock, adtype)
@@ -71,7 +71,7 @@ end
 end
 
 @testset "One constraint" begin
-    for adtype in [AutoEnzyme(), AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
+    @testset "$adtype" for adtype in [AutoEnzyme(), AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
         AutoFiniteDiff(), AutoModelingToolkit(), AutoSparseForwardDiff(),
         AutoSparseReverseDiff(), AutoSparse(AutoZygote()), AutoModelingToolkit(true, true), AutoMooncake()]
         cons = (res, x, p) -> (res[1] = x[1]^2 + x[2]^2 - 1.0; return nothing)
@@ -92,7 +92,7 @@ end
 end
 
 @testset "Two constraints" begin
-    for adtype in [AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
+    @testset "$adtype" for adtype in [AutoForwardDiff(), AutoZygote(), AutoReverseDiff(),
         AutoFiniteDiff(), AutoModelingToolkit(), AutoSparseForwardDiff(),
         AutoSparseReverseDiff(), AutoSparse(AutoZygote()), AutoModelingToolkit(true, true), AutoMooncake()]
         function con2_c(res, x, p)