diff --git a/CHANGELOG.md b/CHANGELOG.md
index e47df2f05..fb2d91e9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `hellinger_dist`
   - `bures_dist`
   - `bures_angle`
+- Align `steadystate` ODE solver to other methods and improve GPU support. ([#421])
 
 ## [v0.27.0]
 Release date: 2025-02-14
@@ -165,3 +166,4 @@ Release date: 2024-11-13
 [#418]: https://github.com/qutip/QuantumToolbox.jl/issues/418
 [#419]: https://github.com/qutip/QuantumToolbox.jl/issues/419
 [#420]: https://github.com/qutip/QuantumToolbox.jl/issues/420
+[#421]: https://github.com/qutip/QuantumToolbox.jl/issues/421
diff --git a/docs/src/users_guide/steadystate.md b/docs/src/users_guide/steadystate.md
index bd68d5a58..a74353b90 100644
--- a/docs/src/users_guide/steadystate.md
+++ b/docs/src/users_guide/steadystate.md
@@ -37,10 +37,6 @@ To change a solver, use the keyword argument `solver`, for example:
 ρ_ss = steadystate(H, c_ops; solver = SteadyStateLinearSolver())
 ```
 
-!!! note "Initial state for `SteadyStateODESolver()`"
-    It is necessary to provide the initial state `ψ0` for ODE solving method, namely
-    `steadystate(H, ψ0, tspan, c_ops, solver = SteadyStateODESolver())`, where `tspan::Real` represents the final time step, defaults to `Inf` (infinity).
-
 Although it is not obvious, the [`SteadyStateDirectSolver()`](@ref SteadyStateDirectSolver) and [`SteadyStateEigenSolver()`](@ref SteadyStateEigenSolver) methods all use an LU decomposition internally and thus can have a large memory overhead. In contrast, for [`SteadyStateLinearSolver()`](@ref SteadyStateLinearSolver), iterative algorithms provided by [`LinearSolve.jl`](https://docs.sciml.ai/LinearSolve/stable/solvers/solvers/), such as `KrylovJL_GMRES()` and `KrylovJL_BICGSTAB()`, do not factor the matrix and thus take less memory than the LU methods and allow, in principle, for extremely large system sizes. The downside is that these methods can take much longer than the direct method as the condition number of the Liouvillian matrix is large, indicating that these iterative methods require a large number of iterations for convergence. 
 
 To overcome this, one can provide preconditioner that solves for an approximate inverse for the (modified) Liouvillian, thus better conditioning the problem, leading to faster convergence. The left and right preconditioner can be specified as the keyword argument `Pl` and `Pr`, respectively:
diff --git a/ext/QuantumToolboxCUDAExt.jl b/ext/QuantumToolboxCUDAExt.jl
index 70168b2b1..b4fcf0fc2 100644
--- a/ext/QuantumToolboxCUDAExt.jl
+++ b/ext/QuantumToolboxCUDAExt.jl
@@ -2,6 +2,7 @@ module QuantumToolboxCUDAExt
 
 using QuantumToolbox
 using QuantumToolbox: makeVal, getVal
+import QuantumToolbox: _sparse_similar
 import CUDA: cu, CuArray, allowscalar
 import CUDA.CUSPARSE: CuSparseVector, CuSparseMatrixCSC, CuSparseMatrixCSR, AbstractCuSparseArray
 import SparseArrays: SparseVector, SparseMatrixCSC
@@ -106,4 +107,7 @@ QuantumToolbox.to_dense(A::MT) where {MT<:AbstractCuSparseArray} = CuArray(A)
 QuantumToolbox.to_dense(::Type{T1}, A::CuArray{T2}) where {T1<:Number,T2<:Number} = CuArray{T1}(A)
 QuantumToolbox.to_dense(::Type{T}, A::AbstractCuSparseArray) where {T<:Number} = CuArray{T}(A)
 
+QuantumToolbox._sparse_similar(A::CuSparseMatrixCSC, args...) = sparse(args..., fmt = :csc)
+QuantumToolbox._sparse_similar(A::CuSparseMatrixCSR, args...) = sparse(args..., fmt = :csr)
+
 end
diff --git a/src/steadystate.jl b/src/steadystate.jl
index e0296954a..921c7b398 100644
--- a/src/steadystate.jl
+++ b/src/steadystate.jl
@@ -27,7 +27,7 @@ struct SteadyStateEigenSolver <: SteadyStateSolver end
 
 A solver which solves [`steadystate`](@ref) by finding the inverse of Liouvillian [`SuperOperator`](@ref) using the `alg`orithms given in [`LinearSolve.jl`](https://docs.sciml.ai/LinearSolve/stable/).
 
-# Parameters
+# Arguments
 - `alg::SciMLLinearSolveAlgorithm=KrylovJL_GMRES()`: algorithms given in [`LinearSolve.jl`](https://docs.sciml.ai/LinearSolve/stable/)
 - `Pl::Union{Function,Nothing}=nothing`: left preconditioner, see documentation [Solving for Steady-State Solutions](@ref doc:Solving-for-Steady-State-Solutions) for more details.
 - `Pr::Union{Function,Nothing}=nothing`: right preconditioner, see documentation [Solving for Steady-State Solutions](@ref doc:Solving-for-Steady-State-Solutions) for more details.
@@ -43,16 +43,40 @@ Base.@kwdef struct SteadyStateLinearSolver{
 end
 
 @doc raw"""
-    SteadyStateODESolver(alg = Tsit5())
+    SteadyStateODESolver(
+        alg = Tsit5(),
+        ψ0 = nothing,
+        tmax = Inf,
+        )
 
 An ordinary differential equation (ODE) solver for solving [`steadystate`](@ref).
 
-It includes a field (attribute) `SteadyStateODESolver.alg` that specifies the solving algorithm. Default to `Tsit5()`.
+Solve the stationary state based on time evolution (ordinary differential equations; `OrdinaryDiffEq.jl`) with a given initial state.
+
+The termination condition of the stationary state ``|\rho\rangle\rangle`` is that either the following condition is `true`:
+
+```math
+\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}\rVert \leq \textrm{reltol} \times\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}+|\hat{\rho}\rangle\rangle\rVert
+```
+
+or
 
-For more details about the solvers, please refer to [`OrdinaryDiffEq.jl`](https://docs.sciml.ai/OrdinaryDiffEq/stable/)
+```math
+\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}\rVert \leq \textrm{abstol}
+```
+
+# Arguments
+- `alg::OrdinaryDiffEqAlgorithm=Tsit5()`: The algorithm to solve the ODE.
+- `ψ0::Union{Nothing,QuantumObject}=nothing`: The initial state of the system. If not specified, a random pure state will be generated.
+- `tmax::Real=Inf`: The final time step for the steady state problem.
+
+For more details about the solvers, please refer to [`OrdinaryDiffEq.jl`](https://docs.sciml.ai/OrdinaryDiffEq/stable/).
 """
-Base.@kwdef struct SteadyStateODESolver{MT<:OrdinaryDiffEqAlgorithm} <: SteadyStateSolver
+Base.@kwdef struct SteadyStateODESolver{MT<:OrdinaryDiffEqAlgorithm,ST<:Union{Nothing,QuantumObject},T<:Real} <:
+                   SteadyStateSolver
     alg::MT = Tsit5()
+    ψ0::ST = nothing
+    tmax::T = Inf
 end
 
 @doc raw"""
@@ -108,18 +132,18 @@ function _steadystate(L::QuantumObject{SuperOperatorQuantumObject}, solver::Stea
     N = prod(L.dimensions)
     weight = norm(L_tmp, 1) / length(L_tmp)
 
-    v0 = _get_dense_similar(L_tmp, N^2)
+    v0 = _dense_similar(L_tmp, N^2)
     fill!(v0, 0)
     allowed_setindex!(v0, weight, 1) # Because scalar indexing is not allowed on GPU arrays
 
     idx_range = collect(1:N)
-    rows = _get_dense_similar(L_tmp, N)
-    cols = _get_dense_similar(L_tmp, N)
-    vals = _get_dense_similar(L_tmp, N)
+    rows = _dense_similar(L_tmp, N)
+    cols = _dense_similar(L_tmp, N)
+    vals = _dense_similar(L_tmp, N)
     fill!(rows, 1)
     copyto!(cols, N .* (idx_range .- 1) .+ idx_range)
     fill!(vals, weight)
-    Tn = sparse(rows, cols, vals, N^2, N^2)
+    Tn = _sparse_similar(L_tmp, rows, cols, vals, N^2, N^2)
     L_tmp = L_tmp + Tn
 
     (haskey(kwargs, :Pl) || haskey(kwargs, :Pr)) && error("The use of preconditioners must be defined in the solver.")
@@ -155,14 +179,14 @@ function _steadystate(L::QuantumObject{SuperOperatorQuantumObject}, solver::Stea
     N = prod(L.dimensions)
     weight = norm(L_tmp, 1) / length(L_tmp)
 
-    v0 = _get_dense_similar(L_tmp, N^2)
+    v0 = _dense_similar(L_tmp, N^2)
     fill!(v0, 0)
     allowed_setindex!(v0, weight, 1) # Because scalar indexing is not allowed on GPU arrays
 
     idx_range = collect(1:N)
-    rows = _get_dense_similar(L_tmp, N)
-    cols = _get_dense_similar(L_tmp, N)
-    vals = _get_dense_similar(L_tmp, N)
+    rows = _dense_similar(L_tmp, N)
+    cols = _dense_similar(L_tmp, N)
+    vals = _dense_similar(L_tmp, N)
     fill!(rows, 1)
     copyto!(cols, N .* (idx_range .- 1) .+ idx_range)
     fill!(vals, weight)
@@ -175,68 +199,20 @@ function _steadystate(L::QuantumObject{SuperOperatorQuantumObject}, solver::Stea
     return QuantumObject(ρss, Operator, L.dimensions)
 end
 
-_steadystate(L::QuantumObject{SuperOperatorQuantumObject}, solver::SteadyStateODESolver; kwargs...) = throw(
-    ArgumentError(
-        "The initial state ψ0 is required for SteadyStateODESolver, use the following call instead: `steadystate(H, ψ0, tmax, c_ops)`.",
-    ),
-)
+function _steadystate(L::QuantumObject{SuperOperatorQuantumObject}, solver::SteadyStateODESolver; kwargs...)
+    tmax = solver.tmax
 
-@doc raw"""
-    steadystate(
-        H::QuantumObject{HOpType},
-        ψ0::QuantumObject{StateOpType},
-        tmax::Real = Inf,
-        c_ops::Union{Nothing,AbstractVector,Tuple} = nothing;
-        solver::SteadyStateODESolver = SteadyStateODESolver(),
-        reltol::Real = 1.0e-8,
-        abstol::Real = 1.0e-10,
-        kwargs...,
-    )
-
-Solve the stationary state based on time evolution (ordinary differential equations; `OrdinaryDiffEq.jl`) with a given initial state.
+    ψ0 = isnothing(solver.ψ0) ? rand_ket(L.dimensions) : solver.ψ0
+    abstol = haskey(kwargs, :abstol) ? kwargs[:abstol] : DEFAULT_ODE_SOLVER_OPTIONS.abstol
+    reltol = haskey(kwargs, :reltol) ? kwargs[:reltol] : DEFAULT_ODE_SOLVER_OPTIONS.reltol
 
-The termination condition of the stationary state ``|\rho\rangle\rangle`` is that either the following condition is `true`:
-
-```math
-\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}\rVert \leq \textrm{reltol} \times\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}+|\hat{\rho}\rangle\rangle\rVert
-```
-
-or
-
-```math
-\lVert\frac{\partial |\hat{\rho}\rangle\rangle}{\partial t}\rVert \leq \textrm{abstol}
-```
-
-# Parameters
-- `H`: The Hamiltonian or the Liouvillian of the system.
-- `ψ0`: The initial state of the system.
-- `tmax=Inf`: The final time step for the steady state problem.
-- `c_ops=nothing`: The list of the collapse operators.
-- `solver`: see [`SteadyStateODESolver`](@ref) for more details.
-- `reltol=1.0e-8`: Relative tolerance in steady state terminate condition and solver adaptive timestepping.
-- `abstol=1.0e-10`: Absolute tolerance in steady state terminate condition and solver adaptive timestepping.
-- `kwargs`: The keyword arguments for the ODEProblem.
-"""
-function steadystate(
-    H::QuantumObject{HOpType},
-    ψ0::QuantumObject{StateOpType},
-    tmax::Real = Inf,
-    c_ops::Union{Nothing,AbstractVector,Tuple} = nothing;
-    solver::SteadyStateODESolver = SteadyStateODESolver(),
-    reltol::Real = 1.0e-8,
-    abstol::Real = 1.0e-10,
-    kwargs...,
-) where {
-    HOpType<:Union{OperatorQuantumObject,SuperOperatorQuantumObject},
-    StateOpType<:Union{KetQuantumObject,OperatorQuantumObject},
-}
     ftype = _FType(ψ0)
-    cb = TerminateSteadyState(abstol, reltol, _steadystate_ode_condition)
+    _terminate_func = SteadyStateODECondition(similar(mat2vec(ket2dm(ψ0)).data))
+    cb = TerminateSteadyState(abstol, reltol, _terminate_func)
     sol = mesolve(
-        H,
+        L,
         ψ0,
         [ftype(0), ftype(tmax)],
-        c_ops,
         progress_bar = Val(false),
         save_everystep = false,
         saveat = ftype[],
@@ -247,12 +223,17 @@ function steadystate(
     return ρss
 end
 
-function _steadystate_ode_condition(integrator, abstol, reltol, min_t)
+struct SteadyStateODECondition{CT<:AbstractArray}
+    cache::CT
+end
+
+function (f::SteadyStateODECondition)(integrator, abstol, reltol, min_t)
     # this condition is same as DiffEqBase.NormTerminationMode
 
-    du_dt = (integrator.u - integrator.uprev) / integrator.dt
-    norm_du_dt = norm(du_dt)
-    if (norm_du_dt <= reltol * norm(du_dt + integrator.u)) || (norm_du_dt <= abstol)
+    f.cache .= (integrator.u .- integrator.uprev) ./ integrator.dt
+    norm_du_dt = norm(f.cache)
+    f.cache .+= integrator.u
+    if norm_du_dt <= reltol * norm(f.cache) || norm_du_dt <= abstol
         return true
     else
         return false
diff --git a/src/utilities.jl b/src/utilities.jl
index 47b73f525..0a660995f 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -130,8 +130,10 @@ end
 
 get_typename_wrapper(A) = Base.typename(typeof(A)).wrapper
 
-_get_dense_similar(A::AbstractArray, args...) = similar(A, args...)
-_get_dense_similar(A::AbstractSparseMatrix, args...) = similar(nonzeros(A), args...)
+_dense_similar(A::AbstractArray, args...) = similar(A, args...)
+_dense_similar(A::AbstractSparseMatrix, args...) = similar(nonzeros(A), args...)
+
+_sparse_similar(A::AbstractArray, args...) = sparse(args...)
 
 _Ginibre_ensemble(n::Int, rank::Int = n) = randn(ComplexF64, n, rank) / sqrt(n)
 
diff --git a/test/core-test/steady_state.jl b/test/core-test/steady_state.jl
index de33b2ad1..795ef7d5a 100644
--- a/test/core-test/steady_state.jl
+++ b/test/core-test/steady_state.jl
@@ -11,9 +11,8 @@
     rho_me = sol_me.states[end]
 
     solver = SteadyStateODESolver()
-    ρ_ss = steadystate(H, psi0, t_l[end], c_ops, solver = solver)
+    ρ_ss = steadystate(H, c_ops, solver = solver)
     @test tracedist(rho_me, ρ_ss) < 1e-4
-    @test_throws ArgumentError steadystate(H, c_ops, solver = solver)
 
     solver = SteadyStateDirectSolver()
     ρ_ss = steadystate(H, c_ops, solver = solver)
@@ -34,9 +33,9 @@
     @testset "Type Inference (steadystate)" begin
         L = liouvillian(H, c_ops)
 
-        solver = SteadyStateODESolver()
-        @inferred steadystate(H, psi0, t_l[end], c_ops, solver = solver)
-        @inferred steadystate(L, psi0, t_l[end], solver = solver)
+        solver = SteadyStateODESolver(tmax = t_l[end])
+        @inferred steadystate(H, c_ops, solver = solver)
+        @inferred steadystate(L, solver = solver)
 
         solver = SteadyStateDirectSolver()
         @inferred steadystate(H, c_ops, solver = solver)
diff --git a/test/ext-test/gpu/cuda_ext.jl b/test/ext-test/gpu/cuda_ext.jl
index 42cd278cb..78fe6a417 100644
--- a/test/ext-test/gpu/cuda_ext.jl
+++ b/test/ext-test/gpu/cuda_ext.jl
@@ -125,6 +125,31 @@
     @test all([isapprox(sol_cpu.expect[i], sol_gpu32.expect[i]; atol = 1e-6) for i in 1:length(tlist)])
 end
 
+@testset "CUDA steadystate" begin
+    N = 50
+    Δ = 0.01
+    F = 0.1
+    γ = 0.1
+    nth = 2
+
+    a = destroy(N)
+    H = Δ * a' * a + F * (a + a')
+    c_ops = [sqrt(γ * (nth + 1)) * a, sqrt(γ * nth) * a']
+
+    ρ_ss_cpu = steadystate(H, c_ops)
+
+    H_gpu_csc = cu(H)
+    c_ops_gpu_csc = [cu(c_op) for c_op in c_ops]
+    ρ_ss_gpu_csc = steadystate(H_gpu_csc, c_ops_gpu_csc, solver = SteadyStateLinearSolver())
+
+    H_gpu_csr = CuSparseMatrixCSR(H_gpu_csc)
+    c_ops_gpu_csr = [CuSparseMatrixCSR(c_op) for c_op in c_ops_gpu_csc]
+    ρ_ss_gpu_csr = steadystate(H_gpu_csr, c_ops_gpu_csr, solver = SteadyStateLinearSolver())
+
+    @test ρ_ss_cpu.data ≈ Array(ρ_ss_gpu_csc.data) atol = 1e-8 * length(ρ_ss_cpu)
+    @test ρ_ss_cpu.data ≈ Array(ρ_ss_gpu_csr.data) atol = 1e-8 * length(ρ_ss_cpu)
+end
+
 @testset "CUDA ptrace" begin
     g = fock(2, 1)
     e = fock(2, 0)