diff --git a/benchmark/timings-parallel.py b/benchmark/timings-parallel.py
index 018347612..0beef2a70 100644
--- a/benchmark/timings-parallel.py
+++ b/benchmark/timings-parallel.py
@@ -2,6 +2,17 @@
 import numpy as np
 import scipy.sparse as spa
 from time import perf_counter_ns
+from concurrent.futures import ThreadPoolExecutor
+
+"""
+There are two interfaces to solve a QP problem with the dense backend. a) create a qp object by passing the problem data (matrices, vectors) to the qp.init method (this does memory allocation and the preconditioning) and then calling qp.solve or b) use the solve function directly taking the problem data as input (this does everything in one go).
+
+Currently, only the qp.solve method (a) is parallelized (using openmp). Therefore the memory alloc + preconditioning is done in serial when building a batch of qps that is then passed to the `solve_in_parallel` function. The solve function (b) is not parallelized but can easily be parallelized in Python using ThreadPoolExecutor.
+
+Here we do some timings to compare the two approaches. We generate a batch of QP problems and solve them in parallel using the `solve_in_parallel` function and compare the timings (need to add the timings for building the batch of qps + the parallel solving) with solving each problem in parallel using ThreadPoolExecutor for the solve function.
+"""
+
+num_threads = proxsuite.proxqp.omp_get_max_threads()
 
 
 def generate_mixed_qp(n, n_eq, n_in, seed=1):
@@ -23,45 +34,109 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
     u = A @ v
     l = -1.0e20 * np.ones(m)
 
-    return P.toarray(), q, A[:n_eq, :], u[:n_eq], A[n_in:, :], u[n_in:], l[n_in:]
+    return P.toarray(), q, A[:n_eq, :], u[:n_eq], A[n_in:, :], l[n_in:], u[n_in:]
 
 
-n = 500
-n_eq = 200
-n_in = 200
+problem_specs = [
+    # (n, n_eq, n_in),
+    (50, 20, 20),
+    (100, 40, 40),
+    (200, 80, 80),
+    (500, 200, 200),
+    (1000, 200, 200),
+]
 
 num_qps = 128
 
-# qps = []
-timings = {}
-qps = proxsuite.proxqp.dense.VectorQP()
-
-tic = perf_counter_ns()
-for j in range(num_qps):
-    qp = proxsuite.proxqp.dense.QP(n, n_eq, n_in)
-    H, g, A, b, C, u, l = generate_mixed_qp(n, n_eq, n_in, seed=j)
-    qp.init(H, g, A, b, C, l, u)
-    qp.settings.eps_abs = 1e-9
-    qp.settings.verbose = False
-    qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
-    qps.append(qp)
-timings["problem_data"] = (perf_counter_ns() - tic) * 1e-6
-
-tic = perf_counter_ns()
-for qp in qps:
-    qp.solve()
-timings["solve_serial"] = (perf_counter_ns() - tic) * 1e-6
+for n, n_eq, n_in in problem_specs:
 
-num_threads = proxsuite.proxqp.omp_get_max_threads()
-for j in range(1, num_threads):
+    print(f"\nProblem specs: {n=} {n_eq=} {n_in=}. Generating {num_qps} such problems.")
+    problems = [generate_mixed_qp(n, n_eq, n_in, seed=j) for j in range(num_qps)]
+    print(
+        f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads."
+    )
+
+    timings = {}
+
+    # create a vector of QP objects. This is not efficient because memory is allocated when creating the qp object + when it is appended to the vector which creates a copy of the object.
+    qps_vector = proxsuite.proxqp.dense.VectorQP()
     tic = perf_counter_ns()
-    proxsuite.proxqp.dense.solve_in_parallel(j, qps)
-    timings[f"solve_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
+    print("\nSetting up vector of qps")
+    for H, g, A, b, C, l, u in problems:
+        qp = proxsuite.proxqp.dense.QP(n, n_eq, n_in)
+        qp.init(H, g, A, b, C, l, u)
+        qp.settings.eps_abs = 1e-9
+        qp.settings.verbose = False
+        qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
+        qps_vector.append(qp)
+    timings["setup_vector_of_qps"] = (perf_counter_ns() - tic) * 1e-6
 
+    # use BatchQP, which can initialize the qp objects in place and is more efficient
+    qps_batch = proxsuite.proxqp.dense.BatchQP()
+    tic = perf_counter_ns()
+    print("Setting up batch of qps")
+    for H, g, A, b, C, l, u in problems:
+        qp = qps_batch.init_qp_in_place(n, n_eq, n_in)
+        qp.init(H, g, A, b, C, l, u)
+        qp.settings.eps_abs = 1e-9
+        qp.settings.verbose = False
+        qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
+    timings["setup_batch_of_qps"] = (perf_counter_ns() - tic) * 1e-6
 
-tic = perf_counter_ns()
-proxsuite.proxqp.dense.solve_in_parallel(qps=qps)
-timings[f"solve_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
+    print("Solving batch of qps using solve_in_parallel with default thread config")
+    tic = perf_counter_ns()
+    proxsuite.proxqp.dense.solve_in_parallel(qps=qps_batch)
+    timings[f"solve_in_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
+
+    print("Solving vector of qps serially")
+    tic = perf_counter_ns()
+    for qp in qps_vector:
+        qp.solve()
+    timings["qp_solve_serial"] = (perf_counter_ns() - tic) * 1e-6
+
+    print("Solving batch of qps using solve_in_parallel with various thread configs")
+    for j in range(1, num_threads, 2):
+        tic = perf_counter_ns()
+        proxsuite.proxqp.dense.solve_in_parallel(qps=qps_batch, num_threads=j)
+        timings[f"solve_in_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
+
+    def solve_problem_with_dense_backend(
+        problem,
+    ):
+        H, g, A, b, C, l, u = problem
+        return proxsuite.proxqp.dense.solve_no_gil(
+            H,
+            g,
+            A,
+            b,
+            C,
+            l,
+            u,
+            initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS,
+            eps_abs=1e-9,
+        )
+
+    # add final timings for the solve_in_parallel function considering setup time for batch of qps
+    for k, v in list(timings.items()):
+        if "solve_in_parallel" in k:
+            k_init = k + "_and_setup_batch_of_qps"
+            timings[k_init] = timings["setup_batch_of_qps"] + v
+
+    print("Solving each problem serially with solve function.")
+    # Note: here we just pass the problem data to the solve function. This does not require running the init method separately.
+    tic = perf_counter_ns()
+    for problem in problems:
+        solve_problem_with_dense_backend(problem)
+    timings["solve_fun_serial"] = (perf_counter_ns() - tic) * 1e-6
+
+    print(
+        "Solving each problem in parallel (with a ThreadPoolExecutor) with solve function."
+    )
+    tic = perf_counter_ns()
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        results = list(executor.map(solve_problem_with_dense_backend, problems))
+    timings["solve_fun_parallel"] = (perf_counter_ns() - tic) * 1e-6
 
-for k, v in timings.items():
-    print(f"{k}: {v}ms")
+    print("\nTimings:")
+    for k, v in timings.items():
+        print(f"{k}: {v:.3f}ms")
diff --git a/bindings/python/src/expose-solve.hpp b/bindings/python/src/expose-solve.hpp
index 25befc0bd..89004cc8b 100644
--- a/bindings/python/src/expose-solve.hpp
+++ b/bindings/python/src/expose-solve.hpp
@@ -46,7 +46,7 @@ solveDenseQp(nanobind::module_ m)
                             optional<T>,
                             bool,
                             optional<T>>(&dense::solve<T>),
-    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "Function for solving a QP problem using PROXQP dense backend directly "
     "without defining a QP object. It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "
@@ -107,7 +107,7 @@ solveDenseQp(nanobind::module_ m)
                             optional<T>,
                             bool,
                             optional<T>>(&dense::solve<T>),
-    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "Function for solving a QP problem using PROXQP dense backend directly "
     "without defining a QP object. It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "
@@ -140,6 +140,133 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
     nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+
+  m.def("solve_no_gil",
+        nanobind::overload_cast<optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<bool>,
+                                bool,
+                                bool,
+                                optional<isize>,
+                                proxsuite::proxqp::InitialGuessStatus,
+                                bool,
+                                optional<T>,
+                                optional<T>,
+                                bool,
+                                optional<T>>(&dense::solve<T>),
+        "Function for solving a QP problem using PROXQP dense backend directly "
+        "without defining a QP object and while releasing the Global "
+        "Interpreter Lock (GIL). "
+        "It is possible to set up some of the solver "
+        "parameters (warm start, initial guess option, proximal step sizes, "
+        "absolute and relative accuracies, maximum number of iterations, "
+        "preconditioner execution).",
+        nanobind::arg("H"),
+        nanobind::arg("g"),
+        nanobind::arg("A").none(),
+        nanobind::arg("b").none(),
+        nanobind::arg("C").none(),
+        nanobind::arg("l").none(),
+        nanobind::arg("u").none(),
+        nanobind::arg("x") = nanobind::none(),
+        nanobind::arg("y") = nanobind::none(),
+        nanobind::arg("z") = nanobind::none(),
+        nanobind::arg("eps_abs") = nanobind::none(),
+        nanobind::arg("eps_rel") = nanobind::none(),
+        nanobind::arg("rho") = nanobind::none(),
+        nanobind::arg("mu_eq") = nanobind::none(),
+        nanobind::arg("mu_in") = nanobind::none(),
+        nanobind::arg("verbose") = nanobind::none(),
+        nanobind::arg("compute_preconditioner") = true,
+        nanobind::arg("compute_timings") = false,
+        nanobind::arg("max_iter") = nanobind::none(),
+        nanobind::arg("initial_guess") =
+          InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+        nanobind::arg("check_duality_gap") = false,
+        nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+        nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+        nanobind::arg("primal_infeasibility_solving") = false,
+        nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+        nanobind::call_guard<nanobind::gil_scoped_release>());
+
+  m.def(
+    "solve_no_gil",
+    nanobind::overload_cast<optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<bool>,
+                            bool,
+                            bool,
+                            optional<isize>,
+                            proxsuite::proxqp::InitialGuessStatus,
+                            bool,
+                            optional<T>,
+                            optional<T>,
+                            bool,
+                            optional<T>>(&dense::solve<T>),
+    "Function for solving a QP problem using PROXQP dense backend directly "
+    "without defining a QP object and while releasing the Global Interpreter "
+    "Lock (GIL). "
+    "It is possible to set up some of the solver "
+    "parameters (warm start, initial guess option, proximal step sizes, "
+    "absolute and relative accuracies, maximum number of iterations, "
+    "preconditioner execution).",
+    nanobind::arg("H"),
+    nanobind::arg("g"),
+    nanobind::arg("A") = nanobind::none(),
+    nanobind::arg("b") = nanobind::none(),
+    nanobind::arg("C") = nanobind::none(),
+    nanobind::arg("l") = nanobind::none(),
+    nanobind::arg("u") = nanobind::none(),
+    nanobind::arg("l_box") = nanobind::none(),
+    nanobind::arg("u_box") = nanobind::none(),
+    nanobind::arg("x") = nanobind::none(),
+    nanobind::arg("y") = nanobind::none(),
+    nanobind::arg("z") = nanobind::none(),
+    nanobind::arg("eps_abs") = nanobind::none(),
+    nanobind::arg("eps_rel") = nanobind::none(),
+    nanobind::arg("rho") = nanobind::none(),
+    nanobind::arg("mu_eq") = nanobind::none(),
+    nanobind::arg("mu_in") = nanobind::none(),
+    nanobind::arg("verbose") = nanobind::none(),
+    nanobind::arg("compute_preconditioner") = true,
+    nanobind::arg("compute_timings") = false,
+    nanobind::arg("max_iter") = nanobind::none(),
+    nanobind::arg("initial_guess") =
+      proxsuite::proxqp::InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+    nanobind::arg("check_duality_gap") = false,
+    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+    nanobind::arg("primal_infeasibility_solving") = false,
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
 }
 
 } // namespace python
@@ -187,6 +314,45 @@ solveSparseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
     nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+
+  m.def(
+    "solve_no_gil",
+    &sparse::solve<T, I>,
+    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "without defining a QP object and while releasing the Global Interpreter "
+    "Lock (GIL). "
+    "It is possible to set up some of the solver "
+    "parameters (warm start, initial guess option, proximal step sizes, "
+    "absolute and relative accuracies, maximum number of iterations, "
+    "preconditioner execution).",
+    nanobind::arg("H") = nanobind::none(),
+    nanobind::arg("g") = nanobind::none(),
+    nanobind::arg("A") = nanobind::none(),
+    nanobind::arg("b") = nanobind::none(),
+    nanobind::arg("C") = nanobind::none(),
+    nanobind::arg("l") = nanobind::none(),
+    nanobind::arg("u") = nanobind::none(),
+    nanobind::arg("x") = nanobind::none(),
+    nanobind::arg("y") = nanobind::none(),
+    nanobind::arg("z") = nanobind::none(),
+    nanobind::arg("eps_abs") = nanobind::none(),
+    nanobind::arg("eps_rel") = nanobind::none(),
+    nanobind::arg("rho") = nanobind::none(),
+    nanobind::arg("mu_eq") = nanobind::none(),
+    nanobind::arg("mu_in") = nanobind::none(),
+    nanobind::arg("verbose") = nanobind::none(),
+    nanobind::arg("compute_preconditioner") = true,
+    nanobind::arg("compute_timings") = false,
+    nanobind::arg("max_iter") = nanobind::none(),
+    nanobind::arg("initial_guess") =
+      InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+    nanobind::arg("sparse_backend") = SparseBackend::Automatic,
+    nanobind::arg("check_duality_gap") = false,
+    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+    nanobind::arg("primal_infeasibility_solving") = false,
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
 }
 
 } // namespace python