From cabbf35cfe9ad637d974415c15e3b16ea8b5570d Mon Sep 17 00:00:00 2001
From: J Berg <j.berg2349@gmail.com>
Date: Sat, 4 Jan 2025 18:28:45 +0000
Subject: [PATCH 1/7] disable GIL

---
 benchmark/timings-parallel.py        | 43 ++++++++++++++++++++++------
 bindings/python/src/expose-solve.hpp |  9 ++++--
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/benchmark/timings-parallel.py b/benchmark/timings-parallel.py
index 018347612..66a4cd4f7 100644
--- a/benchmark/timings-parallel.py
+++ b/benchmark/timings-parallel.py
@@ -2,6 +2,10 @@
 import numpy as np
 import scipy.sparse as spa
 from time import perf_counter_ns
+from concurrent.futures import ThreadPoolExecutor
+
+
+num_threads = proxsuite.proxqp.omp_get_max_threads()
 
 
 def generate_mixed_qp(n, n_eq, n_in, seed=1):
@@ -23,7 +27,7 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
     u = A @ v
     l = -1.0e20 * np.ones(m)
 
-    return P.toarray(), q, A[:n_eq, :], u[:n_eq], A[n_in:, :], u[n_in:], l[n_in:]
+    return P.toarray(), q, A[:n_eq, :], u[:n_eq], A[n_in:, :], l[n_in:], u[n_in:]
 
 
 n = 500
@@ -32,36 +36,57 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
 
 num_qps = 128
 
+print(f"Problem specs: {n=} {n_eq=} {n_in=}. Generating {num_qps} such problems.")
+problems = [generate_mixed_qp(n, n_eq, n_in, seed=j) for j in range(num_qps)]
+print(f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads.")
+
 # qps = []
 timings = {}
 qps = proxsuite.proxqp.dense.VectorQP()
 
 tic = perf_counter_ns()
-for j in range(num_qps):
+print("Setting up problem vector")
+for H, g, A, b, C, l, u in problems:
     qp = proxsuite.proxqp.dense.QP(n, n_eq, n_in)
-    H, g, A, b, C, u, l = generate_mixed_qp(n, n_eq, n_in, seed=j)
     qp.init(H, g, A, b, C, l, u)
     qp.settings.eps_abs = 1e-9
     qp.settings.verbose = False
     qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
     qps.append(qp)
-timings["problem_data"] = (perf_counter_ns() - tic) * 1e-6
+timings["setup_vector_of_qps"] = (perf_counter_ns() - tic) * 1e-6
 
+print("Solving problem vector in parallel with default thread config")
+tic = perf_counter_ns()
+proxsuite.proxqp.dense.solve_in_parallel(qps=qps)
+timings[f"solve_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
+
+print("Solving problem vector serially")
 tic = perf_counter_ns()
 for qp in qps:
     qp.solve()
 timings["solve_serial"] = (perf_counter_ns() - tic) * 1e-6
 
-num_threads = proxsuite.proxqp.omp_get_max_threads()
+print("Solving problem vector in parallel with various thread configs")
 for j in range(1, num_threads):
     tic = perf_counter_ns()
-    proxsuite.proxqp.dense.solve_in_parallel(j, qps)
+    proxsuite.proxqp.dense.solve_in_parallel(qps=qps, num_threads=j)
     timings[f"solve_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
 
+print("Solving each problem serially with dense backend.")
+tic = perf_counter_ns()
+for H, g, A, b, C, l, u in problems:
+    proxsuite.proxqp.dense.solve(H, g, A, b, C, l, u, initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS, eps_abs=1e-9)
+timings["solve_serial_dense"] = (perf_counter_ns() - tic) * 1e-6
+
+print("Solving each problem in parallel (with a ThreadPoolExecutor) with dense backend.")
+def solve_problem(problem):  # just a little helper function to keep things clean
+    H, g, A, b, C, l, u = problem
+    return proxsuite.proxqp.dense.solve(H, g, A, b, C, l, u, initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS, eps_abs=1e-9)
 
 tic = perf_counter_ns()
-proxsuite.proxqp.dense.solve_in_parallel(qps=qps)
-timings[f"solve_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
+with ThreadPoolExecutor(max_workers=num_threads) as executor:
+    results = list(executor.map(solve_problem, problems))
+timings["solve_parallel_dense"] = (perf_counter_ns() - tic) * 1e-6
 
 for k, v in timings.items():
-    print(f"{k}: {v}ms")
+    print(f"{k}: {v:.3f}ms")
diff --git a/bindings/python/src/expose-solve.hpp b/bindings/python/src/expose-solve.hpp
index 25befc0bd..f514902e0 100644
--- a/bindings/python/src/expose-solve.hpp
+++ b/bindings/python/src/expose-solve.hpp
@@ -76,7 +76,8 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
-    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
 
   m.def(
     "solve",
@@ -139,7 +140,8 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
-    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
 }
 
 } // namespace python
@@ -186,7 +188,8 @@ solveSparseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
-    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
 }
 
 } // namespace python

From 03083884554e08268b7d709f45b9dd3bba4a1e39 Mon Sep 17 00:00:00 2001
From: J Berg <j.berg2349@gmail.com>
Date: Sun, 5 Jan 2025 00:18:19 +0000
Subject: [PATCH 2/7] format

---
 benchmark/timings-parallel.py | 36 ++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/benchmark/timings-parallel.py b/benchmark/timings-parallel.py
index 66a4cd4f7..c01164506 100644
--- a/benchmark/timings-parallel.py
+++ b/benchmark/timings-parallel.py
@@ -38,7 +38,9 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
 
 print(f"Problem specs: {n=} {n_eq=} {n_in=}. Generating {num_qps} such problems.")
 problems = [generate_mixed_qp(n, n_eq, n_in, seed=j) for j in range(num_qps)]
-print(f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads.")
+print(
+    f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads."
+)
 
 # qps = []
 timings = {}
@@ -72,20 +74,36 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
     proxsuite.proxqp.dense.solve_in_parallel(qps=qps, num_threads=j)
     timings[f"solve_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
 
+
+def solve_problem_with_dense_backend(
+    problem,
+):  # just a little helper function to keep things clean
+    H, g, A, b, C, l, u = problem
+    return proxsuite.proxqp.dense.solve(
+        H,
+        g,
+        A,
+        b,
+        C,
+        l,
+        u,
+        initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS,
+        eps_abs=1e-9,
+    )
+
+
 print("Solving each problem serially with dense backend.")
 tic = perf_counter_ns()
-for H, g, A, b, C, l, u in problems:
-    proxsuite.proxqp.dense.solve(H, g, A, b, C, l, u, initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS, eps_abs=1e-9)
+for problem in problems:
+    solve_problem_with_dense_backend(problem)
 timings["solve_serial_dense"] = (perf_counter_ns() - tic) * 1e-6
 
-print("Solving each problem in parallel (with a ThreadPoolExecutor) with dense backend.")
-def solve_problem(problem):  # just a little helper function to keep things clean
-    H, g, A, b, C, l, u = problem
-    return proxsuite.proxqp.dense.solve(H, g, A, b, C, l, u, initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS, eps_abs=1e-9)
-
+print(
+    "Solving each problem in parallel (with a ThreadPoolExecutor) with dense backend."
+)
 tic = perf_counter_ns()
 with ThreadPoolExecutor(max_workers=num_threads) as executor:
-    results = list(executor.map(solve_problem, problems))
+    results = list(executor.map(solve_problem_with_dense_backend, problems))
 timings["solve_parallel_dense"] = (perf_counter_ns() - tic) * 1e-6
 
 for k, v in timings.items():

From 530d61b7b9ddba123c72278f58f7d934a9c09fbf Mon Sep 17 00:00:00 2001
From: Fabian Schramm <55981657+fabinsch@users.noreply.github.com>
Date: Tue, 7 Jan 2025 22:46:39 +0100
Subject: [PATCH 3/7] benchmark parallel: add comments and several prob spec

---
 benchmark/timings-parallel.py | 172 ++++++++++++++++++++--------------
 1 file changed, 102 insertions(+), 70 deletions(-)

diff --git a/benchmark/timings-parallel.py b/benchmark/timings-parallel.py
index c01164506..c7bc0d695 100644
--- a/benchmark/timings-parallel.py
+++ b/benchmark/timings-parallel.py
@@ -4,6 +4,13 @@
 from time import perf_counter_ns
 from concurrent.futures import ThreadPoolExecutor
 
+"""
+There are two interfaces for solving a QP problem with the dense backend: (a) create a qp object, pass the problem data (matrices, vectors) to the qp.init method (which performs the memory allocation and the preconditioning), and then call qp.solve; or (b) call the solve function directly with the problem data as input (which does everything in one go).
+
+Currently, only the qp.solve method (a) is parallelized (using OpenMP). Therefore, the memory allocation and preconditioning are done serially when building the batch of qps that is then passed to the `solve_in_parallel` function. The solve function (b) is not parallelized itself, but it can easily be parallelized in Python using a ThreadPoolExecutor.
+
+Here we time both approaches. We generate a batch of QP problems and solve them in parallel using the `solve_in_parallel` function; the timing for this approach must include the time for building the batch of qps in addition to the parallel solve. We then compare it with solving each problem in parallel by calling the solve function from a ThreadPoolExecutor.
+"""
 
 num_threads = proxsuite.proxqp.omp_get_max_threads()
 
@@ -30,81 +37,106 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
     return P.toarray(), q, A[:n_eq, :], u[:n_eq], A[n_in:, :], l[n_in:], u[n_in:]
 
 
-n = 500
-n_eq = 200
-n_in = 200
+problem_specs = [
+    # (n, n_eq, n_in),
+    (50, 20, 20),
+    (100, 40, 40),
+    (200, 80, 80),
+    (500, 200, 200),
+    (1000, 200, 200),
+]
 
 num_qps = 128
 
-print(f"Problem specs: {n=} {n_eq=} {n_in=}. Generating {num_qps} such problems.")
-problems = [generate_mixed_qp(n, n_eq, n_in, seed=j) for j in range(num_qps)]
-print(
-    f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads."
-)
-
-# qps = []
-timings = {}
-qps = proxsuite.proxqp.dense.VectorQP()
-
-tic = perf_counter_ns()
-print("Setting up problem vector")
-for H, g, A, b, C, l, u in problems:
-    qp = proxsuite.proxqp.dense.QP(n, n_eq, n_in)
-    qp.init(H, g, A, b, C, l, u)
-    qp.settings.eps_abs = 1e-9
-    qp.settings.verbose = False
-    qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
-    qps.append(qp)
-timings["setup_vector_of_qps"] = (perf_counter_ns() - tic) * 1e-6
-
-print("Solving problem vector in parallel with default thread config")
-tic = perf_counter_ns()
-proxsuite.proxqp.dense.solve_in_parallel(qps=qps)
-timings[f"solve_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
-
-print("Solving problem vector serially")
-tic = perf_counter_ns()
-for qp in qps:
-    qp.solve()
-timings["solve_serial"] = (perf_counter_ns() - tic) * 1e-6
-
-print("Solving problem vector in parallel with various thread configs")
-for j in range(1, num_threads):
-    tic = perf_counter_ns()
-    proxsuite.proxqp.dense.solve_in_parallel(qps=qps, num_threads=j)
-    timings[f"solve_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
-
-
-def solve_problem_with_dense_backend(
-    problem,
-):  # just a little helper function to keep things clean
-    H, g, A, b, C, l, u = problem
-    return proxsuite.proxqp.dense.solve(
-        H,
-        g,
-        A,
-        b,
-        C,
-        l,
-        u,
-        initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS,
-        eps_abs=1e-9,
+for n, n_eq, n_in in problem_specs:
+
+    print(f"\nProblem specs: {n=} {n_eq=} {n_in=}. Generating {num_qps} such problems.")
+    problems = [generate_mixed_qp(n, n_eq, n_in, seed=j) for j in range(num_qps)]
+    print(
+        f"Generated problems. Solving {num_qps} problems with proxsuite.proxqp.omp_get_max_threads()={num_threads} threads."
     )
 
+    timings = {}
+
+    # Create a vector of QP objects. This is inefficient because memory is allocated both when the qp object is created and again when it is appended to the vector, which creates a copy of the object.
+    qps_vector = proxsuite.proxqp.dense.VectorQP()
+    tic = perf_counter_ns()
+    print("\nSetting up vector of qps")
+    for H, g, A, b, C, l, u in problems:
+        qp = proxsuite.proxqp.dense.QP(n, n_eq, n_in)
+        qp.init(H, g, A, b, C, l, u)
+        qp.settings.eps_abs = 1e-9
+        qp.settings.verbose = False
+        qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
+        qps_vector.append(qp)
+    timings["setup_vector_of_qps"] = (perf_counter_ns() - tic) * 1e-6
+
+    # use BatchQP, which can initialize the qp objects in place and is more efficient
+    qps_batch = proxsuite.proxqp.dense.BatchQP()
+    tic = perf_counter_ns()
+    print("Setting up batch of qps")
+    for H, g, A, b, C, l, u in problems:
+        qp = qps_batch.init_qp_in_place(n, n_eq, n_in)
+        qp.init(H, g, A, b, C, l, u)
+        qp.settings.eps_abs = 1e-9
+        qp.settings.verbose = False
+        qp.settings.initial_guess = proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS
+    timings["setup_batch_of_qps"] = (perf_counter_ns() - tic) * 1e-6
+
+    print("Solving batch of qps using solve_in_parallel with default thread config")
+    tic = perf_counter_ns()
+    proxsuite.proxqp.dense.solve_in_parallel(qps=qps_batch)
+    timings[f"solve_in_parallel_heuristics_threads"] = (perf_counter_ns() - tic) * 1e-6
 
-print("Solving each problem serially with dense backend.")
-tic = perf_counter_ns()
-for problem in problems:
-    solve_problem_with_dense_backend(problem)
-timings["solve_serial_dense"] = (perf_counter_ns() - tic) * 1e-6
+    print("Solving vector of qps serially")
+    tic = perf_counter_ns()
+    for qp in qps_vector:
+        qp.solve()
+    timings["qp_solve_serial"] = (perf_counter_ns() - tic) * 1e-6
+
+    print("Solving batch of qps using solve_in_parallel with various thread configs")
+    for j in range(1, num_threads, 2):
+        tic = perf_counter_ns()
+        proxsuite.proxqp.dense.solve_in_parallel(qps=qps_batch, num_threads=j)
+        timings[f"solve_in_parallel_{j}_threads"] = (perf_counter_ns() - tic) * 1e-6
+
+    def solve_problem_with_dense_backend(
+        problem,
+    ):  # just a little helper function to keep things clean
+        H, g, A, b, C, l, u = problem
+        return proxsuite.proxqp.dense.solve(
+            H,
+            g,
+            A,
+            b,
+            C,
+            l,
+            u,
+            initial_guess=proxsuite.proxqp.InitialGuess.NO_INITIAL_GUESS,
+            eps_abs=1e-9,
+        )
+
+    # For every solve_in_parallel timing, also record the total time including the setup time for the batch of qps.
+    for k, v in list(timings.items()):
+        if "solve_in_parallel" in k:
+            k_init = k + "_and_setup_batch_of_qps"
+            timings[k_init] = timings["setup_batch_of_qps"] + v
+
+    print("Solving each problem serially with solve function.")
+    # Note: here we just pass the problem data to the solve function. This does not require running the init method separately.
+    tic = perf_counter_ns()
+    for problem in problems:
+        solve_problem_with_dense_backend(problem)
+    timings["solve_fun_serial"] = (perf_counter_ns() - tic) * 1e-6
 
-print(
-    "Solving each problem in parallel (with a ThreadPoolExecutor) with dense backend."
-)
-tic = perf_counter_ns()
-with ThreadPoolExecutor(max_workers=num_threads) as executor:
-    results = list(executor.map(solve_problem_with_dense_backend, problems))
-timings["solve_parallel_dense"] = (perf_counter_ns() - tic) * 1e-6
+    print(
+        "Solving each problem in parallel (with a ThreadPoolExecutor) with solve function."
+    )
+    tic = perf_counter_ns()
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        results = list(executor.map(solve_problem_with_dense_backend, problems))
+    timings["solve_fun_parallel"] = (perf_counter_ns() - tic) * 1e-6
 
-for k, v in timings.items():
-    print(f"{k}: {v:.3f}ms")
+    print("\nTimings:")
+    for k, v in timings.items():
+        print(f"{k}: {v:.3f}ms")

From d58746b2dd0d77ac1190e5ac9176ac8867e6d4a8 Mon Sep 17 00:00:00 2001
From: J Berg <j.berg2349@gmail.com>
Date: Thu, 9 Jan 2025 22:52:32 +0000
Subject: [PATCH 4/7] separate _no_gil interface

---
 benchmark/timings-parallel.py        |   4 +-
 bindings/python/src/expose-solve.hpp | 169 ++++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 6 deletions(-)

diff --git a/benchmark/timings-parallel.py b/benchmark/timings-parallel.py
index c7bc0d695..0beef2a70 100644
--- a/benchmark/timings-parallel.py
+++ b/benchmark/timings-parallel.py
@@ -102,9 +102,9 @@ def generate_mixed_qp(n, n_eq, n_in, seed=1):
 
     def solve_problem_with_dense_backend(
         problem,
-    ):  # just a little helper function to keep things clean
+    ):
         H, g, A, b, C, l, u = problem
-        return proxsuite.proxqp.dense.solve(
+        return proxsuite.proxqp.dense.solve_no_gil(
             H,
             g,
             A,
diff --git a/bindings/python/src/expose-solve.hpp b/bindings/python/src/expose-solve.hpp
index f514902e0..ccc33c072 100644
--- a/bindings/python/src/expose-solve.hpp
+++ b/bindings/python/src/expose-solve.hpp
@@ -46,7 +46,7 @@ solveDenseQp(nanobind::module_ m)
                             optional<T>,
                             bool,
                             optional<T>>(&dense::solve<T>),
-    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "Function for solving a QP problem using PROXQP dense backend directly "
     "without defining a QP object. It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "
@@ -76,8 +76,7 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
-    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
-    nanobind::call_guard<nanobind::gil_scoped_release>());
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
 
   m.def(
     "solve",
@@ -108,7 +107,7 @@ solveDenseQp(nanobind::module_ m)
                             optional<T>,
                             bool,
                             optional<T>>(&dense::solve<T>),
-    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "Function for solving a QP problem using PROXQP dense backend directly "
     "without defining a QP object. It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "
@@ -140,6 +139,131 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+  
+  m.def(
+    "solve_no_gil",
+    nanobind::overload_cast<optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<bool>,
+                            bool,
+                            bool,
+                            optional<isize>,
+                            proxsuite::proxqp::InitialGuessStatus,
+                            bool,
+                            optional<T>,
+                            optional<T>,
+                            bool,
+                            optional<T>>(&dense::solve<T>),
+    "Function for solving a QP problem using PROXQP dense backend directly "
+    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
+    "It is possible to set up some of the solver "
+    "parameters (warm start, initial guess option, proximal step sizes, "
+    "absolute and relative accuracies, maximum number of iterations, "
+    "preconditioner execution).",
+    nanobind::arg("H"),
+    nanobind::arg("g"),
+    nanobind::arg("A").none(),
+    nanobind::arg("b").none(),
+    nanobind::arg("C").none(),
+    nanobind::arg("l").none(),
+    nanobind::arg("u").none(),
+    nanobind::arg("x") = nanobind::none(),
+    nanobind::arg("y") = nanobind::none(),
+    nanobind::arg("z") = nanobind::none(),
+    nanobind::arg("eps_abs") = nanobind::none(),
+    nanobind::arg("eps_rel") = nanobind::none(),
+    nanobind::arg("rho") = nanobind::none(),
+    nanobind::arg("mu_eq") = nanobind::none(),
+    nanobind::arg("mu_in") = nanobind::none(),
+    nanobind::arg("verbose") = nanobind::none(),
+    nanobind::arg("compute_preconditioner") = true,
+    nanobind::arg("compute_timings") = false,
+    nanobind::arg("max_iter") = nanobind::none(),
+    nanobind::arg("initial_guess") =
+      InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+    nanobind::arg("check_duality_gap") = false,
+    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+    nanobind::arg("primal_infeasibility_solving") = false,
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+    nanobind::call_guard<nanobind::gil_scoped_release>());
+
+  m.def(
+    "solve_no_gil",
+    nanobind::overload_cast<optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::MatRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<dense::VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<VecRef<T>>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<T>,
+                            optional<bool>,
+                            bool,
+                            bool,
+                            optional<isize>,
+                            proxsuite::proxqp::InitialGuessStatus,
+                            bool,
+                            optional<T>,
+                            optional<T>,
+                            bool,
+                            optional<T>>(&dense::solve<T>),
+    "Function for solving a QP problem using PROXQP dense backend directly "
+    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
+    "It is possible to set up some of the solver "
+    "parameters (warm start, initial guess option, proximal step sizes, "
+    "absolute and relative accuracies, maximum number of iterations, "
+    "preconditioner execution).",
+    nanobind::arg("H"),
+    nanobind::arg("g"),
+    nanobind::arg("A") = nanobind::none(),
+    nanobind::arg("b") = nanobind::none(),
+    nanobind::arg("C") = nanobind::none(),
+    nanobind::arg("l") = nanobind::none(),
+    nanobind::arg("u") = nanobind::none(),
+    nanobind::arg("l_box") = nanobind::none(),
+    nanobind::arg("u_box") = nanobind::none(),
+    nanobind::arg("x") = nanobind::none(),
+    nanobind::arg("y") = nanobind::none(),
+    nanobind::arg("z") = nanobind::none(),
+    nanobind::arg("eps_abs") = nanobind::none(),
+    nanobind::arg("eps_rel") = nanobind::none(),
+    nanobind::arg("rho") = nanobind::none(),
+    nanobind::arg("mu_eq") = nanobind::none(),
+    nanobind::arg("mu_in") = nanobind::none(),
+    nanobind::arg("verbose") = nanobind::none(),
+    nanobind::arg("compute_preconditioner") = true,
+    nanobind::arg("compute_timings") = false,
+    nanobind::arg("max_iter") = nanobind::none(),
+    nanobind::arg("initial_guess") =
+      proxsuite::proxqp::InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+    nanobind::arg("check_duality_gap") = false,
+    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+    nanobind::arg("primal_infeasibility_solving") = false,
     nanobind::arg("default_H_eigenvalue_estimate") = 0.,
     nanobind::call_guard<nanobind::gil_scoped_release>());
 }
@@ -188,6 +312,43 @@ solveSparseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
+    nanobind::arg("default_H_eigenvalue_estimate") = 0.);
+
+  m.def(
+    "solve_no_gil",
+    &sparse::solve<T, I>,
+    "Function for solving a QP problem using PROXQP sparse backend directly "
+    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
+    "It is possible to set up some of the solver "
+    "parameters (warm start, initial guess option, proximal step sizes, "
+    "absolute and relative accuracies, maximum number of iterations, "
+    "preconditioner execution).",
+    nanobind::arg("H") = nanobind::none(),
+    nanobind::arg("g") = nanobind::none(),
+    nanobind::arg("A") = nanobind::none(),
+    nanobind::arg("b") = nanobind::none(),
+    nanobind::arg("C") = nanobind::none(),
+    nanobind::arg("l") = nanobind::none(),
+    nanobind::arg("u") = nanobind::none(),
+    nanobind::arg("x") = nanobind::none(),
+    nanobind::arg("y") = nanobind::none(),
+    nanobind::arg("z") = nanobind::none(),
+    nanobind::arg("eps_abs") = nanobind::none(),
+    nanobind::arg("eps_rel") = nanobind::none(),
+    nanobind::arg("rho") = nanobind::none(),
+    nanobind::arg("mu_eq") = nanobind::none(),
+    nanobind::arg("mu_in") = nanobind::none(),
+    nanobind::arg("verbose") = nanobind::none(),
+    nanobind::arg("compute_preconditioner") = true,
+    nanobind::arg("compute_timings") = false,
+    nanobind::arg("max_iter") = nanobind::none(),
+    nanobind::arg("initial_guess") =
+      InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+    nanobind::arg("sparse_backend") = SparseBackend::Automatic,
+    nanobind::arg("check_duality_gap") = false,
+    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+    nanobind::arg("primal_infeasibility_solving") = false,
     nanobind::arg("default_H_eigenvalue_estimate") = 0.,
     nanobind::call_guard<nanobind::gil_scoped_release>());
 }

From b72e27ff5204965f0b486d0118d52cd7b38fad4d Mon Sep 17 00:00:00 2001
From: J Berg <j.berg2349@gmail.com>
Date: Thu, 9 Jan 2025 23:03:34 +0000
Subject: [PATCH 5/7] format

---
 bindings/python/src/expose-solve.hpp | 128 ++++++++++++++-------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/bindings/python/src/expose-solve.hpp b/bindings/python/src/expose-solve.hpp
index ccc33c072..89004cc8b 100644
--- a/bindings/python/src/expose-solve.hpp
+++ b/bindings/python/src/expose-solve.hpp
@@ -140,67 +140,67 @@ solveDenseQp(nanobind::module_ m)
     nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
     nanobind::arg("primal_infeasibility_solving") = false,
     nanobind::arg("default_H_eigenvalue_estimate") = 0.);
-  
-  m.def(
-    "solve_no_gil",
-    nanobind::overload_cast<optional<dense::MatRef<T>>,
-                            optional<dense::VecRef<T>>,
-                            optional<dense::MatRef<T>>,
-                            optional<dense::VecRef<T>>,
-                            optional<dense::MatRef<T>>,
-                            optional<dense::VecRef<T>>,
-                            optional<dense::VecRef<T>>,
-                            optional<VecRef<T>>,
-                            optional<VecRef<T>>,
-                            optional<VecRef<T>>,
-                            optional<T>,
-                            optional<T>,
-                            optional<T>,
-                            optional<T>,
-                            optional<T>,
-                            optional<bool>,
-                            bool,
-                            bool,
-                            optional<isize>,
-                            proxsuite::proxqp::InitialGuessStatus,
-                            bool,
-                            optional<T>,
-                            optional<T>,
-                            bool,
-                            optional<T>>(&dense::solve<T>),
-    "Function for solving a QP problem using PROXQP dense backend directly "
-    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
-    "It is possible to set up some of the solver "
-    "parameters (warm start, initial guess option, proximal step sizes, "
-    "absolute and relative accuracies, maximum number of iterations, "
-    "preconditioner execution).",
-    nanobind::arg("H"),
-    nanobind::arg("g"),
-    nanobind::arg("A").none(),
-    nanobind::arg("b").none(),
-    nanobind::arg("C").none(),
-    nanobind::arg("l").none(),
-    nanobind::arg("u").none(),
-    nanobind::arg("x") = nanobind::none(),
-    nanobind::arg("y") = nanobind::none(),
-    nanobind::arg("z") = nanobind::none(),
-    nanobind::arg("eps_abs") = nanobind::none(),
-    nanobind::arg("eps_rel") = nanobind::none(),
-    nanobind::arg("rho") = nanobind::none(),
-    nanobind::arg("mu_eq") = nanobind::none(),
-    nanobind::arg("mu_in") = nanobind::none(),
-    nanobind::arg("verbose") = nanobind::none(),
-    nanobind::arg("compute_preconditioner") = true,
-    nanobind::arg("compute_timings") = false,
-    nanobind::arg("max_iter") = nanobind::none(),
-    nanobind::arg("initial_guess") =
-      InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
-    nanobind::arg("check_duality_gap") = false,
-    nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
-    nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
-    nanobind::arg("primal_infeasibility_solving") = false,
-    nanobind::arg("default_H_eigenvalue_estimate") = 0.,
-    nanobind::call_guard<nanobind::gil_scoped_release>());
+
+  m.def("solve_no_gil",
+        nanobind::overload_cast<optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::MatRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<dense::VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<VecRef<T>>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<T>,
+                                optional<bool>,
+                                bool,
+                                bool,
+                                optional<isize>,
+                                proxsuite::proxqp::InitialGuessStatus,
+                                bool,
+                                optional<T>,
+                                optional<T>,
+                                bool,
+                                optional<T>>(&dense::solve<T>),
+        "Function for solving a QP problem using PROXQP dense backend directly "
+        "without defining a QP object and while releasing the Global "
+        "Interpreter Lock (GIL). "
+        "It is possible to set up some of the solver "
+        "parameters (warm start, initial guess option, proximal step sizes, "
+        "absolute and relative accuracies, maximum number of iterations, "
+        "preconditioner execution).",
+        nanobind::arg("H"),
+        nanobind::arg("g"),
+        nanobind::arg("A").none(),
+        nanobind::arg("b").none(),
+        nanobind::arg("C").none(),
+        nanobind::arg("l").none(),
+        nanobind::arg("u").none(),
+        nanobind::arg("x") = nanobind::none(),
+        nanobind::arg("y") = nanobind::none(),
+        nanobind::arg("z") = nanobind::none(),
+        nanobind::arg("eps_abs") = nanobind::none(),
+        nanobind::arg("eps_rel") = nanobind::none(),
+        nanobind::arg("rho") = nanobind::none(),
+        nanobind::arg("mu_eq") = nanobind::none(),
+        nanobind::arg("mu_in") = nanobind::none(),
+        nanobind::arg("verbose") = nanobind::none(),
+        nanobind::arg("compute_preconditioner") = true,
+        nanobind::arg("compute_timings") = false,
+        nanobind::arg("max_iter") = nanobind::none(),
+        nanobind::arg("initial_guess") =
+          InitialGuessStatus::EQUALITY_CONSTRAINED_INITIAL_GUESS,
+        nanobind::arg("check_duality_gap") = false,
+        nanobind::arg("eps_duality_gap_abs") = nanobind::none(),
+        nanobind::arg("eps_duality_gap_rel") = nanobind::none(),
+        nanobind::arg("primal_infeasibility_solving") = false,
+        nanobind::arg("default_H_eigenvalue_estimate") = 0.,
+        nanobind::call_guard<nanobind::gil_scoped_release>());
 
   m.def(
     "solve_no_gil",
@@ -232,7 +232,8 @@ solveDenseQp(nanobind::module_ m)
                             bool,
                             optional<T>>(&dense::solve<T>),
     "Function for solving a QP problem using PROXQP dense backend directly "
-    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
+    "without defining a QP object and while releasing the Global Interpreter "
+    "Lock (GIL). "
     "It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "
@@ -318,7 +319,8 @@ solveSparseQp(nanobind::module_ m)
     "solve_no_gil",
     &sparse::solve<T, I>,
     "Function for solving a QP problem using PROXQP sparse backend directly "
-    "without defining a QP object and while releasing the Global Interpreter Lock (GIL). "
+    "without defining a QP object and while releasing the Global Interpreter "
+    "Lock (GIL). "
     "It is possible to set up some of the solver "
     "parameters (warm start, initial guess option, proximal step sizes, "
     "absolute and relative accuracies, maximum number of iterations, "

From 2301f33b08d90c330c7ea97860ca249b140eb2ef Mon Sep 17 00:00:00 2001
From: Fabian Schramm <55981657+fabinsch@users.noreply.github.com>
Date: Fri, 10 Jan 2025 14:57:10 +0100
Subject: [PATCH 6/7] ci: use setup-miniconda@v3

---
 .github/workflows/ci-linux-osx-win-conda.yml | 30 ++++++--------------
 .github/workflows/gh-pages.yml               | 10 +++----
 .github/workflows/release-osx-win.yml        | 19 ++-----------
 3 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/ci-linux-osx-win-conda.yml b/.github/workflows/ci-linux-osx-win-conda.yml
index 8ec76678e..4b1913383 100644
--- a/.github/workflows/ci-linux-osx-win-conda.yml
+++ b/.github/workflows/ci-linux-osx-win-conda.yml
@@ -57,22 +57,11 @@ jobs:
       with:
         submodules: recursive
 
-    - uses: conda-incubator/setup-miniconda@v2
-      if: matrix.os != 'macos-14'
+    - uses: conda-incubator/setup-miniconda@v3
       with:
-        miniforge-variant: Mambaforge
         miniforge-version: latest
-        channels: conda-forge
-        python-version: "3.10"
         activate-environment: proxsuite
 
-    - uses: conda-incubator/setup-miniconda@v3
-      if: matrix.os == 'macos-14'
-      with:
-        channels: conda-forge
-        python-version: "3.10"
-        activate-environment: proxsuite
-        installer-url: https://github.com/conda-forge/miniforge/releases/download/23.11.0-0/Mambaforge-23.11.0-0-MacOSX-arm64.sh
 
     - name: Install dependencies [Conda]
       shell: bash -l {0}
@@ -80,17 +69,17 @@ jobs:
         # Workaround for https://github.com/conda-incubator/setup-miniconda/issues/186
         conda config --remove channels defaults
         # Compilation related dependencies
-        mamba install cmake compilers make pkg-config doxygen ninja graphviz typing_extensions llvm-openmp clang
+        conda install cmake compilers make pkg-config doxygen ninja graphviz typing_extensions llvm-openmp clang
         # Main dependencies
-        mamba install eigen simde
+        conda install eigen simde
         # Test dependencies
-        mamba install libmatio numpy scipy
+        conda install libmatio numpy scipy
 
-    - name: Install julia [macOS/Linux]
-      if: contains(matrix.os, 'macos-latest') || contains(matrix.os, 'ubuntu')
+    - name: Install julia [Linux]
+      if: contains(matrix.os, 'ubuntu')
       shell: bash -l {0}
       run: |
-        mamba install julia
+        conda install julia
 
     - name: Activate ccache [Conda]
       uses: hendrikmuhs/ccache-action@v1.2
@@ -102,7 +91,7 @@ jobs:
       shell: bash -l {0}
       run: |
         conda info
-        mamba list
+        conda list
         env
 
     - name: Configure [Conda/Linux&macOS]
@@ -142,7 +131,6 @@ jobs:
       shell: bash -l {0}
       run: |
         echo $(where ccache)
-        ls C:\\Miniconda3\\envs\\proxsuite\\Library\\lib
         git submodule update --init
         mkdir build
         cd build
@@ -155,7 +143,6 @@ jobs:
       shell: bash -l {0}
       run: |
         echo $(where ccache)
-        ls C:\\Miniconda3\\envs\\proxsuite\\Library\\lib
         git submodule update --init
         mkdir build
         cd build
@@ -168,7 +155,6 @@ jobs:
       shell: bash -l {0}
       run: |
         echo $(where ccache)
-        ls C:\\Miniconda3\\envs\\proxsuite\\Library\\lib
         git submodule update --init
         mkdir build
         cd build
diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml
index c096bd318..e23c64418 100644
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -12,11 +12,9 @@ jobs:
         with:
           submodules: recursive
 
-      - uses: conda-incubator/setup-miniconda@v2
+      - uses: conda-incubator/setup-miniconda@v3
         with:
-          miniforge-variant: Mambaforge
           miniforge-version: latest
-          channels: conda-forge
           python-version: "3.10"
           activate-environment: doc
 
@@ -27,16 +25,16 @@ jobs:
           conda config --remove channels defaults
 
           # Compilation related dependencies
-          mamba install cmake make pkg-config doxygen graphviz
+          conda install cmake make pkg-config doxygen graphviz
 
           # Main dependencies
-          mamba install eigen
+          conda install eigen
 
       - name: Print environment
         shell: bash -l {0}
         run: |
           conda info
-          mamba list
+          conda list
           env
 
       - name: Configure
diff --git a/.github/workflows/release-osx-win.yml b/.github/workflows/release-osx-win.yml
index 845a980e6..d19755881 100644
--- a/.github/workflows/release-osx-win.yml
+++ b/.github/workflows/release-osx-win.yml
@@ -35,23 +35,11 @@ jobs:
           git submodule update
 
       - name: Setup conda
-        if: contains(matrix.os, 'macos-13') || contains(matrix.os, 'windows')
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniforge-variant: Mambaforge
-          miniforge-version: latest
-          channels: conda-forge
-          python-version: ${{ matrix.python-version }}
-          activate-environment: proxsuite
-
-      - name: Setup conda
-        if: matrix.os == 'macos-14'
         uses: conda-incubator/setup-miniconda@v3
         with:
-          channels: conda-forge
+          miniforge-version: latest
           python-version: ${{ matrix.python-version }}
           activate-environment: proxsuite
-          installer-url: https://github.com/conda-forge/miniforge/releases/download/23.11.0-0/Mambaforge-23.11.0-0-MacOSX-arm64.sh
 
       - name: Install dependencies [Conda]
         if: contains(matrix.os, 'macos') || contains(matrix.os, 'windows')
@@ -59,14 +47,13 @@ jobs:
         run: |
           # Workaround for https://github.com/conda-incubator/setup-miniconda/issues/186
           conda config --remove channels defaults
-          mamba install doxygen graphviz eigen simde cmake compilers typing_extensions
+          conda install doxygen graphviz eigen simde cmake compilers typing_extensions
 
       - name: Print environment [Conda]
-        if: contains(matrix.os, 'macos') || contains(matrix.os, 'windows')
         shell: bash -l {0}
         run: |
           conda info
-          mamba list
+          conda list
           env
 
       - name: Build wheel

From 3169275113898b470b8dcda1a78cad1ca8c7d859 Mon Sep 17 00:00:00 2001
From: Fabian Schramm <55981657+fabinsch@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:20:55 +0100
Subject: [PATCH 7/7] update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dacdbf799..9281c4299 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,9 +12,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 * Stub files for Python bindings, using [nanobind's native support](https://nanobind.readthedocs.io/en/latest/typing.html#stub-generation) ([#340](https://github.com/Simple-Robotics/proxsuite/pull/340))
+* Add `solve_no_gil` for dense backend (multithreading via Python) ([#363](https://github.com/Simple-Robotics/proxsuite/pull/363))
+* Add benchmarks for `solve_no_gil` vs `solve_in_parallel` (OpenMP) ([#363](https://github.com/Simple-Robotics/proxsuite/pull/363))
 
 ### Changed
 * Change Python bindings to use nanobind instead of pybind11 ([#340](https://github.com/Simple-Robotics/proxsuite/pull/340))
+* Update CI workflows from `setup-miniconda` v2 to v3 ([#363](https://github.com/Simple-Robotics/proxsuite/pull/363))
 
 
 ## [0.6.7] - 2024-08-27