Skip to content

Commit 52970a4

Browse files
authored
Handle concurrent mode OOM and avoid unnecessary deep copy of problem_t (#541)
`user_problem_t` holds the `raft::handle_t` for the whole Barrier solver run. Setting the handle correctly and calling `run_barrier_thread` allows running Barrier in its own stream without deep-copying the `problem_t`. We are now able to solve large-scale instances, and any OOM (out-of-memory error) in Barrier is caught and PDLP keeps iterating.

Authors:
- Hugo Linsenmaier (https://github.com/hlinsen)

Approvers:
- Chris Maes (https://github.com/chris-maes)
- Rajesh Gandham (https://github.com/rg20)

URL: #541
1 parent 66d6529 commit 52970a4

File tree

3 files changed

+19
-16
lines changed

3 files changed

+19
-16
lines changed

cpp/src/dual_simplex/barrier.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3575,6 +3575,9 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
35753575
} catch (const raft::cuda_error& e) {
35763576
settings.log.debug("Error in barrier_solver_t: %s\n", e.what());
35773577
return lp_status_t::NUMERICAL_ISSUES;
3578+
} catch (const rmm::out_of_memory& e) {
3579+
settings.log.debug("Out of memory in barrier_solver_t: %s\n", e.what());
3580+
return lp_status_t::NUMERICAL_ISSUES;
35783581
}
35793582
}
35803583

cpp/src/linear_programming/solve.cu

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ optimization_problem_solution_t<i_t, f_t> run_barrier(
444444
{
445445
// Convert data structures to dual simplex format and back
446446
dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
447-
cuopt_problem_to_simplex_problem<i_t, f_t>(problem);
447+
cuopt_problem_to_simplex_problem<i_t, f_t>(problem.handle_ptr, problem);
448448
auto sol_dual_simplex = run_barrier(dual_simplex_problem, settings, timer);
449449
return convert_dual_simplex_sol(problem,
450450
std::get<0>(sol_dual_simplex),
@@ -515,7 +515,7 @@ optimization_problem_solution_t<i_t, f_t> run_dual_simplex(
515515
{
516516
// Convert data structures to dual simplex format and back
517517
dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
518-
cuopt_problem_to_simplex_problem<i_t, f_t>(problem);
518+
cuopt_problem_to_simplex_problem<i_t, f_t>(problem.handle_ptr, problem);
519519
auto sol_dual_simplex = run_dual_simplex(dual_simplex_problem, settings, timer);
520520
return convert_dual_simplex_sol(problem,
521521
std::get<0>(sol_dual_simplex),
@@ -671,16 +671,14 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
671671
// Initialize the dual simplex structures before we run PDLP.
672672
// Otherwise, CUDA API calls to the problem stream may occur in both threads and throw graph
673673
// capture off
674-
auto barrier_handle = raft::handle_t(*op_problem.get_handle_ptr());
675-
detail::problem_t<i_t, f_t> d_barrier_problem(problem);
674+
auto barrier_handle = raft::handle_t(*op_problem.get_handle_ptr());
676675
rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread;
677-
d_barrier_problem.handle_ptr = &barrier_handle;
678676
raft::resource::set_cuda_stream(barrier_handle, barrier_stream);
679677
// Make sure allocations are done on the original stream
680678
problem.handle_ptr->sync_stream();
681679

682680
dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
683-
cuopt_problem_to_simplex_problem<i_t, f_t>(d_barrier_problem);
681+
cuopt_problem_to_simplex_problem<i_t, f_t>(&barrier_handle, problem);
684682
// Create a thread for dual simplex
685683
std::unique_ptr<
686684
std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>

cpp/src/linear_programming/translate.hpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,21 @@ namespace cuopt::linear_programming {
2828

2929
template <typename i_t, typename f_t>
3030
static dual_simplex::user_problem_t<i_t, f_t> cuopt_problem_to_simplex_problem(
31-
detail::problem_t<i_t, f_t>& model)
31+
raft::handle_t const* handle_ptr, detail::problem_t<i_t, f_t>& model)
3232
{
33-
dual_simplex::user_problem_t<i_t, f_t> user_problem(model.handle_ptr);
33+
dual_simplex::user_problem_t<i_t, f_t> user_problem(handle_ptr);
3434

3535
int m = model.n_constraints;
3636
int n = model.n_variables;
3737
int nz = model.nnz;
3838
user_problem.num_rows = m;
3939
user_problem.num_cols = n;
40-
user_problem.objective = cuopt::host_copy(model.objective_coefficients);
40+
user_problem.objective = cuopt::host_copy(model.objective_coefficients, handle_ptr->get_stream());
4141

4242
dual_simplex::csr_matrix_t<i_t, f_t> csr_A(m, n, nz);
43-
csr_A.x = cuopt::host_copy(model.coefficients);
44-
csr_A.j = cuopt::host_copy(model.variables);
45-
csr_A.row_start = cuopt::host_copy(model.offsets);
43+
csr_A.x = cuopt::host_copy(model.coefficients, handle_ptr->get_stream());
44+
csr_A.j = cuopt::host_copy(model.variables, handle_ptr->get_stream());
45+
csr_A.row_start = cuopt::host_copy(model.offsets, handle_ptr->get_stream());
4646

4747
csr_A.to_compressed_col(user_problem.A);
4848

@@ -51,8 +51,10 @@ static dual_simplex::user_problem_t<i_t, f_t> cuopt_problem_to_simplex_problem(
5151
user_problem.range_rows.clear();
5252
user_problem.range_value.clear();
5353

54-
auto model_constraint_lower_bounds = cuopt::host_copy(model.constraint_lower_bounds);
55-
auto model_constraint_upper_bounds = cuopt::host_copy(model.constraint_upper_bounds);
54+
auto model_constraint_lower_bounds =
55+
cuopt::host_copy(model.constraint_lower_bounds, handle_ptr->get_stream());
56+
auto model_constraint_upper_bounds =
57+
cuopt::host_copy(model.constraint_upper_bounds, handle_ptr->get_stream());
5658

5759
// All constraints have lower and upper bounds
5860
// lr <= a_i^T x <= ur
@@ -79,7 +81,7 @@ static dual_simplex::user_problem_t<i_t, f_t> cuopt_problem_to_simplex_problem(
7981
}
8082
user_problem.num_range_rows = user_problem.range_rows.size();
8183
std::tie(user_problem.lower, user_problem.upper) =
82-
extract_host_bounds<f_t>(model.variable_bounds, model.handle_ptr);
84+
extract_host_bounds<f_t>(model.variable_bounds, handle_ptr);
8385
user_problem.problem_name = model.original_problem_ptr->get_problem_name();
8486
if (model.row_names.size() > 0) {
8587
user_problem.row_names.resize(m);
@@ -97,7 +99,7 @@ static dual_simplex::user_problem_t<i_t, f_t> cuopt_problem_to_simplex_problem(
9799
user_problem.obj_scale = model.presolve_data.objective_scaling_factor;
98100
user_problem.var_types.resize(n);
99101

100-
auto model_variable_types = cuopt::host_copy(model.variable_types);
102+
auto model_variable_types = cuopt::host_copy(model.variable_types, handle_ptr->get_stream());
101103
for (int j = 0; j < n; ++j) {
102104
user_problem.var_types[j] =
103105
model_variable_types[j] == var_t::CONTINUOUS

0 commit comments

Comments
 (0)