Skip to content

Commit a5b226c

Browse files
authored
Rounding and crash fixes (#670)
This PR fixes errouneous logic in the simple rounding and edge cases causing supposedly integer solutions to contain fractional variables. RINS fixes are also included (wrong problem pointer being used for some checks). Also, it seems that the cuSparse CSR-to-CSC routine occasionally produces an incorrect output transpose. As a workaround, a manual transpose implementation is used instead. Further work will be required to identify a more easily reproducible example and identify the underlying issue (and/or file a cuSparse bug) ## Summary by CodeRabbit * **Bug Fixes** * Improved numerical robustness with tighter clamping, added bounds checks, and reduced spurious runtime assertions. * **Performance Improvements** * Faster matrix transpose and improved GPU rounding kernels for better throughput. * **Stability** * Consistent use of copied problem state during heuristics and safer handling of bound/rounding edge cases. * **Behavioral** * Rounding and feasibility flows now enforce post-clamp validation and per-variable locking/decision logic. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> Authors: - Alice Boucher (https://github.com/aliceb-nv) Approvers: - Rajesh Gandham (https://github.com/rg20) URL: #670
1 parent 04b8dc2 commit a5b226c

File tree

10 files changed

+244
-81
lines changed

10 files changed

+244
-81
lines changed

cpp/src/mip/diversity/lns/rins.cu

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,16 @@ void rins_t<i_t, f_t>::run_rins()
9999

100100
if (!dm.population.is_feasible()) return;
101101

102-
cuopt_assert(lp_optimal_solution.size() == problem_ptr->n_variables, "Assignment size mismatch");
102+
cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
103103
cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
104-
cuopt_assert(problem_copy->n_variables == problem_ptr->n_variables, "Problem size mismatch");
105-
cuopt_assert(problem_copy->n_constraints == problem_ptr->n_constraints, "Problem size mismatch");
106-
cuopt_assert(problem_copy->n_integer_vars == problem_ptr->n_integer_vars,
107-
"Problem size mismatch");
108-
cuopt_assert(problem_copy->n_binary_vars == problem_ptr->n_binary_vars, "Problem size mismatch");
104+
// Do not make assertions based on problem_ptr. The original problem may have been modified within
105+
// the FP loop relaxing integers cuopt_assert(problem_copy->n_variables ==
106+
// problem_ptr->n_variables, "Problem size mismatch"); cuopt_assert(problem_copy->n_constraints ==
107+
// problem_ptr->n_constraints, "Problem size mismatch"); cuopt_assert(problem_copy->n_integer_vars
108+
// == problem_ptr->n_integer_vars,
109+
// "Problem size mismatch");
110+
// cuopt_assert(problem_copy->n_binary_vars == problem_ptr->n_binary_vars, "Problem size
111+
// mismatch");
109112
cuopt_assert(dm.population.current_size() > 0, "No solutions in population");
110113

111114
solution_t<i_t, f_t> best_sol(*problem_copy);
@@ -128,21 +131,21 @@ void rins_t<i_t, f_t>::run_rins()
128131

129132
i_t sol_size_before_rins = best_sol.assignment.size();
130133
auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
131-
cuopt_assert(lp_opt_device.size() == problem_ptr->n_variables, "Assignment size mismatch");
132-
cuopt_assert(best_sol.assignment.size() == problem_ptr->n_variables, "Assignment size mismatch");
134+
cuopt_assert(lp_opt_device.size() == problem_copy->n_variables, "Assignment size mismatch");
135+
cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables, "Assignment size mismatch");
133136

134-
rmm::device_uvector<i_t> vars_to_fix(problem_ptr->n_integer_vars, rins_handle.get_stream());
137+
rmm::device_uvector<i_t> vars_to_fix(problem_copy->n_integer_vars, rins_handle.get_stream());
135138
auto end = thrust::copy_if(rins_handle.get_thrust_policy(),
136-
problem_ptr->integer_indices.begin(),
137-
problem_ptr->integer_indices.end(),
139+
problem_copy->integer_indices.begin(),
140+
problem_copy->integer_indices.end(),
138141
vars_to_fix.begin(),
139142
[lpopt = lp_opt_device.data(),
140-
pb = problem_ptr->view(),
143+
pb = problem_copy->view(),
141144
incumbent = best_sol.assignment.data()] __device__(i_t var_idx) {
142145
return pb.integer_equal(lpopt[var_idx], incumbent[var_idx]);
143146
});
144147
vars_to_fix.resize(end - vars_to_fix.begin(), rins_handle.get_stream());
145-
f_t fractional_ratio = (f_t)(vars_to_fix.size()) / (f_t)problem_ptr->n_integer_vars;
148+
f_t fractional_ratio = (f_t)(vars_to_fix.size()) / (f_t)problem_copy->n_integer_vars;
146149

147150
// abort if the fractional ratio is too low
148151
if (fractional_ratio < settings.min_fractional_ratio) {
@@ -164,7 +167,7 @@ void rins_t<i_t, f_t>::run_rins()
164167
cuopt_assert(thrust::all_of(rins_handle.get_thrust_policy(),
165168
vars_to_fix.begin(),
166169
vars_to_fix.end(),
167-
[pb = problem_ptr->view()] __device__(i_t var_idx) {
170+
[pb = problem_copy->view()] __device__(i_t var_idx) {
168171
return pb.is_integer_var(var_idx);
169172
}),
170173
"All variables to fix must be integer variables");
@@ -180,15 +183,15 @@ void rins_t<i_t, f_t>::run_rins()
180183
CUOPT_LOG_DEBUG("Running RINS on solution with objective %g, fixing %d/%d",
181184
best_sol.get_user_objective(),
182185
vars_to_fix.size(),
183-
problem_ptr->n_integer_vars);
186+
problem_copy->n_integer_vars);
184187
CUOPT_LOG_DEBUG("RINS fixrate %g time limit %g", fixrate, time_limit);
185188
CUOPT_LOG_DEBUG("RINS fractional ratio %g%%", fractional_ratio * 100);
186189

187190
f_t prev_obj = best_sol.get_user_objective();
188191

189192
auto [fixed_problem, fixed_assignment, variable_map] = best_sol.fix_variables(vars_to_fix);
190193
CUOPT_LOG_DEBUG(
191-
"new var count %d var_count %d", fixed_problem.n_variables, problem_ptr->n_integer_vars);
194+
"new var count %d var_count %d", fixed_problem.n_variables, problem_copy->n_integer_vars);
192195

193196
// should probably just do an spmv to get the objective instead. ugly mess of copies
194197
solution_t<i_t, f_t> best_sol_fixed_space(fixed_problem);
@@ -271,7 +274,8 @@ void rins_t<i_t, f_t>::run_rins()
271274
CUOPT_LOG_DEBUG("RINS submip solution found. Objective %.16e. Status %d",
272275
branch_and_bound_solution.objective,
273276
int(branch_and_bound_status));
274-
cuopt_assert(rins_solution_queue.size() > 0, "RINS solution queue is unexpectedly empty");
277+
// RINS submip may have just proved the initial guess is the optimal, therefore the queue might
278+
// be empty in that case
275279
}
276280
if (branch_and_bound_status == dual_simplex::mip_status_t::OPTIMAL) {
277281
CUOPT_LOG_DEBUG("RINS submip optimal");
@@ -328,7 +332,7 @@ void rins_t<i_t, f_t>::run_rins()
328332
cuopt_assert(best_sol.test_number_all_integer(), "All must be integers after RINS");
329333
if (best_sol.get_user_objective() < prev_obj) { improvement_found = true; }
330334
cuopt_assert(best_sol.assignment.size() == sol_size_before_rins, "Assignment size mismatch");
331-
cuopt_assert(best_sol.assignment.size() == problem_ptr->n_variables,
335+
cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables,
332336
"Assignment size mismatch");
333337
dm.population.add_external_solution(
334338
best_sol.get_host_assignment(), best_sol.get_objective(), solution_origin_t::RINS);

cpp/src/mip/diversity/recombiners/bound_prop_recombiner.cuh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,10 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
204204
offspring.handle_ptr->sync_stream();
205205
offspring.unfix_variables(fixed_assignment, variable_map);
206206
cuopt_func_call(bool feasible_after_unfix = offspring.get_feasible());
207-
cuopt_assert(feasible_after_unfix == feasible_after_bounds_prop,
208-
"Feasible after unfix should be same as feasible after bounds prop!");
207+
// May be triggered due to numerical issues
208+
// TODO: investigate further
209+
// cuopt_assert(feasible_after_unfix == feasible_after_bounds_prop,
210+
// "Feasible after unfix should be same as feasible after bounds prop!");
209211
a.handle_ptr->sync_stream();
210212
} else {
211213
timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit);

cpp/src/mip/diversity/recombiners/sub_mip.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
141141
scaling.unscale_solutions(fixed_assignment, dummy);
142142
// unfix the assignment on given result no matter if it is feasible
143143
offspring.unfix_variables(fixed_assignment, variable_map);
144+
offspring
145+
.clamp_within_bounds(); // Scaling might bring some very slight variable bound violations
144146
} else {
145147
offspring.round_nearest();
146148
}
@@ -171,6 +173,7 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
171173
rmm::device_uvector<f_t> dummy(0, offspring.handle_ptr->get_stream());
172174
scaling.unscale_solutions(fixed_assignment, dummy);
173175
sol.unfix_variables(fixed_assignment, variable_map);
176+
sol.clamp_within_bounds(); // Scaling might bring some very slight variable bound violations
174177
sol.compute_feasibility();
175178
cuopt_func_call(sol.test_variable_bounds());
176179
population.add_solution(std::move(sol));

cpp/src/mip/feasibility_jump/fj_cpu.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -710,10 +710,12 @@ static void perturb(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
710710
raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);
711711

712712
for (auto var_idx : sampled_vars) {
713-
f_t lb = ceil(std::max(get_lower(fj_cpu.h_var_bounds[var_idx]), -1e7));
714-
f_t ub = floor(std::min(get_upper(fj_cpu.h_var_bounds[var_idx]), 1e7));
713+
f_t lb = std::max(get_lower(fj_cpu.h_var_bounds[var_idx]), -1e7);
714+
f_t ub = std::min(get_upper(fj_cpu.h_var_bounds[var_idx]), 1e7);
715715
f_t val = lb + (ub - lb) * rng.next_double();
716716
if (fj_cpu.view.pb.is_integer_var(var_idx)) {
717+
lb = std::ceil(lb);
718+
ub = std::floor(ub);
717719
val = std::round(val);
718720
val = std::min(std::max(val, lb), ub);
719721
}
@@ -860,6 +862,13 @@ static void init_fj_cpu(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
860862
template <typename i_t, typename f_t>
861863
static void sanity_checks(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
862864
{
865+
// Check that each variable is within its bounds
866+
for (i_t var_idx = 0; var_idx < fj_cpu.view.pb.n_variables; ++var_idx) {
867+
f_t val = fj_cpu.h_assignment[var_idx];
868+
cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, val),
869+
"Variable is out of bounds");
870+
}
871+
863872
// Check that each violated constraint is actually violated and not present in
864873
// satisfied_constraints
865874
for (const auto& cstr_idx : fj_cpu.violated_constraints) {

cpp/src/mip/local_search/rounding/constraint_prop.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,7 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
933933
update_host_assignment(sol);
934934
if (max_timer.check_time_limit()) {
935935
CUOPT_LOG_DEBUG("Second time limit is reached returning nearest rounding!");
936+
collapse_crossing_bounds(*sol.problem_ptr, *orig_sol.problem_ptr, sol.handle_ptr);
936937
sol.round_nearest();
937938
timeout_happened = true;
938939
break;

cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,51 +19,56 @@ template <typename i_t, typename f_t>
1919
__global__ void simple_rounding_kernel(typename solution_t<i_t, f_t>::view_t solution,
2020
bool* successful)
2121
{
22-
if (TH_ID_X >= solution.problem.n_integer_vars) { return; }
23-
i_t var_id = solution.problem.integer_indices[TH_ID_X];
24-
f_t curr_val = solution.assignment[var_id];
25-
if (solution.problem.is_integer(curr_val)) { return; }
26-
27-
i_t up_locks = 0;
28-
i_t down_locks = 0;
29-
30-
auto [offset_begin, offset_end] = solution.problem.reverse_range_for_var(var_id);
31-
for (i_t i = offset_begin; i < offset_end; i += 1) {
32-
auto cstr_idx = solution.problem.reverse_constraints[i];
33-
auto cstr_coeff = solution.problem.reverse_coefficients[i];
22+
for (i_t idx = TH_ID_X; idx < solution.problem.n_integer_vars; idx += gridDim.x * blockDim.x) {
23+
i_t var_id = solution.problem.integer_indices[idx];
24+
f_t curr_val = solution.assignment[var_id];
25+
if (solution.problem.is_integer(curr_val)) { continue; }
3426

35-
// boxed constraint. can't be rounded safely
36-
if (std::isfinite(solution.problem.constraint_lower_bounds[cstr_idx]) &&
37-
std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx])) {
38-
up_locks += 1;
39-
down_locks += 1;
40-
continue;
27+
i_t up_locks = 0;
28+
i_t down_locks = 0;
29+
30+
auto [offset_begin, offset_end] = solution.problem.reverse_range_for_var(var_id);
31+
for (i_t i = offset_begin; i < offset_end; i += 1) {
32+
auto cstr_idx = solution.problem.reverse_constraints[i];
33+
auto cstr_coeff = solution.problem.reverse_coefficients[i];
34+
35+
// boxed constraint. can't be rounded safely
36+
if (std::isfinite(solution.problem.constraint_lower_bounds[cstr_idx]) &&
37+
std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx])) {
38+
up_locks += 1;
39+
down_locks += 1;
40+
continue;
41+
}
42+
43+
f_t sign = std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx]) ? 1 : -1;
44+
45+
if (cstr_coeff * sign > 0) {
46+
up_locks += 1;
47+
} else {
48+
down_locks += 1;
49+
}
4150
}
4251

43-
f_t sign = std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx]) ? 1 : -1;
52+
auto var_bnd = solution.problem.variable_bounds[var_id];
53+
f_t lb = get_lower(var_bnd);
54+
f_t ub = get_upper(var_bnd);
4455

45-
if (cstr_coeff * sign > 0) {
46-
up_locks += 1;
47-
} else {
48-
down_locks += 1;
49-
}
50-
}
56+
bool can_round_up = up_locks == 0 && ceil(curr_val) <= floor(ub);
57+
bool can_round_down = down_locks == 0 && floor(curr_val) >= ceil(lb);
5158

52-
bool can_round_up = up_locks == 0;
53-
bool can_round_down = down_locks == 0;
54-
55-
if (can_round_up && can_round_down) {
56-
if (solution.problem.objective_coefficients[var_id] > 0) {
59+
if (can_round_up && can_round_down) {
60+
if (solution.problem.objective_coefficients[var_id] > 0) {
61+
solution.assignment[var_id] = floor(curr_val);
62+
} else {
63+
solution.assignment[var_id] = ceil(curr_val);
64+
}
65+
} else if (can_round_up) {
66+
solution.assignment[var_id] = ceil(curr_val);
67+
} else if (can_round_down) {
5768
solution.assignment[var_id] = floor(curr_val);
5869
} else {
59-
solution.assignment[var_id] = ceil(curr_val);
70+
*successful = false;
6071
}
61-
} else if (can_round_up) {
62-
solution.assignment[var_id] = ceil(curr_val);
63-
} else if (can_round_down) {
64-
solution.assignment[var_id] = floor(curr_val);
65-
} else {
66-
*successful = false;
6772
}
6873
}
6974

0 commit comments

Comments
 (0)