Skip to content

Commit 8305291

Browse files
committed
checkConvergence_gpu passes for a few cases
1 parent 811dd60 commit 8305291

File tree

4 files changed

+131
-3
lines changed

4 files changed

+131
-3
lines changed

check/TestPdlp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {
342342

343343
TEST_CASE("hi-pdlp", "[pdlp]") {
344344
std::string model =
345-
"shell"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
345+
"3times2"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
346346
std::string model_file =
347347
std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
348348
Highs h;
@@ -366,7 +366,7 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
366366
h.setOptionValue("pdlp_scaling_mode", pdlp_scaling);
367367
h.setOptionValue("pdlp_step_size_strategy", 1);
368368
h.setOptionValue("pdlp_restart_strategy", 2);
369-
h.setOptionValue("pdlp_iteration_limit", 100);
369+
//h.setOptionValue("pdlp_iteration_limit", 100);
370370
// h.setOptionValue("log_dev_level", kHighsLogDevLevelVerbose);
371371
auto start_hipdlp = std::chrono::high_resolution_clock::now();
372372
HighsStatus run_status = h.run();

highs/pdlp/cupdlp/cuda/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ add_library(cudalin SHARED
77
${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu
88
../../hipdlp/pdhg.cu
99
)
10-
set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
10+
set_target_properties(cudalin PROPERTIES
11+
CUDA_SEPARABLE_COMPILATION ON
12+
CUDA_ARCHITECTURES "60;70;75;80;86" )
1113

1214
# set_target_properties(cudalin PROPERTIES CUDA_ARCHITECTURES native)
1315

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,10 +565,12 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
565565
if (bool_checking) {
566566
#ifdef CUPDLP_GPU
567567
// **VERIFICATION**: Before checking, copy GPU state to host
568+
/*
568569
CUDA_CHECK(cudaMemcpy(x_current_.data(), d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
569570
CUDA_CHECK(cudaMemcpy(y_current_.data(), d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
570571
CUDA_CHECK(cudaMemcpy(Ax_cache_.data(), d_ax_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
571572
CUDA_CHECK(cudaMemcpy(ATy_cache_.data(), d_aty_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
573+
*/
572574
CUDA_CHECK(cudaMemcpy(x_sum_.data(), d_x_sum_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
573575
CUDA_CHECK(cudaMemcpy(y_sum_.data(), d_y_sum_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
574576
CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
@@ -577,6 +579,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
577579
double dScale_gpu = sum_weights_gpu_ > 0.0 ? 1.0 / sum_weights_gpu_ : 0.0;
578580
launchKernelScaleVector(d_x_avg_, d_x_sum_, dScale_gpu, lp_.num_col_);
579581
launchKernelScaleVector(d_y_avg_, d_y_sum_, dScale_gpu, lp_.num_row_);
582+
583+
linalgGpuAx(d_x_avg_, d_ax_avg_);
584+
linalgGpuATy(d_y_avg_, d_aty_avg_);
580585
#endif
581586
hipdlpTimerStart(kHipdlpClockAverageIterate);
582587
computeAverageIterate(Ax_avg, ATy_avg);
@@ -589,6 +594,45 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
589594
SolverResults current_results;
590595
SolverResults average_results;
591596

597+
598+
//=== GPU Convergence Check ===//
599+
bool current_converged_gpu = checkConvergenceGpu(
600+
iter, d_x_current_, d_y_current_,
601+
d_ax_current_, d_aty_current_,
602+
params_.tolerance, current_results, "[L-GPU]"
603+
);
604+
605+
bool average_converged_gpu = checkConvergenceGpu(
606+
iter, d_x_avg_, d_y_avg_,
607+
d_ax_avg_, d_aty_avg_,
608+
params_.tolerance, average_results, "[A-GPU]"
609+
);
610+
611+
if (current_converged_gpu) {
612+
logger_.info("Current GPU solution converged in " + std::to_string(iter) +
613+
" iterations.");
614+
final_iter_count_ = iter;
615+
CUDA_CHECK(cudaMemcpy(x.data(), d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
616+
CUDA_CHECK(cudaMemcpy(y.data(), d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
617+
CUDA_CHECK(cudaMemcpy(dSlackPos_.data(), d_dSlackPos_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
618+
CUDA_CHECK(cudaMemcpy(dSlackNeg_.data(), d_dSlackNeg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
619+
results_ = current_results;
620+
return solveReturn(TerminationStatus::OPTIMAL);
621+
}
622+
623+
if (average_converged_gpu) {
624+
logger_.info("Average GPU solution converged in " + std::to_string(iter) +
625+
" iterations.");
626+
final_iter_count_ = iter;
627+
CUDA_CHECK(cudaMemcpy(x.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
628+
CUDA_CHECK(cudaMemcpy(y.data(), d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
629+
CUDA_CHECK(cudaMemcpy(dSlackPosAvg_.data(), d_dSlackPosAvg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
630+
CUDA_CHECK(cudaMemcpy(dSlackNegAvg_.data(), d_dSlackNegAvg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
631+
results_ = average_results;
632+
return solveReturn(TerminationStatus::OPTIMAL);
633+
}
634+
635+
// === CPU Convergence Check ===//
592636
hipdlpTimerStart(kHipdlpClockConvergenceCheck);
593637
// Compute residuals for current iterate
594638
bool current_converged = checkConvergence(
@@ -1811,8 +1855,18 @@ void PDLPSolver::setupGpu(){
18111855
CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
18121856
CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
18131857
CUDA_CHECK(cudaMalloc(&d_aty_next_, a_num_cols_ * sizeof(double)));
1858+
CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
1859+
CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
18141860
CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
18151861
CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
1862+
CUDA_CHECK(cudaMalloc(&d_convergence_results_, 4 * sizeof(double)));
1863+
CUDA_CHECK(cudaMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double)));
1864+
CUDA_CHECK(cudaMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double)));
1865+
CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
1866+
CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
1867+
CUDA_CHECK(cudaMalloc(&d_col_scale_, a_num_cols_ * sizeof(double)));
1868+
CUDA_CHECK(cudaMalloc(&d_row_scale_, a_num_rows_ * sizeof(double)));
1869+
18161870

18171871
CUDA_CHECK(cudaMemcpy(d_col_cost_, lp_.col_cost_.data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
18181872
CUDA_CHECK(cudaMemcpy(d_col_lower_, lp_.col_lower_.data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
@@ -1864,11 +1918,21 @@ void PDLPSolver::setupGpu(){
18641918
CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
18651919
CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
18661920
CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
1921+
CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
1922+
CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
18671923
CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
18681924
CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
18691925
CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
18701926
sum_weights_gpu_ = 0.0;
18711927

1928+
if(scaling_.IsScaled()){
1929+
CUDA_CHECK(cudaMemcpy(d_col_scale_, scaling_.GetColScaling().data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
1930+
CUDA_CHECK(cudaMemcpy(d_row_scale_, scaling_.GetRowScaling().data(), a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice));
1931+
} else {
1932+
cudaFree(d_col_scale_); d_col_scale_ = nullptr;
1933+
cudaFree(d_row_scale_); d_row_scale_ = nullptr;
1934+
}
1935+
18721936
highsLogUser(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n");
18731937
}
18741938

@@ -1893,12 +1957,21 @@ void PDLPSolver::cleanupGpu(){
18931957
CUDA_CHECK(cudaFree(d_y_next_));
18941958
CUDA_CHECK(cudaFree(d_ax_current_));
18951959
CUDA_CHECK(cudaFree(d_aty_current_));
1960+
CUDA_CHECK(cudaFree(d_ax_avg_));
1961+
CUDA_CHECK(cudaFree(d_aty_avg_));
18961962
CUDA_CHECK(cudaFree(d_ax_next_));
18971963
CUDA_CHECK(cudaFree(d_aty_next_));
18981964
CUDA_CHECK(cudaFree(d_x_sum_));
18991965
CUDA_CHECK(cudaFree(d_y_sum_));
19001966
CUDA_CHECK(cudaFree(d_spmv_buffer_ax_));
19011967
CUDA_CHECK(cudaFree(d_spmv_buffer_aty_));
1968+
CUDA_CHECK(cudaFree(d_convergence_results_));
1969+
CUDA_CHECK(cudaFree(d_dSlackPos_));
1970+
CUDA_CHECK(cudaFree(d_dSlackNeg_));
1971+
CUDA_CHECK(cudaFree(d_dSlackPosAvg_));
1972+
CUDA_CHECK(cudaFree(d_dSlackNegAvg_));
1973+
if(d_col_scale_) CUDA_CHECK(cudaFree(d_col_scale_));
1974+
if(d_row_scale_) CUDA_CHECK(cudaFree(d_row_scale_));
19021975
}
19031976

19041977
void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out){
@@ -1974,4 +2047,53 @@ void PDLPSolver::launchKernelUpdateAverages(double weight) {
19742047
void PDLPSolver::launchKernelScaleVector(double* d_out, const double* d_in, double scale, int n) {
19752048
launchKernelScaleVector_wrapper(d_out, d_in, scale, n);
19762049
CUDA_CHECK(cudaGetLastError());
2050+
}
2051+
2052+
bool PDLPSolver::checkConvergenceGpu(
2053+
const int iter,
2054+
const double* d_x, const double* d_y,
2055+
const double* d_ax, const double* d_aty,
2056+
double epsilon, SolverResults& results, const char* type
2057+
){
2058+
launchCheckConvergenceKernels_wrapper(
2059+
d_convergence_results_,
2060+
d_dSlackPos_, d_dSlackNeg_,
2061+
d_x, d_y, d_ax, d_aty,
2062+
d_col_cost_, d_row_lower_,
2063+
d_col_lower_, d_col_upper_,
2064+
d_is_equality_row_,
2065+
d_col_scale_, d_row_scale_,
2066+
lp_.num_col_, lp_.num_row_
2067+
);
2068+
2069+
// copy 4 doubles back to CPU
2070+
2071+
double h_results[4];
2072+
CUDA_CHECK(cudaMemcpy(h_results, d_convergence_results_, 4 * sizeof(double), cudaMemcpyDeviceToHost));
2073+
2074+
double primal_feas_sq = h_results[0];
2075+
double dual_feas_sq = h_results[1];
2076+
double primal_obj = h_results[2] + lp_.offset_;
2077+
double dual_obj = h_results[3] + lp_.offset_;
2078+
2079+
results.primal_feasibility = std::sqrt(primal_feas_sq);
2080+
results.dual_feasibility = std::sqrt(dual_feas_sq);
2081+
results.primal_obj = primal_obj;
2082+
results.dual_obj = dual_obj;
2083+
2084+
double duality_gap = primal_obj - dual_obj;
2085+
results.duality_gap = std::abs(duality_gap);
2086+
results.relative_obj_gap = std::abs(duality_gap) / (1.0 + std::abs(primal_obj) + std::abs(dual_obj));
2087+
2088+
debugPdlpFeasOptLog(debug_pdlp_log_file_, iter, primal_obj, dual_obj,
2089+
results.relative_obj_gap,
2090+
results.primal_feasibility / (1.0 + unscaled_rhs_norm_),
2091+
results.dual_feasibility / (1.0 + unscaled_c_norm_), type);
2092+
2093+
bool primal_feasible = results.primal_feasibility < epsilon * (1.0 + unscaled_rhs_norm_);
2094+
bool dual_feasible = results.dual_feasibility < epsilon * (1.0 + unscaled_c_norm_);
2095+
bool gap_small = results.relative_obj_gap < epsilon;
2096+
2097+
return primal_feasible && dual_feasible && gap_small;
2098+
19772099
}

highs/pdlp/hipdlp/pdhg.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ class PDLPSolver {
228228
double* d_aty_current_ = nullptr; // Replaces host-side ATy_cache_
229229
double* d_ax_next_ = nullptr;
230230
double* d_aty_next_ = nullptr;
231+
double* d_ax_avg_ = nullptr;
232+
double* d_aty_avg_ = nullptr;
231233
double* d_x_sum_ = nullptr;
232234
double* d_y_sum_ = nullptr;
233235

@@ -238,6 +240,8 @@ class PDLPSolver {
238240
double* d_dSlackNeg_ = nullptr;
239241
double* d_dSlackPosAvg_ = nullptr;
240242
double* d_dSlackNegAvg_ = nullptr;
243+
double* d_col_scale_ = nullptr;
244+
double* d_row_scale_ = nullptr;
241245
bool checkConvergenceGpu(const int iter,
242246
const double* d_x, const double* d_y,
243247
const double* d_ax, const double* d_aty,

0 commit comments

Comments
 (0)