@@ -565,10 +565,12 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
  if (bool_checking) {
#ifdef CUPDLP_GPU
    // **VERIFICATION**: Before checking, copy GPU state to host
+    /*
    CUDA_CHECK(cudaMemcpy(x_current_.data(), d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(y_current_.data(), d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(Ax_cache_.data(), d_ax_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(ATy_cache_.data(), d_aty_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+    */
    CUDA_CHECK(cudaMemcpy(x_sum_.data(), d_x_sum_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(y_sum_.data(), d_y_sum_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
@@ -577,6 +579,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
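+    // Averaged iterate on device: x_avg = x_sum / sum_of_weights (and likewise
+    // for y_avg); a zero scale guards against an empty weight sum.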
    double dScale_gpu = sum_weights_gpu_ > 0.0 ? 1.0 / sum_weights_gpu_ : 0.0;
    launchKernelScaleVector(d_x_avg_, d_x_sum_, dScale_gpu, lp_.num_col_);
    launchKernelScaleVector(d_y_avg_, d_y_sum_, dScale_gpu, lp_.num_row_);
+
+    linalgGpuAx(d_x_avg_, d_ax_avg_);
+    linalgGpuATy(d_y_avg_, d_aty_avg_);
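+    // A*x_avg and A^T*y_avg are not maintained incrementally, so refresh them
+    // with one SpMV each before the device-side convergence check.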
#endif
    hipdlpTimerStart(kHipdlpClockAverageIterate);
    computeAverageIterate(Ax_avg, ATy_avg);
@@ -589,6 +594,45 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
    SolverResults current_results;
    SolverResults average_results;

+
+    // === GPU Convergence Check ===
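+    // Termination is tested on both the last iterate ("[L-GPU]") and the
+    // running average ("[A-GPU]"), mirroring the CPU check further below.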
+    bool current_converged_gpu = checkConvergenceGpu(
+        iter, d_x_current_, d_y_current_,
+        d_ax_current_, d_aty_current_,
+        params_.tolerance, current_results, "[L-GPU]");
+
+    bool average_converged_gpu = checkConvergenceGpu(
+        iter, d_x_avg_, d_y_avg_,
+        d_ax_avg_, d_aty_avg_,
+        params_.tolerance, average_results, "[A-GPU]");
+
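+    // On convergence, copy the winning iterate and its dual slack split
+    // (dSlackPos/dSlackNeg) back to the host before returning OPTIMAL.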
+    if (current_converged_gpu) {
+      logger_.info("Current GPU solution converged in " + std::to_string(iter) +
+                   " iterations.");
+      final_iter_count_ = iter;
+      CUDA_CHECK(cudaMemcpy(x.data(), d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(y.data(), d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(dSlackPos_.data(), d_dSlackPos_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(dSlackNeg_.data(), d_dSlackNeg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      results_ = current_results;
+      return solveReturn(TerminationStatus::OPTIMAL);
+    }
+
+    if (average_converged_gpu) {
+      logger_.info("Average GPU solution converged in " + std::to_string(iter) +
+                   " iterations.");
+      final_iter_count_ = iter;
+      CUDA_CHECK(cudaMemcpy(x.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(y.data(), d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(dSlackPosAvg_.data(), d_dSlackPosAvg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaMemcpy(dSlackNegAvg_.data(), d_dSlackNegAvg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+      results_ = average_results;
+      return solveReturn(TerminationStatus::OPTIMAL);
+    }
+
+    // === CPU Convergence Check ===
    hipdlpTimerStart(kHipdlpClockConvergenceCheck);
    // Compute residuals for current iterate
    bool current_converged = checkConvergence(
@@ -1811,8 +1855,18 @@ void PDLPSolver::setupGpu(){
  CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
  CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
  CUDA_CHECK(cudaMalloc(&d_aty_next_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
  CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
  CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
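+  // Scratch for the device-side convergence check: four scalars reduced on the
+  // GPU (||primal residual||^2, ||dual residual||^2, primal and dual objective).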
+  CUDA_CHECK(cudaMalloc(&d_convergence_results_, 4 * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_col_scale_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_row_scale_, a_num_rows_ * sizeof(double)));

  CUDA_CHECK(cudaMemcpy(d_col_cost_, lp_.col_cost_.data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_col_lower_, lp_.col_lower_.data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
@@ -1864,11 +1918,21 @@ void PDLPSolver::setupGpu(){
  CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
  CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
  CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
  CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
  CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
  CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
  sum_weights_gpu_ = 0.0;

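+  // Upload the scaling vectors if the problem was scaled; otherwise release
+  // the buffers and keep null pointers, which lets the convergence kernels
+  // detect the unscaled case.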
+  if (scaling_.IsScaled()) {
+    CUDA_CHECK(cudaMemcpy(d_col_scale_, scaling_.GetColScaling().data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_row_scale_, scaling_.GetRowScaling().data(), a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice));
+  } else {
+    CUDA_CHECK(cudaFree(d_col_scale_)); d_col_scale_ = nullptr;
+    CUDA_CHECK(cudaFree(d_row_scale_)); d_row_scale_ = nullptr;
+  }
+
  highsLogUser(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n");
}

@@ -1893,12 +1957,21 @@ void PDLPSolver::cleanupGpu(){
  CUDA_CHECK(cudaFree(d_y_next_));
  CUDA_CHECK(cudaFree(d_ax_current_));
  CUDA_CHECK(cudaFree(d_aty_current_));
+  CUDA_CHECK(cudaFree(d_ax_avg_));
+  CUDA_CHECK(cudaFree(d_aty_avg_));
  CUDA_CHECK(cudaFree(d_ax_next_));
  CUDA_CHECK(cudaFree(d_aty_next_));
  CUDA_CHECK(cudaFree(d_x_sum_));
  CUDA_CHECK(cudaFree(d_y_sum_));
  CUDA_CHECK(cudaFree(d_spmv_buffer_ax_));
  CUDA_CHECK(cudaFree(d_spmv_buffer_aty_));
+  CUDA_CHECK(cudaFree(d_convergence_results_));
+  CUDA_CHECK(cudaFree(d_dSlackPos_));
+  CUDA_CHECK(cudaFree(d_dSlackNeg_));
+  CUDA_CHECK(cudaFree(d_dSlackPosAvg_));
+  CUDA_CHECK(cudaFree(d_dSlackNegAvg_));
+  if (d_col_scale_) CUDA_CHECK(cudaFree(d_col_scale_));
+  if (d_row_scale_) CUDA_CHECK(cudaFree(d_row_scale_));
}

void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out) {
@@ -1974,4 +2047,53 @@ void PDLPSolver::launchKernelUpdateAverages(double weight) {
void PDLPSolver::launchKernelScaleVector(double* d_out, const double* d_in, double scale, int n) {
  launchKernelScaleVector_wrapper(d_out, d_in, scale, n);
  CUDA_CHECK(cudaGetLastError());
+}
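+
+// The wrapper is presumably a thin launch around an elementwise kernel. A
+// minimal sketch, assuming a 256-thread block; names and launch configuration
+// are illustrative, not the committed implementation:
+//
+//   __global__ void scaleVectorKernel(double* out, const double* in,
+//                                     double scale, int n) {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n) out[i] = scale * in[i];  // out = scale * in, elementwise
+//   }
+//
+//   void launchKernelScaleVector_wrapper(double* d_out, const double* d_in,
+//                                        double scale, int n) {
+//     const int block = 256;
+//     const int grid = (n + block - 1) / block;
+//     scaleVectorKernel<<<grid, block>>>(d_out, d_in, scale, n);
+//   }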
+
+bool PDLPSolver::checkConvergenceGpu(
+    const int iter,
+    const double* d_x, const double* d_y,
+    const double* d_ax, const double* d_aty,
+    double epsilon, SolverResults& results, const char* type) {
+  launchCheckConvergenceKernels_wrapper(
+      d_convergence_results_,
+      d_dSlackPos_, d_dSlackNeg_,
+      d_x, d_y, d_ax, d_aty,
+      d_col_cost_, d_row_lower_,
+      d_col_lower_, d_col_upper_,
+      d_is_equality_row_,
+      d_col_scale_, d_row_scale_,
+      lp_.num_col_, lp_.num_row_);
+
+  // Copy the four reduced scalars back to the host; this D2H copy on the
+  // default stream also synchronizes with the kernels above.
+  double h_results[4];
+  CUDA_CHECK(cudaMemcpy(h_results, d_convergence_results_, 4 * sizeof(double), cudaMemcpyDeviceToHost));
+
+  double primal_feas_sq = h_results[0];
+  double dual_feas_sq = h_results[1];
+  double primal_obj = h_results[2] + lp_.offset_;
+  double dual_obj = h_results[3] + lp_.offset_;
+
+  results.primal_feasibility = std::sqrt(primal_feas_sq);
+  results.dual_feasibility = std::sqrt(dual_feas_sq);
+  results.primal_obj = primal_obj;
+  results.dual_obj = dual_obj;
+
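+  // The gap is normalized by 1 + |primal_obj| + |dual_obj|, so the tolerance
+  // acts relatively for large objectives and absolutely near zero.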
+  double duality_gap = primal_obj - dual_obj;
+  results.duality_gap = std::abs(duality_gap);
+  results.relative_obj_gap = std::abs(duality_gap) / (1.0 + std::abs(primal_obj) + std::abs(dual_obj));
+
+  debugPdlpFeasOptLog(debug_pdlp_log_file_, iter, primal_obj, dual_obj,
+                      results.relative_obj_gap,
+                      results.primal_feasibility / (1.0 + unscaled_rhs_norm_),
+                      results.dual_feasibility / (1.0 + unscaled_c_norm_), type);
+
+  bool primal_feasible = results.primal_feasibility < epsilon * (1.0 + unscaled_rhs_norm_);
+  bool dual_feasible = results.dual_feasibility < epsilon * (1.0 + unscaled_c_norm_);
+  bool gap_small = results.relative_obj_gap < epsilon;
+
+  return primal_feasible && dual_feasible && gap_small;
}
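+
+// One plausible shape for a reduction inside launchCheckConvergenceKernels_wrapper.
+// A sketch only: the kernel name, residual convention, row-flag type, and
+// scaling direction are assumptions, not the committed code. Each block reduces
+// its partial sum of squared primal residuals in shared memory, then atomically
+// accumulates into d_convergence_results_[0], which must be zeroed before
+// launch; atomicAdd on double requires compute capability 6.0+.
+//
+//   __global__ void primalResidualSqKernel(double* results, const double* ax,
+//                                          const double* row_lower,
+//                                          const char* is_equality_row,
+//                                          const double* row_scale, int m) {
+//     __shared__ double cache[256];  // assumes blockDim.x == 256
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     double r = 0.0;
+//     if (i < m) {
+//       double ri = row_lower[i] - ax[i];               // equality row: b - Ax
+//       if (!is_equality_row[i] && ri < 0.0) ri = 0.0;  // inequality: max(b - Ax, 0)
+//       if (row_scale) ri *= row_scale[i];              // undo row scaling (direction depends on convention)
+//       r = ri * ri;
+//     }
+//     cache[threadIdx.x] = r;
+//     __syncthreads();
+//     for (int s = blockDim.x / 2; s > 0; s >>= 1) {    // tree reduction within the block
+//       if (threadIdx.x < s) cache[threadIdx.x] += cache[threadIdx.x + s];
+//       __syncthreads();
+//     }
+//     if (threadIdx.x == 0) atomicAdd(&results[0], cache[0]);
+//   }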