@@ -691,6 +691,13 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
691691
692692 Ax_cache_ = Ax_avg;
693693 ATy_cache_ = ATy_avg;
694+
695+ #ifdef CUPDLP_GPU
696+ CUDA_CHECK (cudaMemcpy (d_x_current_, d_x_avg_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
697+ CUDA_CHECK (cudaMemcpy (d_y_current_, d_y_avg_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToDevice));
698+ linalgGpuAx (d_x_current_, d_ax_current_);
699+ linalgGpuATy (d_y_current_, d_aty_current_);
700+ #endif
694701 } else {
695702 restart_scheme_.primal_feas_last_restart_ =
696703 current_results.primal_feasibility ;
@@ -701,6 +708,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
701708 }
702709
703710 // Perform the primal weight update using z^{n,0} and z^{n-1,0}
711+ #ifdef CUPDLP_GPU
712+ computeStepSizeRatioGpu (working_params);
713+ #endif
704714 computeStepSizeRatio (working_params);
705715 current_eta_ = working_params.eta ;
706716 restart_scheme_.passParams (&working_params);
@@ -712,6 +722,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
712722 std::fill (y_sum_.begin (), y_sum_.end (), 0.0 );
713723 sum_weights_ = 0.0 ;
714724
725+ #ifdef CUPDLP_GPU
726+ CUDA_CHECK (cudaMemcpy (d_x_at_last_restart_, d_x_current_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
727+ CUDA_CHECK (cudaMemcpy (d_y_at_last_restart_, d_y_current_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToDevice));
728+ CUDA_CHECK (cudaMemset (d_x_sum_, 0 , a_num_cols_ * sizeof (double )));
729+ CUDA_CHECK (cudaMemset (d_y_sum_, 0 , a_num_rows_ * sizeof (double )));
730+ sum_weights_gpu_ = 0.0 ;
731+ #endif
732+
715733 restart_scheme_.last_restart_iter_ = iter;
716734 // Recompute Ax and ATy for the restarted iterates
717735 hipdlpTimerStart (kHipdlpClockMatrixMultiply );
@@ -1851,6 +1869,10 @@ void PDLPSolver::setupGpu(){
18511869 CUDA_CHECK (cudaMalloc (&d_y_avg_, a_num_rows_ * sizeof (double )));
18521870 CUDA_CHECK (cudaMalloc (&d_x_next_, a_num_cols_ * sizeof (double )));
18531871 CUDA_CHECK (cudaMalloc (&d_y_next_, a_num_rows_ * sizeof (double )));
1872+ CUDA_CHECK (cudaMalloc (&d_x_at_last_restart_, a_num_cols_ * sizeof (double )));
1873+ CUDA_CHECK (cudaMalloc (&d_y_at_last_restart_, a_num_rows_ * sizeof (double )));
1874+ CUDA_CHECK (cudaMalloc (&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof (double )));
1875+ CUDA_CHECK (cudaMalloc (&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof (double )));
18541876 CUDA_CHECK (cudaMalloc (&d_ax_current_, a_num_rows_ * sizeof (double )));
18551877 CUDA_CHECK (cudaMalloc (&d_aty_current_, a_num_cols_ * sizeof (double )));
18561878 CUDA_CHECK (cudaMalloc (&d_ax_next_, a_num_rows_ * sizeof (double )));
@@ -1951,6 +1973,10 @@ void PDLPSolver::cleanupGpu(){
19511973 CUDA_CHECK (cudaFree (d_col_upper_));
19521974 CUDA_CHECK (cudaFree (d_row_lower_));
19531975 CUDA_CHECK (cudaFree (d_is_equality_row_));
1976+ CUDA_CHECK (cudaFree (d_x_at_last_restart_));
1977+ CUDA_CHECK (cudaFree (d_y_at_last_restart_));
1978+ CUDA_CHECK (cudaFree (d_x_temp_diff_norm_result_));
1979+ CUDA_CHECK (cudaFree (d_y_temp_diff_norm_result_));
19541980 CUDA_CHECK (cudaFree (d_x_current_));
19551981 CUDA_CHECK (cudaFree (d_y_current_));
19561982 CUDA_CHECK (cudaFree (d_x_next_));
@@ -2096,4 +2122,38 @@ bool PDLPSolver::checkConvergenceGpu(
20962122
20972123 return primal_feasible && dual_feasible && gap_small;
20982124
2125+ }
2126+ void PDLPSolver::computeStepSizeRatioGpu (PrimalDualParams& working_params) {
2127+ // 1. Compute ||x_last - x_current||^2 on GPU
2128+ launchKernelDiffTwoNormSquared_wrapper (d_x_at_last_restart_, d_x_current_, d_x_temp_diff_norm_result_, a_num_cols_);
2129+
2130+ double primal_diff_sq;
2131+ CUDA_CHECK (cudaMemcpy (&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2132+ double primal_diff_norm = std::sqrt (primal_diff_sq);
2133+
2134+ // 2. Compute ||y_last - y_current||^2 on GPU
2135+ launchKernelDiffTwoNormSquared_wrapper (d_y_at_last_restart_, d_y_current_, d_y_temp_diff_norm_result_, a_num_rows_);
2136+
2137+ double dual_diff_sq;
2138+ CUDA_CHECK (cudaMemcpy (&dual_diff_sq, d_y_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2139+ double dual_diff_norm = std::sqrt (dual_diff_sq);
2140+
2141+ double dMeanStepSize = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
2142+
2143+ // 3. Compute new beta
2144+ if (std::min (primal_diff_norm, dual_diff_norm) > 1e-10 ) {
2145+ double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2146+ double old_beta = stepsize_.beta ;
2147+
2148+ double dLogBetaUpdate =
2149+ 0.5 * std::log (beta_update_ratio) + 0.5 * std::log (std::sqrt (old_beta));
2150+ stepsize_.beta = std::exp (2.0 * dLogBetaUpdate);
2151+ }
2152+
2153+ // Update steps
2154+ stepsize_.primal_step = dMeanStepSize / std::sqrt (stepsize_.beta );
2155+ stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta ;
2156+ working_params.eta = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
2157+ working_params.omega = std::sqrt (stepsize_.beta );
2158+ restart_scheme_.UpdateBeta (stepsize_.beta );
20992159}
0 commit comments