@@ -572,19 +572,6 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 CUDA_CHECK(cudaMemcpy(y_current_.data(), d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
 CUDA_CHECK(cudaMemcpy(Ax_cache_.data(), d_ax_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
 CUDA_CHECK(cudaMemcpy(ATy_cache_.data(), d_aty_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
-*/
-/*
-CUDA_CHECK(cudaMemcpy(x_sum_.data(), d_x_sum_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
-CUDA_CHECK(cudaMemcpy(y_sum_.data(), d_y_sum_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
-CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
-CUDA_CHECK(cudaMemcpy(y_avg_.data(), d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
-
-double dScale_gpu = sum_weights_gpu_ > 0.0 ? 1.0 / sum_weights_gpu_ : 0.0;
-launchKernelScaleVector(d_x_avg_, d_x_sum_, dScale_gpu, lp_.num_col_);
-launchKernelScaleVector(d_y_avg_, d_y_sum_, dScale_gpu, lp_.num_row_);
-
-linalgGpuAx(d_x_avg_, d_ax_avg_);
-linalgGpuATy(d_y_avg_, d_aty_avg_);
-
 */
 computeAverageIterateGpu();
 #endif
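
Note: the deleted block is exactly what the new `computeAverageIterateGpu()` helper now encapsulates: scale the running sums by the reciprocal of the accumulated step weights to form the average iterate, then refresh the `Ax`/`ATy` caches. A minimal host-side sketch of that averaging step, assuming only what the deleted lines show (the free function here is hypothetical):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical host-side equivalent of the averaging consolidated into
// computeAverageIterateGpu(): x_avg = x_sum / sum_weights, guarded so an
// empty accumulator yields zeros instead of dividing by zero.
void averageIterate(const std::vector<double>& x_sum, double sum_weights,
                    std::vector<double>& x_avg) {
  const double scale = sum_weights > 0.0 ? 1.0 / sum_weights : 0.0;
  x_avg.resize(x_sum.size());
  for (std::size_t i = 0; i < x_sum.size(); ++i) x_avg[i] = x_sum[i] * scale;
  // On the device this is launchKernelScaleVector(d_x_avg_, d_x_sum_, scale, n),
  // followed by linalgGpuAx/linalgGpuATy to refresh d_ax_avg_/d_aty_avg_.
}
```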
@@ -612,7 +599,27 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 d_ax_avg_, d_aty_avg_,
 params_.tolerance, average_results, " [A-GPU]"
 );
+#else
+// === CPU Convergence Check ===//
+hipdlpTimerStart(kHipdlpClockConvergenceCheck);
+// Compute residuals for current iterate
+bool current_converged = checkConvergence(
+    iter, x_current_, y_current_, Ax_cache_, ATy_cache_,
+    params_.tolerance, current_results, " [L]", dSlackPos_, dSlackNeg_);
+
+// Compute residuals for average iterate
+bool average_converged = checkConvergence(
+    iter, x_avg_, y_avg_, Ax_avg, ATy_avg, params_.tolerance,
+    average_results, " [A]", dSlackPosAvg_, dSlackNegAvg_);
+hipdlpTimerStop(kHipdlpClockConvergenceCheck);
+#endif
+debugPdlpIterHeaderLog(debug_pdlp_log_file_);
+
+// Print iteration statistics
+logger_.print_iteration_stats(iter, average_results, current_eta_);
 
+// Check for convergence
+#ifdef CUPDLP_GPU
 if (current_converged_gpu) {
   logger_.info(" Current GPU solution converged in " + std::to_string(iter) +
                " iterations.");
@@ -636,26 +643,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 results_ = average_results;
 return solveReturn(TerminationStatus::OPTIMAL);
 }
-#endif
-// === CPU Convergence Check ===//
-hipdlpTimerStart(kHipdlpClockConvergenceCheck);
-// Compute residuals for current iterate
-bool current_converged = checkConvergence(
-    iter, x_current_, y_current_, Ax_cache_, ATy_cache_,
-    params_.tolerance, current_results, " [L]", dSlackPos_, dSlackNeg_);
-
-// Compute residuals for average iterate
-bool average_converged = checkConvergence(
-    iter, x_avg_, y_avg_, Ax_avg, ATy_avg, params_.tolerance,
-    average_results, " [A]", dSlackPosAvg_, dSlackNegAvg_);
-hipdlpTimerStop(kHipdlpClockConvergenceCheck);
-
-debugPdlpIterHeaderLog(debug_pdlp_log_file_);
-
-// Print iteration statistics
-logger_.print_iteration_stats(iter, average_results, current_eta_);
-
-// Check for convergence
+#else
 if (current_converged) {
   logger_.info(" Current solution converged in " + std::to_string(iter) +
                " iterations.");
@@ -677,7 +665,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 results_ = average_results;
 return solveReturn(TerminationStatus::OPTIMAL);
 }
-
+#endif
 // --- 4. Restart Check (using computed results) ---
 RestartInfo restart_info =
     restart_scheme_.Check(iter, current_results, average_results);
@@ -691,17 +679,17 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 restart_scheme_.duality_gap_last_restart_ =
     average_results.duality_gap;
 
-x_current_ = x_avg_;
-y_current_ = y_avg_;
-
-Ax_cache_ = Ax_avg;
-ATy_cache_ = ATy_avg;
-
 #ifdef CUPDLP_GPU
 CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
 CUDA_CHECK(cudaMemcpy(d_y_current_, d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
 linalgGpuAx(d_x_current_, d_ax_current_);
 linalgGpuATy(d_y_current_, d_aty_current_);
+#else
+x_current_ = x_avg_;
+y_current_ = y_avg_;
+
+Ax_cache_ = Ax_avg;
+ATy_cache_ = ATy_avg;
 #endif
 } else {
 restart_scheme_.primal_feas_last_restart_ =
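
On a restart to the average iterate each backend now stays on its own side of the host/device boundary: the GPU path copies device buffers and recomputes the products with `linalgGpuAx`/`linalgGpuATy`, while the host path reuses the already-computed `Ax_avg`/`ATy_avg`. A condensed host-side sketch (the struct is hypothetical):

```cpp
#include <vector>

struct Iterate {
  std::vector<double> x, y, Ax, ATy;  // primal/dual point and cached products
};

// Host-side restart: replace the current iterate and its cached products
// with the average's, so no matrix-vector product is needed on this path.
void restartToAverage(const Iterate& avg, Iterate& current) {
  current.x = avg.x;      // x_current_ = x_avg_
  current.y = avg.y;      // y_current_ = y_avg_
  current.Ax = avg.Ax;    // Ax_cache_ = Ax_avg
  current.ATy = avg.ATy;  // ATy_cache_ = ATy_avg
}
```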
@@ -726,26 +714,24 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 current_eta_ = working_params.eta;
 restart_scheme_.passParams(&working_params);
 
-x_at_last_restart_ = x_current_;  // Current becomes the new last
-y_at_last_restart_ = y_current_;
-
-std::fill(x_sum_.begin(), x_sum_.end(), 0.0);
-std::fill(y_sum_.begin(), y_sum_.end(), 0.0);
-sum_weights_ = 0.0;
-
 #ifdef CUPDLP_GPU
 CUDA_CHECK(cudaMemcpy(d_x_at_last_restart_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
 CUDA_CHECK(cudaMemcpy(d_y_at_last_restart_, d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
 CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
 CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
 sum_weights_gpu_ = 0.0;
+#else
+x_at_last_restart_ = x_current_;  // Current becomes the new last
+y_at_last_restart_ = y_current_;
+
+std::fill(x_sum_.begin(), x_sum_.end(), 0.0);
+std::fill(y_sum_.begin(), y_sum_.end(), 0.0);
+sum_weights_ = 0.0;
 #endif
 
 restart_scheme_.last_restart_iter_ = iter;
-// Recompute Ax and ATy for the restarted iterates
-hipdlpTimerStart(kHipdlpClockMatrixMultiply);
-linalg::Ax(lp, x_current_, Ax_cache_);
 
+// Recompute Ax and ATy for the restarted iterates
 #ifdef CUPDLP_GPU
 launchKernelUpdateX(stepsize_.primal_step);
 linalgGpuAx(d_x_next_, d_ax_next_);
@@ -760,13 +746,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 CUDA_CHECK(cudaMemcpy(y_next_gpu.data(), d_y_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
 CUDA_CHECK(cudaMemcpy(ax_next_gpu.data(), d_ax_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
 CUDA_CHECK(cudaMemcpy(aty_next_gpu.data(), d_aty_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
-#endif
-
+#else
+hipdlpTimerStart(kHipdlpClockMatrixMultiply);
+linalg::Ax(lp, x_current_, Ax_cache_);
 hipdlpTimerStop(kHipdlpClockMatrixMultiply);
 hipdlpTimerStart(kHipdlpClockMatrixTransposeMultiply);
 linalg::ATy(lp, y_current_, ATy_cache_);
 hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);
-
+#endif
 restart_scheme_.SetLastRestartIter(iter);
 }
 }
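
After a restart the averaging state must be cleared so the next average covers only post-restart iterates. A host-side sketch of the reset, mirroring the `cudaMemset` calls on the device side:

```cpp
#include <algorithm>
#include <vector>

// Clear the averaging accumulators after a restart; the GPU build does the
// same with cudaMemset on d_x_sum_/d_y_sum_ and sum_weights_gpu_ = 0.0.
void resetAverageState(std::vector<double>& x_sum, std::vector<double>& y_sum,
                       double& sum_weights) {
  std::fill(x_sum.begin(), x_sum.end(), 0.0);
  std::fill(y_sum.begin(), y_sum.end(), 0.0);
  sum_weights = 0.0;
}
```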
@@ -777,15 +764,19 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 
 // Store current iterates before update (for next iteration's x_current_,
 // y_current_)
+#ifdef CUPDLP_GPU
+CUDA_CHECK(cudaMemcpy(d_x_next_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
+CUDA_CHECK(cudaMemcpy(d_y_next_, d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
+// Copy Ax and ATy cache to host
+CUDA_CHECK(cudaMemcpy(Ax_cache_.data(), d_ax_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+CUDA_CHECK(cudaMemcpy(ATy_cache_.data(), d_aty_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+debug_pdlp_data_.ax_norm = linalg::vector_norm(Ax_cache_);
+debug_pdlp_data_.aty_norm = linalg::vector_norm(ATy_cache_);
+#else
 x_next_ = x_current_;
 y_next_ = y_current_;
-
 debug_pdlp_data_.ax_norm = linalg::vector_norm(Ax_cache_);
 debug_pdlp_data_.aty_norm = linalg::vector_norm(ATy_cache_);
-
-#ifdef CUPDLP_GPU
-CUDA_CHECK(cudaMemcpy(d_x_next_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
-CUDA_CHECK(cudaMemcpy(d_y_next_, d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
 #endif
 
 switch (params_.step_size_strategy) {
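
In the GPU build the only host work left at this point is pulling the two caches back for the debug norms. Assuming `linalg::vector_norm` is the Euclidean norm (only its call site is visible in this diff), it amounts to:

```cpp
#include <cmath>
#include <vector>

// Plain Euclidean norm; assumed to match linalg::vector_norm as used for
// debug_pdlp_data_.ax_norm and debug_pdlp_data_.aty_norm above.
double vectorNorm(const std::vector<double>& v) {
  double sum_sq = 0.0;
  for (double vi : v) sum_sq += vi * vi;
  return std::sqrt(sum_sq);
}
```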
@@ -2159,7 +2150,6 @@ bool PDLPSolver::checkConvergenceGpu(
 bool gap_small = results.relative_obj_gap < epsilon;
 
 return primal_feasible && dual_feasible && gap_small;
-
 }
 
 void PDLPSolver::computeStepSizeRatioGpu(PrimalDualParams& working_params) {
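
For reference, the test returned above declares optimality only when all three relative measures are below the tolerance. Only `relative_obj_gap` is named in this hunk; the residual field names below are assumed for illustration:

```cpp
// Sketch of the combined termination test: optimal when the relative primal
// residual, relative dual residual, and relative objective gap are all
// below epsilon. Field names other than relative_obj_gap are hypothetical.
struct Residuals {
  double rel_primal_res;
  double rel_dual_res;
  double relative_obj_gap;
};

bool isOptimal(const Residuals& r, double epsilon) {
  return r.rel_primal_res < epsilon && r.rel_dual_res < epsilon &&
         r.relative_obj_gap < epsilon;
}
```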
@@ -2210,6 +2200,11 @@ void PDLPSolver::computeAverageIterateGpu() {
 // Recompute Ax_avg and ATy_avg on GPU
 linalgGpuAx(d_x_avg_, d_ax_avg_);
 linalgGpuATy(d_y_avg_, d_aty_avg_);
+// copy x_avg to host
+CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+debug_pdlp_data_.x_average_norm = linalg::vector_norm_squared(x_avg_);
+
+// debug_pdlp_data_.ax_average_norm = computeDiffNormCuBLAS;
 }
 
 double PDLPSolver::computeMovementGpu(
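
The commented-out `computeDiffNormCuBLAS` line suggests the intended follow-up: compute the norm on the device and avoid the device-to-host copy added above. A possible sketch with cuBLAS (assumes an initialized handle; error handling omitted):

```cpp
#include <cublas_v2.h>

// Device-side alternative to the host round-trip: cublasDnrm2 writes ||x||_2
// into a host scalar, and squaring it matches what linalg::vector_norm_squared
// computes on the copied-back x_avg_.
double xAvgNormSquaredOnDevice(cublasHandle_t handle, const double* d_x_avg,
                               int n) {
  double norm = 0.0;
  cublasDnrm2(handle, n, d_x_avg, 1, &norm);
  return norm * norm;
}
```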