2020#include " lp_data/HConst.h"
2121#include " pdlp/cupdlp/cupdlp.h" // For pdlpLogging
2222#include " restart.hpp"
23+ #include " pdlp_gpu_debug.hpp"
2324
2425#define PDHG_CHECK_INTERVAL 40
2526static constexpr double kDivergentMovement = 1e10 ;
@@ -490,8 +491,8 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
490491 }
491492
492493 // --- 0. Using PowerMethod to estimate the largest eigenvalue ---
493- initializeStepSizes ();
494494 setupGpu ();
495+ initializeStepSizes ();
495496 PrimalDualParams working_params = params_;
496497 working_params.omega = std::sqrt (stepsize_.dual_step / stepsize_.primal_step );
497498 working_params.eta = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
@@ -515,8 +516,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
515516 linalg::project_bounds (lp_, x_current_);
516517 linalg::project_bounds (lp_, x_sum_);
517518 linalg::project_bounds (lp_, x_avg_);
518- // linalg::Ax(lp, x_current_, Ax_cache_);
519- linalgGpuAx (x_current_, Ax_cache_);
519+ linalg::Ax (lp, x_current_, Ax_cache_);
520520 std::vector<double > Ax_avg = Ax_cache_;
521521 std::vector<double > ATy_avg (lp.num_col_ , 0.0 );
522522
@@ -648,9 +648,17 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
648648 // Recompute Ax and ATy for the restarted iterates
649649 hipdlpTimerStart (kHipdlpClockMatrixMultiply );
650650 linalg::Ax (lp, x_current_, Ax_cache_);
651+ std::vector<double > cup_ax = Ax_cache_;
652+ std::vector<double > gpu_ax (lp_.num_row_ ,0.0 );
653+ linalgGpuAx (x_current_, gpu_ax);
654+ bool are_ax_same = vecDiff (cup_ax, gpu_ax, 1e-12 ," restart Ax" );
651655 hipdlpTimerStop (kHipdlpClockMatrixMultiply );
652656 hipdlpTimerStart (kHipdlpClockMatrixTransposeMultiply );
653657 linalg::ATy (lp, y_current_, ATy_cache_);
658+ std::vector<double > cup_aty = ATy_cache_;
659+ std::vector<double > gpu_aty (lp_.num_col_ ,0.0 );
660+ linalgGpuATy (y_current_, gpu_aty);
661+ bool are_aty_same = vecDiff (cup_aty, gpu_aty, 1e-12 ," restart ATy" );
654662 hipdlpTimerStop (kHipdlpClockMatrixTransposeMultiply );
655663
656664 restart_scheme_.SetLastRestartIter (iter);
@@ -822,10 +830,18 @@ void PDLPSolver::computeAverageIterate(std::vector<double>& ax_avg,
822830
823831 hipdlpTimerStart (kHipdlpClockAverageIterateMatrixMultiply );
824832 linalg::Ax (lp_, x_avg_, ax_avg);
833+ std::vector<double > cup_ax = ax_avg;
834+ std::vector<double > gpu_ax (lp_.num_row_ ,0.0 );
835+ linalgGpuAx (x_avg_, gpu_ax);
836+ bool are_ax_same = vecDiff (cup_ax, gpu_ax, 1e-12 ," computeAverageIterate" );
825837 hipdlpTimerStop (kHipdlpClockAverageIterateMatrixMultiply );
826838
827839 hipdlpTimerStart (kHipdlpClockAverageIterateMatrixTransposeMultiply );
828840 linalg::ATy (lp_, y_avg_, aty_avg);
841+ std::vector<double > cup_aty = aty_avg;
842+ std::vector<double > gpu_aty (lp_.num_col_ ,0.0 );
843+ linalgGpuATy (y_avg_, gpu_aty);
844+ bool are_aty_same = vecDiff (cup_aty, gpu_aty, 1e-12 ," computeAverageIterate" );
829845 hipdlpTimerStop (kHipdlpClockAverageIterateMatrixTransposeMultiply );
830846
831847 debug_pdlp_data_.ax_average_norm = linalg::vector_norm_squared (ax_avg);
@@ -1138,7 +1154,15 @@ double PDLPSolver::PowerMethod() {
11381154 //
11391155 // Compute A'Ax = A'(Ax)
11401156 linalg::Ax (lp, x_vec, y_vec);
1157+ std::vector<double > ax_cpu = y_vec; // For timing consistency
1158+ std::vector<double > ax_gpu (lp.num_row_ );
1159+ linalgGpuAx (x_vec, ax_gpu);
1160+ bool are_same = vecDiff (ax_cpu, ax_gpu, 1e-12 , " Power method Ax" );
11411161 linalg::ATy (lp, y_vec, z_vec); // Note: ATy computes A' * vector
1162+ std::vector<double > atx_cpu = z_vec; // For timing consistency
1163+ std::vector<double > atx_gpu (lp.num_col_ );
1164+ linalgGpuATy (y_vec, atx_gpu);
1165+ are_same = vecDiff (atx_cpu, atx_gpu, 1e-12 , " Power method A'Tx" );
11421166
11431167 // Estimate the squared operator norm (largest eigenvalue of A'A)
11441168 op_norm_sq = linalg::dot (
@@ -1432,6 +1456,10 @@ void PDLPSolver::updateIteratesFixed() {
14321456
14331457 hipdlpTimerStart (kHipdlpClockMatrixMultiply );
14341458 linalg::Ax (lp_, x_next_, Ax_next_);
1459+ std::vector<double > cup_ax = Ax_next_;
1460+ std::vector<double > gpu_ax (lp_.num_row_ ,0.0 );
1461+ linalgGpuAx (x_next_, gpu_ax);
1462+ bool are_ax_same = vecDiff (cup_ax, gpu_ax, 1e-12 ," updateIteratesFixed" );
14351463 hipdlpTimerStop (kHipdlpClockMatrixMultiply );
14361464
14371465 hipdlpTimerStart (kHipdlpClockProjectY );
@@ -1440,6 +1468,10 @@ void PDLPSolver::updateIteratesFixed() {
14401468
14411469 hipdlpTimerStart (kHipdlpClockMatrixTransposeMultiply );
14421470 linalg::ATy (lp_, y_next_, ATy_next_);
1471+ std::vector<double > cup_aty = ATy_next_;
1472+ std::vector<double > gpu_aty (lp_.num_col_ ,0.0 );
1473+ linalgGpuATy (y_next_, gpu_aty);
1474+ bool are_aty_same = vecDiff (cup_aty, gpu_aty, 1e-12 ," updateIteratesFixed" );
14431475 hipdlpTimerStop (kHipdlpClockMatrixTransposeMultiply );
14441476}
14451477
@@ -1487,6 +1519,10 @@ void PDLPSolver::updateIteratesAdaptive() {
14871519
14881520 hipdlpTimerStart (kHipdlpClockMatrixMultiply );
14891521 linalg::Ax (lp_, xupdate, axupdate);
1522+ std::vector<double > cup_ax = axupdate;
1523+ std::vector<double > gpu_ax (lp_.num_row_ ,0.0 );
1524+ linalgGpuAx (xupdate, gpu_ax);
1525+ bool are_ax_same = vecDiff (cup_ax, gpu_ax, 1e-12 ," updateIteratesAdaptive" );
14901526 hipdlpTimerStop (kHipdlpClockMatrixMultiply );
14911527
14921528 // Dual update with timing
@@ -1496,6 +1532,10 @@ void PDLPSolver::updateIteratesAdaptive() {
14961532
14971533 hipdlpTimerStart (kHipdlpClockMatrixTransposeMultiply );
14981534 linalg::ATy (lp_, yupdate, atyupdate);
1535+ std::vector<double > cup_aty = atyupdate;
1536+ std::vector<double > gpu_aty (lp_.num_col_ ,0.0 );
1537+ linalgGpuATy (yupdate, gpu_aty);
1538+ bool are_aty_same = vecDiff (cup_aty, gpu_aty, 1e-12 ," updateIteratesAdaptive" );
14991539 hipdlpTimerStop (kHipdlpClockMatrixTransposeMultiply );
15001540
15011541 // Compute deltas
@@ -1671,15 +1711,14 @@ void PDLPSolver::setupGpu(){
16711711 const std::vector<double >& h_a_val = lp_csr.value_ ;
16721712
16731713 // 3. Allocate and copy A's CSR data to GPU
1674- CUDA_CHECK (cudaMalloc ((void **)&d_a_row_ptr_, (a_num_cols_ + 1 ) * sizeof (int )));
1714+ CUDA_CHECK (cudaMalloc ((void **)&d_a_row_ptr_, (a_num_rows_ + 1 ) * sizeof (int )));
16751715 CUDA_CHECK (cudaMalloc ((void **)&d_a_col_ind_, a_nnz_ * sizeof (int )));
16761716 CUDA_CHECK (cudaMalloc ((void **)&d_a_val_, a_nnz_ * sizeof (double )));
1677-
1678- CUDA_CHECK (cudaMemcpy (d_a_row_ptr_, h_a_row_ptr.data (), (a_num_cols_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
1717+ CUDA_CHECK (cudaMemcpy (d_a_row_ptr_, h_a_row_ptr.data (), (a_num_rows_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
16791718 CUDA_CHECK (cudaMemcpy (d_a_col_ind_, h_a_col_ind.data (), a_nnz_ * sizeof (int ), cudaMemcpyHostToDevice));
16801719 CUDA_CHECK (cudaMemcpy (d_a_val_, h_a_val.data (), a_nnz_ * sizeof (double ), cudaMemcpyHostToDevice));
16811720
1682- CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_T_csr_, a_num_cols_, a_num_rows_ , a_nnz_,
1721+ CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_csr_, a_num_rows_, a_num_cols_ , a_nnz_,
16831722 d_a_row_ptr_, d_a_col_ind_, d_a_val_,
16841723 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
16851724 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
@@ -1689,15 +1728,16 @@ void PDLPSolver::setupGpu(){
16891728 const std::vector<int >& h_at_col_ind = lp_.a_matrix_ .index_ ;
16901729 const std::vector<double >& h_at_val = lp_.a_matrix_ .value_ ;
16911730
1692- CUDA_CHECK (cudaMalloc ((void **)&d_at_row_ptr_, (a_num_rows_ + 1 ) * sizeof (int )));
1731+ CUDA_CHECK (cudaMalloc ((void **)&d_at_row_ptr_, (a_num_cols_ + 1 ) * sizeof (int ))); // Fixed!
16931732 CUDA_CHECK (cudaMalloc ((void **)&d_at_col_ind_, a_nnz_ * sizeof (int )));
1694- CUDA_CHECK (cudaMalloc ((void **)&d_at_val_, a_nnz_ * sizeof (double )));
1733+ CUDA_CHECK (cudaMalloc ((void **)&d_at_val_, a_nnz_ * sizeof (double )));
16951734
1696- CUDA_CHECK (cudaMemcpy (d_at_row_ptr_, h_at_row_ptr.data (), (a_num_rows_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
1735+ CUDA_CHECK (cudaMemcpy (d_at_row_ptr_, h_at_row_ptr.data (), (a_num_cols_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
16971736 CUDA_CHECK (cudaMemcpy (d_at_col_ind_, h_at_col_ind.data (), a_nnz_ * sizeof (int ), cudaMemcpyHostToDevice));
1698- CUDA_CHECK (cudaMemcpy (d_at_val_, h_at_val.data (), a_nnz_ * sizeof (double ), cudaMemcpyHostToDevice));
1737+ CUDA_CHECK (cudaMemcpy (d_at_val_, h_at_val.data (), a_nnz_ * sizeof (double ), cudaMemcpyHostToDevice));
16991738
1700- CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_,
1739+ // Create AT descriptor with swapped dimensions
1740+ CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_, // Dimensions swapped!
17011741 d_at_row_ptr_, d_at_col_ind_, d_at_val_,
17021742 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
17031743 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
0 commit comments