Skip to content

Commit cf5b809

Browse files
committed
fix setupGPU
1 parent 874b652 commit cf5b809

File tree

2 files changed

+56
-16
lines changed

2 files changed

+56
-16
lines changed

check/TestPdlp.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {
342342

343343
TEST_CASE("hi-pdlp", "[pdlp]") {
344344
std::string model =
345-
"afiro"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
345+
"adlittle"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
346346
std::string model_file =
347347
std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
348348
Highs h;
@@ -352,10 +352,10 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
352352
h.setOptionValue("kkt_tolerance", kkt_tolerance);
353353
h.setOptionValue("presolve", "off");
354354

355-
HighsInt pdlp_features_off = 0
355+
HighsInt pdlp_features_off = //0
356356
// kPdlpScalingOff +
357357
// kPdlpRestartOff
358-
// kPdlpAdaptiveStepSizeOff
358+
kPdlpAdaptiveStepSizeOff
359359
;
360360
h.setOptionValue("pdlp_features_off", pdlp_features_off);
361361

@@ -366,7 +366,7 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
366366
h.setOptionValue("pdlp_scaling_mode", pdlp_scaling);
367367
h.setOptionValue("pdlp_step_size_strategy", 1);
368368
h.setOptionValue("pdlp_restart_strategy", 2);
369-
// h.setOptionValue("pdlp_iteration_limit", 3520);
369+
h.setOptionValue("pdlp_iteration_limit", 10);
370370
// h.setOptionValue("log_dev_level", kHighsLogDevLevelVerbose);
371371
auto start_hipdlp = std::chrono::high_resolution_clock::now();
372372
HighsStatus run_status = h.run();

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "lp_data/HConst.h"
2121
#include "pdlp/cupdlp/cupdlp.h" // For pdlpLogging
2222
#include "restart.hpp"
23+
#include "pdlp_gpu_debug.hpp"
2324

2425
#define PDHG_CHECK_INTERVAL 40
2526
static constexpr double kDivergentMovement = 1e10;
@@ -490,8 +491,8 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
490491
}
491492

492493
// --- 0. Using PowerMethod to estimate the largest eigenvalue ---
493-
initializeStepSizes();
494494
setupGpu();
495+
initializeStepSizes();
495496
PrimalDualParams working_params = params_;
496497
working_params.omega = std::sqrt(stepsize_.dual_step / stepsize_.primal_step);
497498
working_params.eta = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
@@ -515,8 +516,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
515516
linalg::project_bounds(lp_, x_current_);
516517
linalg::project_bounds(lp_, x_sum_);
517518
linalg::project_bounds(lp_, x_avg_);
518-
//linalg::Ax(lp, x_current_, Ax_cache_);
519-
linalgGpuAx(x_current_, Ax_cache_);
519+
linalg::Ax(lp, x_current_, Ax_cache_);
520520
std::vector<double> Ax_avg = Ax_cache_;
521521
std::vector<double> ATy_avg(lp.num_col_, 0.0);
522522

@@ -648,9 +648,17 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
648648
// Recompute Ax and ATy for the restarted iterates
649649
hipdlpTimerStart(kHipdlpClockMatrixMultiply);
650650
linalg::Ax(lp, x_current_, Ax_cache_);
651+
std::vector<double> cup_ax = Ax_cache_;
652+
std::vector<double> gpu_ax(lp_.num_row_,0.0);
653+
linalgGpuAx(x_current_, gpu_ax);
654+
bool are_ax_same = vecDiff(cup_ax, gpu_ax, 1e-12,"restart Ax");
651655
hipdlpTimerStop(kHipdlpClockMatrixMultiply);
652656
hipdlpTimerStart(kHipdlpClockMatrixTransposeMultiply);
653657
linalg::ATy(lp, y_current_, ATy_cache_);
658+
std::vector<double> cup_aty = ATy_cache_;
659+
std::vector<double> gpu_aty(lp_.num_col_,0.0);
660+
linalgGpuATy(y_current_, gpu_aty);
661+
bool are_aty_same = vecDiff(cup_aty, gpu_aty, 1e-12,"restart ATy");
654662
hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);
655663

656664
restart_scheme_.SetLastRestartIter(iter);
@@ -822,10 +830,18 @@ void PDLPSolver::computeAverageIterate(std::vector<double>& ax_avg,
822830

823831
hipdlpTimerStart(kHipdlpClockAverageIterateMatrixMultiply);
824832
linalg::Ax(lp_, x_avg_, ax_avg);
833+
std::vector<double> cup_ax = ax_avg;
834+
std::vector<double> gpu_ax(lp_.num_row_,0.0);
835+
linalgGpuAx(x_avg_, gpu_ax);
836+
bool are_ax_same = vecDiff(cup_ax, gpu_ax, 1e-12,"computeAverageIterate");
825837
hipdlpTimerStop(kHipdlpClockAverageIterateMatrixMultiply);
826838

827839
hipdlpTimerStart(kHipdlpClockAverageIterateMatrixTransposeMultiply);
828840
linalg::ATy(lp_, y_avg_, aty_avg);
841+
std::vector<double> cup_aty = aty_avg;
842+
std::vector<double> gpu_aty(lp_.num_col_,0.0);
843+
linalgGpuATy(y_avg_, gpu_aty);
844+
bool are_aty_same = vecDiff(cup_aty, gpu_aty, 1e-12,"computeAverageIterate");
829845
hipdlpTimerStop(kHipdlpClockAverageIterateMatrixTransposeMultiply);
830846

831847
debug_pdlp_data_.ax_average_norm = linalg::vector_norm_squared(ax_avg);
@@ -1138,7 +1154,15 @@ double PDLPSolver::PowerMethod() {
11381154
//
11391155
// Compute A'Ax = A'(Ax)
11401156
linalg::Ax(lp, x_vec, y_vec);
1157+
std::vector<double> ax_cpu = y_vec; // For timing consistency
1158+
std::vector<double> ax_gpu(lp.num_row_);
1159+
linalgGpuAx(x_vec, ax_gpu);
1160+
bool are_same = vecDiff(ax_cpu, ax_gpu, 1e-12, "Power method Ax");
11411161
linalg::ATy(lp, y_vec, z_vec); // Note: ATy computes A' * vector
1162+
std::vector<double> atx_cpu = z_vec; // For timing consistency
1163+
std::vector<double> atx_gpu(lp.num_col_);
1164+
linalgGpuATy(y_vec, atx_gpu);
1165+
are_same = vecDiff(atx_cpu, atx_gpu, 1e-12, "Power method A'Tx");
11421166

11431167
// Estimate the squared operator norm (largest eigenvalue of A'A)
11441168
op_norm_sq = linalg::dot(
@@ -1432,6 +1456,10 @@ void PDLPSolver::updateIteratesFixed() {
14321456

14331457
hipdlpTimerStart(kHipdlpClockMatrixMultiply);
14341458
linalg::Ax(lp_, x_next_, Ax_next_);
1459+
std::vector<double> cup_ax = Ax_next_;
1460+
std::vector<double> gpu_ax(lp_.num_row_,0.0);
1461+
linalgGpuAx(x_next_, gpu_ax);
1462+
bool are_ax_same = vecDiff(cup_ax, gpu_ax, 1e-12,"updateIteratesFixed");
14351463
hipdlpTimerStop(kHipdlpClockMatrixMultiply);
14361464

14371465
hipdlpTimerStart(kHipdlpClockProjectY);
@@ -1440,6 +1468,10 @@ void PDLPSolver::updateIteratesFixed() {
14401468

14411469
hipdlpTimerStart(kHipdlpClockMatrixTransposeMultiply);
14421470
linalg::ATy(lp_, y_next_, ATy_next_);
1471+
std::vector<double> cup_aty = ATy_next_;
1472+
std::vector<double> gpu_aty(lp_.num_col_,0.0);
1473+
linalgGpuATy(y_next_, gpu_aty);
1474+
bool are_aty_same = vecDiff(cup_aty, gpu_aty, 1e-12,"updateIteratesFixed");
14431475
hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);
14441476
}
14451477

@@ -1487,6 +1519,10 @@ void PDLPSolver::updateIteratesAdaptive() {
14871519

14881520
hipdlpTimerStart(kHipdlpClockMatrixMultiply);
14891521
linalg::Ax(lp_, xupdate, axupdate);
1522+
std::vector<double> cup_ax = axupdate;
1523+
std::vector<double> gpu_ax(lp_.num_row_,0.0);
1524+
linalgGpuAx(xupdate, gpu_ax);
1525+
bool are_ax_same = vecDiff(cup_ax, gpu_ax, 1e-12,"updateIteratesAdaptive");
14901526
hipdlpTimerStop(kHipdlpClockMatrixMultiply);
14911527

14921528
// Dual update with timing
@@ -1496,6 +1532,10 @@ void PDLPSolver::updateIteratesAdaptive() {
14961532

14971533
hipdlpTimerStart(kHipdlpClockMatrixTransposeMultiply);
14981534
linalg::ATy(lp_, yupdate, atyupdate);
1535+
std::vector<double> cup_aty = atyupdate;
1536+
std::vector<double> gpu_aty(lp_.num_col_,0.0);
1537+
linalgGpuATy(yupdate, gpu_aty);
1538+
bool are_aty_same = vecDiff(cup_aty, gpu_aty, 1e-12,"updateIteratesAdaptive");
14991539
hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);
15001540

15011541
// Compute deltas
@@ -1671,15 +1711,14 @@ void PDLPSolver::setupGpu(){
16711711
const std::vector<double>& h_a_val = lp_csr.value_;
16721712

16731713
//3. Allocate and copy A's CSR data to GPU
1674-
CUDA_CHECK(cudaMalloc((void**)&d_a_row_ptr_, (a_num_cols_ + 1) * sizeof(int)));
1714+
CUDA_CHECK(cudaMalloc((void**)&d_a_row_ptr_, (a_num_rows_ + 1) * sizeof(int)));
16751715
CUDA_CHECK(cudaMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(int)));
16761716
CUDA_CHECK(cudaMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double)));
1677-
1678-
CUDA_CHECK(cudaMemcpy(d_a_row_ptr_, h_a_row_ptr.data(), (a_num_cols_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
1717+
CUDA_CHECK(cudaMemcpy(d_a_row_ptr_, h_a_row_ptr.data(), (a_num_rows_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
16791718
CUDA_CHECK(cudaMemcpy(d_a_col_ind_, h_a_col_ind.data(), a_nnz_ * sizeof(int), cudaMemcpyHostToDevice));
16801719
CUDA_CHECK(cudaMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double), cudaMemcpyHostToDevice));
16811720

1682-
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_,
1721+
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_,
16831722
d_a_row_ptr_, d_a_col_ind_, d_a_val_,
16841723
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
16851724
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
@@ -1689,15 +1728,16 @@ void PDLPSolver::setupGpu(){
16891728
const std::vector<int>& h_at_col_ind = lp_.a_matrix_.index_;
16901729
const std::vector<double>& h_at_val = lp_.a_matrix_.value_;
16911730

1692-
CUDA_CHECK(cudaMalloc((void**)&d_at_row_ptr_, (a_num_rows_ + 1) * sizeof(int)));
1731+
CUDA_CHECK(cudaMalloc((void**)&d_at_row_ptr_, (a_num_cols_ + 1) * sizeof(int))); // Fixed!
16931732
CUDA_CHECK(cudaMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(int)));
1694-
CUDA_CHECK(cudaMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double)));
1733+
CUDA_CHECK(cudaMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double)));
16951734

1696-
CUDA_CHECK(cudaMemcpy(d_at_row_ptr_, h_at_row_ptr.data(), (a_num_rows_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
1735+
CUDA_CHECK(cudaMemcpy(d_at_row_ptr_, h_at_row_ptr.data(), (a_num_cols_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
16971736
CUDA_CHECK(cudaMemcpy(d_at_col_ind_, h_at_col_ind.data(), a_nnz_ * sizeof(int), cudaMemcpyHostToDevice));
1698-
CUDA_CHECK(cudaMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double), cudaMemcpyHostToDevice));
1737+
CUDA_CHECK(cudaMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double), cudaMemcpyHostToDevice));
16991738

1700-
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_,
1739+
// Create AT descriptor with swapped dimensions
1740+
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_, // Dimensions swapped!
17011741
d_at_row_ptr_, d_at_col_ind_, d_at_val_,
17021742
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
17031743
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));

0 commit comments

Comments
 (0)