Skip to content

Commit 874b652

Browse files
committed
setup GPU and linalgGpuAx, linalgGpuATy
1 parent 071749d commit 874b652

File tree

3 files changed

+223
-3
lines changed

3 files changed

+223
-3
lines changed

check/TestPdlp.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {
342342

343343
TEST_CASE("hi-pdlp", "[pdlp]") {
344344
std::string model =
345-
"fit2p"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
345+
"afiro"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
346346
std::string model_file =
347347
std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
348348
Highs h;

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 162 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
491491

492492
// --- 0. Using PowerMethod to estimate the largest eigenvalue ---
493493
initializeStepSizes();
494-
494+
setupGpu();
495495
PrimalDualParams working_params = params_;
496496
working_params.omega = std::sqrt(stepsize_.dual_step / stepsize_.primal_step);
497497
working_params.eta = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
@@ -515,7 +515,8 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
515515
linalg::project_bounds(lp_, x_current_);
516516
linalg::project_bounds(lp_, x_sum_);
517517
linalg::project_bounds(lp_, x_avg_);
518-
linalg::Ax(lp, x_current_, Ax_cache_);
518+
//linalg::Ax(lp, x_current_, Ax_cache_);
519+
linalgGpuAx(x_current_, Ax_cache_);
519520
std::vector<double> Ax_avg = Ax_cache_;
520521
std::vector<double> ATy_avg(lp.num_col_, 0.0);
521522

@@ -727,6 +728,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
727728
}
728729

729730
void PDLPSolver::solveReturn(const TerminationStatus term_code) {
731+
cleanupGpu();
730732
results_.term_code = term_code;
731733
hipdlpTimerStop(kHipdlpClockSolve);
732734
}
@@ -1647,3 +1649,161 @@ void PDLPSolver::hipdlpTimerStop(const HighsInt hipdlp_clock) {
16471649
void PDLPSolver::closeDebugLog() {
16481650
if (debug_pdlp_log_file_) fclose(debug_pdlp_log_file_);
16491651
}
1652+
1653+
// =============================================================================
1654+
// SECTION 5: GPU Part
1655+
// =============================================================================
1656+
1657+
void PDLPSolver::setupGpu(){
1658+
//1. Initialize cuSPARSE
1659+
CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
1660+
1661+
//2. Get matrix data from lp_ (CSC)
1662+
a_num_rows_ = lp_.num_row_;
1663+
a_num_cols_ = lp_.num_col_;
1664+
a_nnz_ = lp_.a_matrix_.numNz();
1665+
1666+
HighsSparseMatrix lp_csr = lp_.a_matrix_;
1667+
lp_csr.ensureRowwise();
1668+
1669+
const std::vector<int>& h_a_row_ptr = lp_csr.start_;
1670+
const std::vector<int>& h_a_col_ind = lp_csr.index_;
1671+
const std::vector<double>& h_a_val = lp_csr.value_;
1672+
1673+
//3. Allocate and copy A's CSR data to GPU
1674+
CUDA_CHECK(cudaMalloc((void**)&d_a_row_ptr_, (a_num_cols_ + 1) * sizeof(int)));
1675+
CUDA_CHECK(cudaMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(int)));
1676+
CUDA_CHECK(cudaMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double)));
1677+
1678+
CUDA_CHECK(cudaMemcpy(d_a_row_ptr_, h_a_row_ptr.data(), (a_num_cols_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
1679+
CUDA_CHECK(cudaMemcpy(d_a_col_ind_, h_a_col_ind.data(), a_nnz_ * sizeof(int), cudaMemcpyHostToDevice));
1680+
CUDA_CHECK(cudaMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double), cudaMemcpyHostToDevice));
1681+
1682+
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_,
1683+
d_a_row_ptr_, d_a_col_ind_, d_a_val_,
1684+
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1685+
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
1686+
1687+
//4. Create matrix AT in CSR format = A in CSC
1688+
const std::vector<int>& h_at_row_ptr = lp_.a_matrix_.start_;
1689+
const std::vector<int>& h_at_col_ind = lp_.a_matrix_.index_;
1690+
const std::vector<double>& h_at_val = lp_.a_matrix_.value_;
1691+
1692+
CUDA_CHECK(cudaMalloc((void**)&d_at_row_ptr_, (a_num_rows_ + 1) * sizeof(int)));
1693+
CUDA_CHECK(cudaMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(int)));
1694+
CUDA_CHECK(cudaMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double)));
1695+
1696+
CUDA_CHECK(cudaMemcpy(d_at_row_ptr_, h_at_row_ptr.data(), (a_num_rows_ + 1) * sizeof(int), cudaMemcpyHostToDevice));
1697+
CUDA_CHECK(cudaMemcpy(d_at_col_ind_, h_at_col_ind.data(), a_nnz_ * sizeof(int), cudaMemcpyHostToDevice));
1698+
CUDA_CHECK(cudaMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double), cudaMemcpyHostToDevice));
1699+
1700+
CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_,
1701+
d_at_row_ptr_, d_at_col_ind_, d_at_val_,
1702+
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1703+
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
1704+
1705+
//5. Allocate device vectors
1706+
CUDA_CHECK(cudaMalloc(&d_x_, a_num_cols_ * sizeof(double)));
1707+
CUDA_CHECK(cudaMalloc(&d_y_, a_num_rows_ * sizeof(double)));
1708+
CUDA_CHECK(cudaMalloc(&d_ax_, a_num_rows_ * sizeof(double)));
1709+
CUDA_CHECK(cudaMalloc(&d_aty_, a_num_cols_ * sizeof(double)));
1710+
1711+
//6. Preallocate buffer for cuSPARSE SpMV
1712+
//Buffer for Ax
1713+
double alpha = 1.0;
1714+
double beta = 0.0;
1715+
cusparseDnVecDescr_t vec_x, vec_ax;
1716+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_x, a_num_cols_, d_x_, CUDA_R_64F));
1717+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_, CUDA_R_64F));
1718+
1719+
CUSPARSE_CHECK(cusparseSpMV_bufferSize(
1720+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1721+
&alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1722+
CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_ax_));
1723+
CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
1724+
1725+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x));
1726+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax));
1727+
1728+
//Buffer for ATy
1729+
cusparseDnVecDescr_t vec_y, vec_aty;
1730+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_y, a_num_rows_, d_y_, CUDA_R_64F));
1731+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_, CUDA_R_64F));
1732+
CUSPARSE_CHECK(cusparseSpMV_bufferSize(
1733+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1734+
&alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1735+
CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_));
1736+
CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
1737+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y));
1738+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty));
1739+
1740+
highsLogUser(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n");
1741+
}
1742+
1743+
void PDLPSolver::cleanupGpu(){
1744+
if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
1745+
if (mat_a_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_csr_));
1746+
if (mat_a_T_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_T_csr_));
1747+
CUDA_CHECK(cudaFree(d_a_row_ptr_));
1748+
CUDA_CHECK(cudaFree(d_a_col_ind_));
1749+
CUDA_CHECK(cudaFree(d_a_val_));
1750+
CUDA_CHECK(cudaFree(d_at_row_ptr_));
1751+
CUDA_CHECK(cudaFree(d_at_col_ind_));
1752+
CUDA_CHECK(cudaFree(d_at_val_));
1753+
CUDA_CHECK(cudaFree(d_x_));
1754+
CUDA_CHECK(cudaFree(d_y_));
1755+
CUDA_CHECK(cudaFree(d_ax_));
1756+
CUDA_CHECK(cudaFree(d_aty_));
1757+
CUDA_CHECK(cudaFree(d_spmv_buffer_ax_));
1758+
CUDA_CHECK(cudaFree(d_spmv_buffer_aty_));
1759+
}
1760+
1761+
void PDLPSolver::linalgGpuAx(const std::vector<double>& x, std::vector<double>& ax){
1762+
CUDA_CHECK(cudaMemcpy(d_x_, x.data(), a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
1763+
// Ax = 1.0 * A * x + 0.0 * ax
1764+
double alpha = 1.0;
1765+
double beta = 0.0;
1766+
cusparseDnVecDescr_t vec_x, vec_ax;
1767+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_x, a_num_cols_, d_x_, CUDA_R_64F));
1768+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_, CUDA_R_64F));
1769+
if(spmv_buffer_size_ax_ == 0){
1770+
CUSPARSE_CHECK(cusparseSpMV_bufferSize(
1771+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1772+
&alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1773+
CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_ax_));
1774+
CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
1775+
}
1776+
1777+
CUSPARSE_CHECK(cusparseSpMV(
1778+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1779+
&alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1780+
CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_ax_));
1781+
CUDA_CHECK(cudaMemcpy(ax.data(), d_ax_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
1782+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x));
1783+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax));
1784+
}
1785+
1786+
void PDLPSolver::linalgGpuATy(const std::vector<double>& y,
1787+
std::vector<double>& aty){
1788+
CUDA_CHECK(cudaMemcpy(d_y_, y.data(), a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice));
1789+
// ATy = 1.0 * A^T * y + 0.0 * aty
1790+
double alpha = 1.0;
1791+
double beta = 0.0;
1792+
cusparseDnVecDescr_t vec_y, vec_aty;
1793+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_y, a_num_rows_, d_y_, CUDA_R_64F));
1794+
CUSPARSE_CHECK(cusparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_, CUDA_R_64F));
1795+
if(spmv_buffer_size_aty_ == 0){
1796+
CUSPARSE_CHECK(cusparseSpMV_bufferSize(
1797+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1798+
&alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1799+
CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_));
1800+
CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
1801+
}
1802+
CUSPARSE_CHECK(cusparseSpMV(
1803+
cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1804+
&alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1805+
CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_aty_));
1806+
CUDA_CHECK(cudaMemcpy(aty.data(), d_aty_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
1807+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y));
1808+
CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty));
1809+
}

highs/pdlp/hipdlp/pdhg.hpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <string>
1919
#include <tuple>
2020
#include <vector>
21+
#include <cuda_runtime.h>
22+
#include <cusparse.h>
2123

2224
#include "Highs.h"
2325
#include "linalg.hpp"
@@ -166,6 +168,64 @@ class PDLPSolver {
166168
std::vector<double> ATy_cache_;
167169
std::vector<double> Ax_next_, ATy_next_;
168170
std::vector<double> K_times_x_diff_;
171+
172+
// --- GPU methods ---
173+
void setupGpu();
174+
void cleanupGpu();
175+
void linalgGpuAx(const std::vector<double>& x,
176+
std::vector<double>& ax);
177+
void linalgGpuATy(const std::vector<double>& y,
178+
std::vector<double>& aty);
179+
180+
// --- Helpers for error checking ---
181+
#define CUDA_CHECK(call) \
182+
do { \
183+
cudaError_t err = (call); \
184+
if (err != cudaSuccess) { \
185+
fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
186+
exit(EXIT_FAILURE); \
187+
} \
188+
} while (0)
189+
190+
#define CUSPARSE_CHECK(call) \
191+
do { \
192+
cusparseStatus_t status = (call); \
193+
if (status != CUSPARSE_STATUS_SUCCESS) { \
194+
fprintf(stderr, "cuSPARSE Error at %s:%d: %s\n", __FILE__, __LINE__, cusparseGetErrorString(status)); \
195+
exit(EXIT_FAILURE); \
196+
} \
197+
} while (0)
198+
199+
// --- GPU Members ---
200+
cusparseHandle_t cusparse_handle_ = nullptr;
201+
202+
// Matrix A in CSR format (for Ax)
203+
cusparseSpMatDescr_t mat_a_csr_ = nullptr;
204+
int* d_a_row_ptr_ = nullptr;
205+
int* d_a_col_ind_ = nullptr;
206+
double* d_a_val_ = nullptr;
207+
208+
// Matrix A^T in CSR format (for Aty)
209+
// creat by resuing A's CSC data on GPU
210+
cusparseSpMatDescr_t mat_a_T_csr_ = nullptr;
211+
int* d_at_row_ptr_ = nullptr;
212+
int* d_at_col_ind_ = nullptr;
213+
double* d_at_val_ = nullptr;
214+
215+
// GPU vectors
216+
int a_num_rows_ = 0;
217+
int a_num_cols_ = 0;
218+
int a_nnz_ = 0;
219+
double* d_x_ = nullptr;
220+
double* d_y_ = nullptr;
221+
double* d_ax_ = nullptr;
222+
double* d_aty_ = nullptr;
223+
224+
// Temporary buffer for SpMV
225+
void* d_spmv_buffer_ax_ = nullptr;
226+
size_t spmv_buffer_size_ax_ = 0;
227+
void* d_spmv_buffer_aty_ = nullptr;
228+
size_t spmv_buffer_size_aty_ = 0;
169229
};
170230

171231
#endif

0 commit comments

Comments
 (0)