@@ -491,7 +491,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
491491
492492 // --- 0. Using PowerMethod to estimate the largest eigenvalue ---
493493 initializeStepSizes ();
494-
494+ setupGpu ();
495495 PrimalDualParams working_params = params_;
496496 working_params.omega = std::sqrt (stepsize_.dual_step / stepsize_.primal_step );
497497 working_params.eta = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
@@ -515,7 +515,8 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
515515 linalg::project_bounds (lp_, x_current_);
516516 linalg::project_bounds (lp_, x_sum_);
517517 linalg::project_bounds (lp_, x_avg_);
518- linalg::Ax (lp, x_current_, Ax_cache_);
518+ // linalg::Ax(lp, x_current_, Ax_cache_);
519+ linalgGpuAx (x_current_, Ax_cache_);
519520 std::vector<double > Ax_avg = Ax_cache_;
520521 std::vector<double > ATy_avg (lp.num_col_ , 0.0 );
521522
@@ -727,6 +728,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
727728}
728729
729730void PDLPSolver::solveReturn (const TerminationStatus term_code) {
731+ cleanupGpu ();
730732 results_.term_code = term_code;
731733 hipdlpTimerStop (kHipdlpClockSolve );
732734}
@@ -1647,3 +1649,161 @@ void PDLPSolver::hipdlpTimerStop(const HighsInt hipdlp_clock) {
16471649void PDLPSolver::closeDebugLog () {
16481650 if (debug_pdlp_log_file_) fclose (debug_pdlp_log_file_);
16491651}
1652+
1653+ // =============================================================================
1654+ // SECTION 5: GPU Part
1655+ // =============================================================================
1656+
1657+ void PDLPSolver::setupGpu (){
1658+ // 1. Initialize cuSPARSE
1659+ CUSPARSE_CHECK (cusparseCreate (&cusparse_handle_));
1660+
1661+ // 2. Get matrix data from lp_ (CSC)
1662+ a_num_rows_ = lp_.num_row_ ;
1663+ a_num_cols_ = lp_.num_col_ ;
1664+ a_nnz_ = lp_.a_matrix_ .numNz ();
1665+
1666+ HighsSparseMatrix lp_csr = lp_.a_matrix_ ;
1667+ lp_csr.ensureRowwise ();
1668+
1669+ const std::vector<int >& h_a_row_ptr = lp_csr.start_ ;
1670+ const std::vector<int >& h_a_col_ind = lp_csr.index_ ;
1671+ const std::vector<double >& h_a_val = lp_csr.value_ ;
1672+
1673+ // 3. Allocate and copy A's CSR data to GPU
1674+ CUDA_CHECK (cudaMalloc ((void **)&d_a_row_ptr_, (a_num_cols_ + 1 ) * sizeof (int )));
1675+ CUDA_CHECK (cudaMalloc ((void **)&d_a_col_ind_, a_nnz_ * sizeof (int )));
1676+ CUDA_CHECK (cudaMalloc ((void **)&d_a_val_, a_nnz_ * sizeof (double )));
1677+
1678+ CUDA_CHECK (cudaMemcpy (d_a_row_ptr_, h_a_row_ptr.data (), (a_num_cols_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
1679+ CUDA_CHECK (cudaMemcpy (d_a_col_ind_, h_a_col_ind.data (), a_nnz_ * sizeof (int ), cudaMemcpyHostToDevice));
1680+ CUDA_CHECK (cudaMemcpy (d_a_val_, h_a_val.data (), a_nnz_ * sizeof (double ), cudaMemcpyHostToDevice));
1681+
1682+ CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_,
1683+ d_a_row_ptr_, d_a_col_ind_, d_a_val_,
1684+ CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1685+ CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
1686+
1687+ // 4. Create matrix AT in CSR format = A in CSC
1688+ const std::vector<int >& h_at_row_ptr = lp_.a_matrix_ .start_ ;
1689+ const std::vector<int >& h_at_col_ind = lp_.a_matrix_ .index_ ;
1690+ const std::vector<double >& h_at_val = lp_.a_matrix_ .value_ ;
1691+
1692+ CUDA_CHECK (cudaMalloc ((void **)&d_at_row_ptr_, (a_num_rows_ + 1 ) * sizeof (int )));
1693+ CUDA_CHECK (cudaMalloc ((void **)&d_at_col_ind_, a_nnz_ * sizeof (int )));
1694+ CUDA_CHECK (cudaMalloc ((void **)&d_at_val_, a_nnz_ * sizeof (double )));
1695+
1696+ CUDA_CHECK (cudaMemcpy (d_at_row_ptr_, h_at_row_ptr.data (), (a_num_rows_ + 1 ) * sizeof (int ), cudaMemcpyHostToDevice));
1697+ CUDA_CHECK (cudaMemcpy (d_at_col_ind_, h_at_col_ind.data (), a_nnz_ * sizeof (int ), cudaMemcpyHostToDevice));
1698+ CUDA_CHECK (cudaMemcpy (d_at_val_, h_at_val.data (), a_nnz_ * sizeof (double ), cudaMemcpyHostToDevice));
1699+
1700+ CUSPARSE_CHECK (cusparseCreateCsr (&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_,
1701+ d_at_row_ptr_, d_at_col_ind_, d_at_val_,
1702+ CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1703+ CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
1704+
1705+ // 5. Allocate device vectors
1706+ CUDA_CHECK (cudaMalloc (&d_x_, a_num_cols_ * sizeof (double )));
1707+ CUDA_CHECK (cudaMalloc (&d_y_, a_num_rows_ * sizeof (double )));
1708+ CUDA_CHECK (cudaMalloc (&d_ax_, a_num_rows_ * sizeof (double )));
1709+ CUDA_CHECK (cudaMalloc (&d_aty_, a_num_cols_ * sizeof (double )));
1710+
1711+ // 6. Preallocate buffer for cuSPARSE SpMV
1712+ // Buffer for Ax
1713+ double alpha = 1.0 ;
1714+ double beta = 0.0 ;
1715+ cusparseDnVecDescr_t vec_x, vec_ax;
1716+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_x, a_num_cols_, d_x_, CUDA_R_64F));
1717+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_ax, a_num_rows_, d_ax_, CUDA_R_64F));
1718+
1719+ CUSPARSE_CHECK (cusparseSpMV_bufferSize (
1720+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1721+ &alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1722+ CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_ax_));
1723+ CUDA_CHECK (cudaMalloc (&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
1724+
1725+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_x));
1726+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_ax));
1727+
1728+ // Buffer for ATy
1729+ cusparseDnVecDescr_t vec_y, vec_aty;
1730+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_y, a_num_rows_, d_y_, CUDA_R_64F));
1731+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_aty, a_num_cols_, d_aty_, CUDA_R_64F));
1732+ CUSPARSE_CHECK (cusparseSpMV_bufferSize (
1733+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1734+ &alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1735+ CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_));
1736+ CUDA_CHECK (cudaMalloc (&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
1737+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_y));
1738+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_aty));
1739+
1740+ highsLogUser (params_.log_options_ , HighsLogType::kInfo , " GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n " );
1741+ }
1742+
1743+ void PDLPSolver::cleanupGpu (){
1744+ if (cusparse_handle_) CUSPARSE_CHECK (cusparseDestroy (cusparse_handle_));
1745+ if (mat_a_csr_) CUSPARSE_CHECK (cusparseDestroySpMat (mat_a_csr_));
1746+ if (mat_a_T_csr_) CUSPARSE_CHECK (cusparseDestroySpMat (mat_a_T_csr_));
1747+ CUDA_CHECK (cudaFree (d_a_row_ptr_));
1748+ CUDA_CHECK (cudaFree (d_a_col_ind_));
1749+ CUDA_CHECK (cudaFree (d_a_val_));
1750+ CUDA_CHECK (cudaFree (d_at_row_ptr_));
1751+ CUDA_CHECK (cudaFree (d_at_col_ind_));
1752+ CUDA_CHECK (cudaFree (d_at_val_));
1753+ CUDA_CHECK (cudaFree (d_x_));
1754+ CUDA_CHECK (cudaFree (d_y_));
1755+ CUDA_CHECK (cudaFree (d_ax_));
1756+ CUDA_CHECK (cudaFree (d_aty_));
1757+ CUDA_CHECK (cudaFree (d_spmv_buffer_ax_));
1758+ CUDA_CHECK (cudaFree (d_spmv_buffer_aty_));
1759+ }
1760+
1761+ void PDLPSolver::linalgGpuAx (const std::vector<double >& x, std::vector<double >& ax){
1762+ CUDA_CHECK (cudaMemcpy (d_x_, x.data (), a_num_cols_ * sizeof (double ), cudaMemcpyHostToDevice));
1763+ // Ax = 1.0 * A * x + 0.0 * ax
1764+ double alpha = 1.0 ;
1765+ double beta = 0.0 ;
1766+ cusparseDnVecDescr_t vec_x, vec_ax;
1767+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_x, a_num_cols_, d_x_, CUDA_R_64F));
1768+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_ax, a_num_rows_, d_ax_, CUDA_R_64F));
1769+ if (spmv_buffer_size_ax_ == 0 ){
1770+ CUSPARSE_CHECK (cusparseSpMV_bufferSize (
1771+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1772+ &alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1773+ CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_ax_));
1774+ CUDA_CHECK (cudaMalloc (&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
1775+ }
1776+
1777+ CUSPARSE_CHECK (cusparseSpMV (
1778+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1779+ &alpha, mat_a_csr_, vec_x, &beta, vec_ax, CUDA_R_64F,
1780+ CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_ax_));
1781+ CUDA_CHECK (cudaMemcpy (ax.data (), d_ax_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
1782+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_x));
1783+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_ax));
1784+ }
1785+
1786+ void PDLPSolver::linalgGpuATy (const std::vector<double >& y,
1787+ std::vector<double >& aty){
1788+ CUDA_CHECK (cudaMemcpy (d_y_, y.data (), a_num_rows_ * sizeof (double ), cudaMemcpyHostToDevice));
1789+ // ATy = 1.0 * A^T * y + 0.0 * aty
1790+ double alpha = 1.0 ;
1791+ double beta = 0.0 ;
1792+ cusparseDnVecDescr_t vec_y, vec_aty;
1793+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_y, a_num_rows_, d_y_, CUDA_R_64F));
1794+ CUSPARSE_CHECK (cusparseCreateDnVec (&vec_aty, a_num_cols_, d_aty_, CUDA_R_64F));
1795+ if (spmv_buffer_size_aty_ == 0 ){
1796+ CUSPARSE_CHECK (cusparseSpMV_bufferSize (
1797+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1798+ &alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1799+ CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_));
1800+ CUDA_CHECK (cudaMalloc (&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
1801+ }
1802+ CUSPARSE_CHECK (cusparseSpMV (
1803+ cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
1804+ &alpha, mat_a_T_csr_, vec_y, &beta, vec_aty, CUDA_R_64F,
1805+ CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_aty_));
1806+ CUDA_CHECK (cudaMemcpy (aty.data (), d_aty_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
1807+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_y));
1808+ CUSPARSE_CHECK (cusparseDestroyDnVec (vec_aty));
1809+ }
0 commit comments