@@ -1844,7 +1844,7 @@ void PDLPSolver::closeDebugLog() {
18441844void PDLPSolver::setupGpu (){
18451845 // 1. Initialize cuSPARSE
18461846 CUSPARSE_CHECK (cusparseCreate (&cusparse_handle_));
1847-
1847+ CUBLAS_CHECK ( cublasCreate (&cublas_handle_));
18481848 // 2. Get matrix data from lp_ (CSC)
18491849 a_num_rows_ = lp_.num_row_ ;
18501850 a_num_cols_ = lp_.num_col_ ;
@@ -1986,11 +1986,16 @@ void PDLPSolver::setupGpu(){
19861986 cudaFree (d_row_scale_); d_row_scale_ = nullptr ;
19871987 }
19881988
1989+ size_t max_size = std::max (a_num_cols_, a_num_rows_);
1990+ CUDA_CHECK (cudaMalloc (&d_buffer_, max_size * sizeof (double )));
1991+ CUDA_CHECK (cudaMalloc (&d_buffer2_, max_size * sizeof (double )));
1992+
19891993 highsLogUser (params_.log_options_ , HighsLogType::kInfo , " GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n " );
19901994}
19911995
19921996void PDLPSolver::cleanupGpu (){
19931997 if (cusparse_handle_) CUSPARSE_CHECK (cusparseDestroy (cusparse_handle_));
1998+ if (cublas_handle_) CUBLAS_CHECK (cublasDestroy (cublas_handle_));
19941999 if (mat_a_csr_) CUSPARSE_CHECK (cusparseDestroySpMat (mat_a_csr_));
19952000 if (mat_a_T_csr_) CUSPARSE_CHECK (cusparseDestroySpMat (mat_a_T_csr_));
19962001 CUDA_CHECK (cudaFree (d_a_row_ptr_));
@@ -2029,6 +2034,8 @@ void PDLPSolver::cleanupGpu(){
20292034 CUDA_CHECK (cudaFree (d_dSlackNegAvg_));
20302035 if (d_col_scale_) CUDA_CHECK (cudaFree (d_col_scale_));
20312036 if (d_row_scale_) CUDA_CHECK (cudaFree (d_row_scale_));
2037+ CUDA_CHECK (cudaFree (d_buffer_));
2038+ CUDA_CHECK (cudaFree (d_buffer2_));
20322039}
20332040
20342041void PDLPSolver::linalgGpuAx (const double * d_x_in, double * d_ax_out){
@@ -2154,39 +2161,33 @@ bool PDLPSolver::checkConvergenceGpu(
21542161 return primal_feasible && dual_feasible && gap_small;
21552162
21562163}
2157- void PDLPSolver::computeStepSizeRatioGpu (PrimalDualParams& working_params) {
2158- // 1. Compute ||x_last - x_current||^2 on GPU
2159- launchKernelDiffTwoNormSquared_wrapper (d_x_at_last_restart_, d_x_current_, d_x_temp_diff_norm_result_, a_num_cols_);
2160-
2161- double primal_diff_sq;
2162- CUDA_CHECK (cudaMemcpy (&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2163- double primal_diff_norm = std::sqrt (primal_diff_sq);
2164-
2165- // 2. Compute ||y_last - y_current||^2 on GPU
2166- launchKernelDiffTwoNormSquared_wrapper (d_y_at_last_restart_, d_y_current_, d_y_temp_diff_norm_result_, a_num_rows_);
2167-
2168- double dual_diff_sq;
2169- CUDA_CHECK (cudaMemcpy (&dual_diff_sq, d_y_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2170- double dual_diff_norm = std::sqrt (dual_diff_sq);
2171-
2172- double dMeanStepSize = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
21732164
2174- // 3. Compute new beta
2175- if (std::min (primal_diff_norm, dual_diff_norm) > 1e-10 ) {
2176- double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2177- double old_beta = stepsize_.beta ;
2178-
2179- double dLogBetaUpdate =
2180- 0.5 * std::log (beta_update_ratio) + 0.5 * std::log (std::sqrt (old_beta));
2181- stepsize_.beta = std::exp (2.0 * dLogBetaUpdate);
2182- }
2165+ void PDLPSolver::computeStepSizeRatioGpu (PrimalDualParams& working_params) {
2166+ // 1. Compute ||x_last - x_current||^2 using cuBLAS
2167+ double primal_diff_norm = computeDiffNormCuBLAS (
2168+ d_x_at_last_restart_, d_x_current_, a_num_cols_);
2169+
2170+ // 2. Compute ||y_last - y_current||^2 using cuBLAS
2171+ double dual_diff_norm = computeDiffNormCuBLAS (
2172+ d_y_at_last_restart_, d_y_current_, a_num_rows_);
2173+
2174+ double dMeanStepSize = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
2175+
2176+ // 3. Update beta (same CPU logic)
2177+ if (std::min (primal_diff_norm, dual_diff_norm) > 1e-10 ) {
2178+ double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2179+ double old_beta = stepsize_.beta ;
2180+ double dLogBetaUpdate =
2181+ 0.5 * std::log (beta_update_ratio) + 0.5 * std::log (std::sqrt (old_beta));
2182+ stepsize_.beta = std::exp (2.0 * dLogBetaUpdate);
2183+ }
21832184
2184- // Update steps
2185- stepsize_.primal_step = dMeanStepSize / std::sqrt (stepsize_.beta );
2186- stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta ;
2187- working_params.eta = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
2188- working_params.omega = std::sqrt (stepsize_.beta );
2189- restart_scheme_.UpdateBeta (stepsize_.beta );
2185+ // Update step sizes
2186+ stepsize_.primal_step = dMeanStepSize / std::sqrt (stepsize_.beta );
2187+ stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta ;
2188+ working_params.eta = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
2189+ working_params.omega = std::sqrt (stepsize_.beta );
2190+ restart_scheme_.UpdateBeta (stepsize_.beta );
21902191}
21912192
21922193void PDLPSolver::updateAverageIteratesGpu (int inner_iter) {
@@ -2211,33 +2212,65 @@ void PDLPSolver::computeAverageIterateGpu() {
22112212 linalgGpuATy (d_y_avg_, d_aty_avg_);
22122213}
22132214
2214- double PDLPSolver::computeMovementGpu (const double * d_x_new, const double * d_x_old,
2215- const double * d_y_new, const double * d_y_old) {
2216- // 1. Compute ||x_new - x_old||^2
2217- launchKernelDiffTwoNormSquared_wrapper (d_x_new, d_x_old, d_x_temp_diff_norm_result_, a_num_cols_);
2218- double primal_diff_sq;
2219- CUDA_CHECK (cudaMemcpy (&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2220-
2221- // 2. Compute ||y_new - y_old||^2
2222- launchKernelDiffTwoNormSquared_wrapper (d_y_new, d_y_old, d_x_temp_diff_norm_result_, a_num_rows_);
2223- double dual_diff_sq;
2224- CUDA_CHECK (cudaMemcpy (&dual_diff_sq, d_x_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2215+ double PDLPSolver::computeMovementGpu (
2216+ const double * d_x_new, const double * d_x_old,
2217+ const double * d_y_new, const double * d_y_old)
2218+ {
2219+ // 1. Compute ||x_new - x_old|| using cuBLAS
2220+ double primal_diff_norm = computeDiffNormCuBLAS (d_x_new, d_x_old, a_num_cols_);
2221+
2222+ // 2. Compute ||y_new - y_old|| using cuBLAS
2223+ double dual_diff_norm = computeDiffNormCuBLAS (d_y_new, d_y_old, a_num_rows_);
2224+
2225+ // 3. Combine on CPU
2226+ double primal_weight = std::sqrt (stepsize_.beta );
2227+ double primal_diff_sq = primal_diff_norm * primal_diff_norm;
2228+ double dual_diff_sq = dual_diff_norm * dual_diff_norm;
2229+
2230+ return (0.5 * primal_weight * primal_diff_sq) +
2231+ (0.5 / primal_weight * dual_diff_sq);
2232+ }
22252233
2226- // 3. Combine scalar results on CPU
2227- double primal_weight = std::sqrt (stepsize_.beta );
2228- return (0.5 * primal_weight * primal_diff_sq) +
2229- (0.5 / primal_weight) * dual_diff_sq;
2234+ double PDLPSolver::computeNonlinearityGpu (
2235+ const double * d_x_new, const double * d_x_old,
2236+ const double * d_aty_new, const double * d_aty_old)
2237+ {
2238+ // 1. Compute delta_x = x_new - x_old
2239+ CUDA_CHECK (cudaMemcpy (d_buffer_, d_x_new, a_num_cols_ * sizeof (double ),
2240+ cudaMemcpyDeviceToDevice));
2241+ double alpha = -1.0 ;
2242+ CUBLAS_CHECK (cublasDaxpy (cublas_handle_, a_num_cols_, &alpha,
2243+ d_x_old, 1 , d_buffer_, 1 ));
2244+
2245+ // 2. Compute delta_aty = aty_new - aty_old
2246+ CUDA_CHECK (cudaMemcpy (d_buffer2_, d_aty_new, a_num_cols_ * sizeof (double ),
2247+ cudaMemcpyDeviceToDevice));
2248+ CUBLAS_CHECK (cublasDaxpy (cublas_handle_, a_num_cols_, &alpha,
2249+ d_aty_old, 1 , d_buffer2_, 1 ));
2250+
2251+ // 3. Compute dot product: delta_x' * delta_aty
2252+ double result;
2253+ CUBLAS_CHECK (cublasDdot (cublas_handle_, a_num_cols_,
2254+ d_buffer_, 1 , d_buffer2_, 1 , &result));
2255+
2256+ return result;
22302257}
22312258
2232- double PDLPSolver::computeNonlinearityGpu (const double * d_x_new, const double * d_x_old,
2233- const double * d_aty_new, const double * d_aty_old) {
2234- // Compute dot( (x_new - x_old), (aty_new - aty_old) )
2235- launchKernelDiffDotDiff_wrapper (d_x_new, d_x_old, d_aty_new, d_aty_old,
2236- d_x_temp_diff_norm_result_, a_num_cols_);
2237-
2238- double interaction;
2239- CUDA_CHECK (cudaMemcpy (&interaction, d_x_temp_diff_norm_result_, sizeof (double ), cudaMemcpyDeviceToHost));
2240-
2241- return interaction; // cupdlp does not take absolute value here, it handles fabs in the check
2259+ double PDLPSolver::computeDiffNormCuBLAS (
2260+ const double * d_a, const double * d_b, int n)
2261+ {
2262+ // 1. Copy a to buffer: buffer = a
2263+ CUDA_CHECK (cudaMemcpy (d_buffer_, d_a, n * sizeof (double ),
2264+ cudaMemcpyDeviceToDevice));
2265+
2266+ // 2. buffer = buffer - b (using cuBLAS axpy)
2267+ double alpha = -1.0 ;
2268+ CUBLAS_CHECK (cublasDaxpy (cublas_handle_, n, &alpha, d_b, 1 , d_buffer_, 1 ));
2269+
2270+ // 3. result = ||buffer||_2 (using cuBLAS nrm2)
2271+ double norm;
2272+ CUBLAS_CHECK (cublasDnrm2 (cublas_handle_, n, d_buffer_, 1 , &norm));
2273+
2274+ return norm;
22422275}
22432276#endif
0 commit comments