Skip to content

Commit ed9ebf7

Browse files
committed
use cublas
1 parent 21fa43f commit ed9ebf7

File tree

6 files changed

+130
-142
lines changed

6 files changed

+130
-142
lines changed

check/TestPdlp.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {
342342

343343
TEST_CASE("hi-pdlp", "[pdlp]") {
344344
std::string model =
345-
"avgas"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p //avgas
345+
"adlittle"; //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p //avgas
346346
std::string model_file =
347347
std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
348348
Highs h;
@@ -352,10 +352,10 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
352352
h.setOptionValue("kkt_tolerance", kkt_tolerance);
353353
h.setOptionValue("presolve", "off");
354354

355-
HighsInt pdlp_features_off = 0 +
356-
kPdlpScalingOff +
357-
kPdlpRestartOff
358-
//kPdlpAdaptiveStepSizeOff
355+
HighsInt pdlp_features_off = 0
356+
//+kPdlpScalingOff
357+
//+kPdlpRestartOff
358+
//+kPdlpAdaptiveStepSizeOff
359359
;
360360
h.setOptionValue("pdlp_features_off", pdlp_features_off);
361361

@@ -408,7 +408,7 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
408408
std::cout << "Objective: " << h.getInfo().objective_function_value
409409
<< std::endl;
410410
}
411-
assert(hipdlp_iteration_count == h.getInfo().pdlp_iteration_count);
411+
//assert(hipdlp_iteration_count == h.getInfo().pdlp_iteration_count);
412412
h.resetGlobalScheduler(true);
413413
}
414414

highs/pdlp/cupdlp/cupdlp_utils.c

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1796,32 +1796,32 @@ void debugPdlpDataInitialise(struct DebugPdlpData* debug_pdlp) {
17961796

17971797
void debugPdlpIterLog(FILE* file, const int iter_num, const struct DebugPdlpData* debug_pdlp, const double beta, const double primal_step, const double dual_step) {
17981798
if (!file) return;
1799-
fprintf(file, "%6d %11.4g %11.4g %11.4g %11.4g %11.4g %11.4g %11.4g %11.4g\n",
1800-
iter_num,
1801-
debug_pdlp->ax_norm,
1802-
debug_pdlp->aty_norm,
1803-
debug_pdlp->ax_average_norm,
1804-
debug_pdlp->aty_average_norm,
1805-
debug_pdlp->x_average_norm,
1806-
beta,
1807-
primal_step,
1808-
dual_step);
1799+
fprintf(file, "%6d %16.12g %16.12g %16.12g %16.12g %16.12g %16.12g %16.12g %16.12g\n",
1800+
iter_num,
1801+
debug_pdlp->ax_norm,
1802+
debug_pdlp->aty_norm,
1803+
debug_pdlp->ax_average_norm,
1804+
debug_pdlp->aty_average_norm,
1805+
debug_pdlp->x_average_norm,
1806+
beta,
1807+
primal_step,
1808+
dual_step);
18091809
}
18101810

18111811
void debugPdlpFeasOptLog(FILE* file,
1812-
const int iter_num,
1813-
const double primal_obj, const double dual_obj,
1814-
const double gap, const double primal_feas, const double dual_feas,
1815-
const char* type) {
1812+
const int iter_num,
1813+
const double primal_obj, const double dual_obj,
1814+
const double gap, const double primal_feas, const double dual_feas,
1815+
const char* type) {
18161816
if (!file) return;
18171817
fprintf(file,
1818-
"%6d Feasibility-optimality %s\n"
1819-
" primal_obj = %11.4g\n"
1820-
" dual_obj = %11.4g\n"
1821-
" gap = %11.4g\n"
1822-
" primal_feas = %11.4g\n"
1823-
" dual_feas = %11.4g\n",
1824-
iter_num, type, primal_obj, dual_obj, gap, primal_feas, dual_feas);
1818+
"%6d Feasibility-optimality %s\n"
1819+
" primal_obj = %16.12g\n"
1820+
" dual_obj = %16.12g\n"
1821+
" gap = %16.12g\n"
1822+
" primal_feas = %16.12g\n"
1823+
" dual_feas = %16.12g\n",
1824+
iter_num, type, primal_obj, dual_obj, gap, primal_feas, dual_feas);
18251825
}
18261826

18271827
void debugPdlpRestartLog(FILE* file, const int iter_num, const double current_score, const double average_score) {

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 90 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1844,7 +1844,7 @@ void PDLPSolver::closeDebugLog() {
18441844
void PDLPSolver::setupGpu(){
18451845
//1. Initialize cuSPARSE
18461846
CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
1847-
1847+
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
18481848
//2. Get matrix data from lp_ (CSC)
18491849
a_num_rows_ = lp_.num_row_;
18501850
a_num_cols_ = lp_.num_col_;
@@ -1986,11 +1986,16 @@ void PDLPSolver::setupGpu(){
19861986
cudaFree(d_row_scale_); d_row_scale_ = nullptr;
19871987
}
19881988

1989+
size_t max_size = std::max(a_num_cols_, a_num_rows_);
1990+
CUDA_CHECK(cudaMalloc(&d_buffer_, max_size * sizeof(double)));
1991+
CUDA_CHECK(cudaMalloc(&d_buffer2_, max_size * sizeof(double)));
1992+
19891993
highsLogUser(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n");
19901994
}
19911995

19921996
void PDLPSolver::cleanupGpu(){
19931997
if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
1998+
if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
19941999
if (mat_a_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_csr_));
19952000
if (mat_a_T_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_T_csr_));
19962001
CUDA_CHECK(cudaFree(d_a_row_ptr_));
@@ -2029,6 +2034,8 @@ void PDLPSolver::cleanupGpu(){
20292034
CUDA_CHECK(cudaFree(d_dSlackNegAvg_));
20302035
if(d_col_scale_) CUDA_CHECK(cudaFree(d_col_scale_));
20312036
if(d_row_scale_) CUDA_CHECK(cudaFree(d_row_scale_));
2037+
CUDA_CHECK(cudaFree(d_buffer_));
2038+
CUDA_CHECK(cudaFree(d_buffer2_));
20322039
}
20332040

20342041
void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out){
@@ -2154,39 +2161,33 @@ bool PDLPSolver::checkConvergenceGpu(
21542161
return primal_feasible && dual_feasible && gap_small;
21552162

21562163
}
2157-
void PDLPSolver::computeStepSizeRatioGpu(PrimalDualParams& working_params) {
2158-
// 1. Compute ||x_last - x_current||^2 on GPU
2159-
launchKernelDiffTwoNormSquared_wrapper(d_x_at_last_restart_, d_x_current_, d_x_temp_diff_norm_result_, a_num_cols_);
2160-
2161-
double primal_diff_sq;
2162-
CUDA_CHECK(cudaMemcpy(&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2163-
double primal_diff_norm = std::sqrt(primal_diff_sq);
2164-
2165-
// 2. Compute ||y_last - y_current||^2 on GPU
2166-
launchKernelDiffTwoNormSquared_wrapper(d_y_at_last_restart_, d_y_current_, d_y_temp_diff_norm_result_, a_num_rows_);
2167-
2168-
double dual_diff_sq;
2169-
CUDA_CHECK(cudaMemcpy(&dual_diff_sq, d_y_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2170-
double dual_diff_norm = std::sqrt(dual_diff_sq);
2171-
2172-
double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
21732164

2174-
// 3. Compute new beta
2175-
if (std::min(primal_diff_norm, dual_diff_norm) > 1e-10) {
2176-
double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2177-
double old_beta = stepsize_.beta;
2178-
2179-
double dLogBetaUpdate =
2180-
0.5 * std::log(beta_update_ratio) + 0.5 * std::log(std::sqrt(old_beta));
2181-
stepsize_.beta = std::exp(2.0 * dLogBetaUpdate);
2182-
}
2165+
void PDLPSolver::computeStepSizeRatioGpu(PrimalDualParams& working_params) {
2166+
// 1. Compute ||x_last - x_current||^2 using cuBLAS
2167+
double primal_diff_norm = computeDiffNormCuBLAS(
2168+
d_x_at_last_restart_, d_x_current_, a_num_cols_);
2169+
2170+
// 2. Compute ||y_last - y_current||^2 using cuBLAS
2171+
double dual_diff_norm = computeDiffNormCuBLAS(
2172+
d_y_at_last_restart_, d_y_current_, a_num_rows_);
2173+
2174+
double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
2175+
2176+
// 3. Update beta (same CPU logic)
2177+
if (std::min(primal_diff_norm, dual_diff_norm) > 1e-10) {
2178+
double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2179+
double old_beta = stepsize_.beta;
2180+
double dLogBetaUpdate =
2181+
0.5 * std::log(beta_update_ratio) + 0.5 * std::log(std::sqrt(old_beta));
2182+
stepsize_.beta = std::exp(2.0 * dLogBetaUpdate);
2183+
}
21832184

2184-
// Update steps
2185-
stepsize_.primal_step = dMeanStepSize / std::sqrt(stepsize_.beta);
2186-
stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta;
2187-
working_params.eta = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
2188-
working_params.omega = std::sqrt(stepsize_.beta);
2189-
restart_scheme_.UpdateBeta(stepsize_.beta);
2185+
// Update step sizes
2186+
stepsize_.primal_step = dMeanStepSize / std::sqrt(stepsize_.beta);
2187+
stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta;
2188+
working_params.eta = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
2189+
working_params.omega = std::sqrt(stepsize_.beta);
2190+
restart_scheme_.UpdateBeta(stepsize_.beta);
21902191
}
21912192

21922193
void PDLPSolver::updateAverageIteratesGpu(int inner_iter) {
@@ -2211,33 +2212,65 @@ void PDLPSolver::computeAverageIterateGpu() {
22112212
linalgGpuATy(d_y_avg_, d_aty_avg_);
22122213
}
22132214

2214-
double PDLPSolver::computeMovementGpu(const double* d_x_new, const double* d_x_old,
2215-
const double* d_y_new, const double* d_y_old) {
2216-
// 1. Compute ||x_new - x_old||^2
2217-
launchKernelDiffTwoNormSquared_wrapper(d_x_new, d_x_old, d_x_temp_diff_norm_result_, a_num_cols_);
2218-
double primal_diff_sq;
2219-
CUDA_CHECK(cudaMemcpy(&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2220-
2221-
// 2. Compute ||y_new - y_old||^2
2222-
launchKernelDiffTwoNormSquared_wrapper(d_y_new, d_y_old, d_x_temp_diff_norm_result_, a_num_rows_);
2223-
double dual_diff_sq;
2224-
CUDA_CHECK(cudaMemcpy(&dual_diff_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2215+
double PDLPSolver::computeMovementGpu(
2216+
const double* d_x_new, const double* d_x_old,
2217+
const double* d_y_new, const double* d_y_old)
2218+
{
2219+
// 1. Compute ||x_new - x_old|| using cuBLAS
2220+
double primal_diff_norm = computeDiffNormCuBLAS(d_x_new, d_x_old, a_num_cols_);
2221+
2222+
// 2. Compute ||y_new - y_old|| using cuBLAS
2223+
double dual_diff_norm = computeDiffNormCuBLAS(d_y_new, d_y_old, a_num_rows_);
2224+
2225+
// 3. Combine on CPU
2226+
double primal_weight = std::sqrt(stepsize_.beta);
2227+
double primal_diff_sq = primal_diff_norm * primal_diff_norm;
2228+
double dual_diff_sq = dual_diff_norm * dual_diff_norm;
2229+
2230+
return (0.5 * primal_weight * primal_diff_sq) +
2231+
(0.5 / primal_weight * dual_diff_sq);
2232+
}
22252233

2226-
// 3. Combine scalar results on CPU
2227-
double primal_weight = std::sqrt(stepsize_.beta);
2228-
return (0.5 * primal_weight * primal_diff_sq) +
2229-
(0.5 / primal_weight) * dual_diff_sq;
2234+
double PDLPSolver::computeNonlinearityGpu(
2235+
const double* d_x_new, const double* d_x_old,
2236+
const double* d_aty_new, const double* d_aty_old)
2237+
{
2238+
// 1. Compute delta_x = x_new - x_old
2239+
CUDA_CHECK(cudaMemcpy(d_buffer_, d_x_new, a_num_cols_ * sizeof(double),
2240+
cudaMemcpyDeviceToDevice));
2241+
double alpha = -1.0;
2242+
CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha,
2243+
d_x_old, 1, d_buffer_, 1));
2244+
2245+
// 2. Compute delta_aty = aty_new - aty_old
2246+
CUDA_CHECK(cudaMemcpy(d_buffer2_, d_aty_new, a_num_cols_ * sizeof(double),
2247+
cudaMemcpyDeviceToDevice));
2248+
CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha,
2249+
d_aty_old, 1, d_buffer2_, 1));
2250+
2251+
// 3. Compute dot product: delta_x' * delta_aty
2252+
double result;
2253+
CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_cols_,
2254+
d_buffer_, 1, d_buffer2_, 1, &result));
2255+
2256+
return result;
22302257
}
22312258

2232-
double PDLPSolver::computeNonlinearityGpu(const double* d_x_new, const double* d_x_old,
2233-
const double* d_aty_new, const double* d_aty_old) {
2234-
// Compute dot( (x_new - x_old), (aty_new - aty_old) )
2235-
launchKernelDiffDotDiff_wrapper(d_x_new, d_x_old, d_aty_new, d_aty_old,
2236-
d_x_temp_diff_norm_result_, a_num_cols_);
2237-
2238-
double interaction;
2239-
CUDA_CHECK(cudaMemcpy(&interaction, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2240-
2241-
return interaction; // cupdlp does not take absolute value here, it handles fabs in the check
2259+
double PDLPSolver::computeDiffNormCuBLAS(
2260+
const double* d_a, const double* d_b, int n)
2261+
{
2262+
// 1. Copy a to buffer: buffer = a
2263+
CUDA_CHECK(cudaMemcpy(d_buffer_, d_a, n * sizeof(double),
2264+
cudaMemcpyDeviceToDevice));
2265+
2266+
// 2. buffer = buffer - b (using cuBLAS axpy)
2267+
double alpha = -1.0;
2268+
CUBLAS_CHECK(cublasDaxpy(cublas_handle_, n, &alpha, d_b, 1, d_buffer_, 1));
2269+
2270+
// 3. result = ||buffer||_2 (using cuBLAS nrm2)
2271+
double norm;
2272+
CUBLAS_CHECK(cublasDnrm2(cublas_handle_, n, d_buffer_, 1, &norm));
2273+
2274+
return norm;
22422275
}
22432276
#endif

highs/pdlp/hipdlp/pdhg.cu

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -182,33 +182,6 @@ __global__ void kernelCheckDual(
182182
atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
183183
}
184184

185-
__global__ void kernelDiffTwoNormSquared(
186-
const double* a, const double* b,
187-
double* result, int n){
188-
double local_diff_sq = 0.0;
189-
CUDA_GRID_STRIDE_LOOP(i, n){
190-
double diff = a[i] - b[i];
191-
local_diff_sq += diff * diff;
192-
}
193-
194-
atomicAdd(result, local_diff_sq);
195-
}
196-
197-
// Computes sum( (a_new[i] - a_old[i]) * (b_new[i] - b_old[i]) )
198-
__global__ void kernelDiffDotDiff(
199-
const double* a_new, const double* a_old,
200-
const double* b_new, const double* b_old,
201-
double* result, int n)
202-
{
203-
double local_sum = 0.0;
204-
CUDA_GRID_STRIDE_LOOP(i, n) {
205-
double diff_a = a_new[i] - a_old[i];
206-
double diff_b = b_new[i] - b_old[i];
207-
local_sum += diff_a * diff_b;
208-
}
209-
atomicAdd(result, local_sum);
210-
}
211-
212185
// Add C++ wrapper functions to launch the kernels
213186
extern "C" {
214187
void launchKernelUpdateX_wrapper(
@@ -305,29 +278,4 @@ void launchCheckConvergenceKernels_wrapper(
305278
cudaGetLastError();
306279
}
307280

308-
void launchKernelDiffTwoNormSquared_wrapper(
309-
const double* d_a, const double* d_b, double* d_result, int n) {
310-
311-
// Reset result on device first
312-
cudaMemset(d_result, 0, sizeof(double));
313-
314-
const int block_size = 256;
315-
dim3 config = GetLaunchConfig(n, block_size);
316-
kernelDiffTwoNormSquared<<<config.x, block_size>>>(d_a, d_b, d_result, n);
317-
cudaGetLastError();
318-
}
319-
320-
void launchKernelDiffDotDiff_wrapper(
321-
const double* d_a_new, const double* d_a_old,
322-
const double* d_b_new, const double* d_b_old,
323-
double* d_result, int n)
324-
{
325-
cudaMemset(d_result, 0, sizeof(double));
326-
const int block_size = 256;
327-
dim3 config = GetLaunchConfig(n, block_size);
328-
329-
kernelDiffDotDiff<<<config.x, block_size>>>(
330-
d_a_new, d_a_old, d_b_new, d_b_old, d_result, n);
331-
cudaGetLastError();
332-
}
333281
} // extern "C"

highs/pdlp/hipdlp/pdhg.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,18 @@ class PDLPSolver {
195195
} \
196196
} while (0)
197197

198+
#define CUBLAS_CHECK(call) \
199+
do { \
200+
cublasStatus_t status = call; \
201+
if (status != CUBLAS_STATUS_SUCCESS) { \
202+
fprintf(stderr, "cuBLAS Error at %s:%d: %d\n", \
203+
__FILE__, __LINE__, status); \
204+
exit(EXIT_FAILURE); \
205+
} \
206+
} while(0)
198207
// --- GPU Members ---
199208
cusparseHandle_t cusparse_handle_ = nullptr;
209+
cublasHandle_t cublas_handle_ = nullptr;
200210

201211
// Matrix A in CSR format (for Ax)
202212
cusparseSpMatDescr_t mat_a_csr_ = nullptr;
@@ -257,6 +267,8 @@ class PDLPSolver {
257267
size_t spmv_buffer_size_ax_ = 0;
258268
void* d_spmv_buffer_aty_ = nullptr;
259269
size_t spmv_buffer_size_aty_ = 0;
270+
double* d_buffer_; //for cublas
271+
double* d_buffer2_;
260272

261273
void launchKernelUpdateX(double primal_step);
262274
void launchKernelUpdateY(double dual_step);
@@ -270,6 +282,7 @@ class PDLPSolver {
270282

271283
double computeNonlinearityGpu(const double* d_x_new, const double* d_x_old,
272284
const double* d_aty_new, const double* d_aty_old);
285+
double computeDiffNormCuBLAS(const double* d_a, const double* d_b, int n);
273286
#endif
274287
};
275288

highs/pdlp/hipdlp/pdhg_kernels.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,7 @@ void launchCheckConvergenceKernels_wrapper(
3535
const double* d_col_scale, const double* d_row_scale,
3636
int n_cols, int n_rows);
3737

38-
void launchKernelDiffTwoNormSquared_wrapper(
39-
const double* d_a, const double* d_b, double* d_result, int n);
4038

41-
void launchKernelDiffDotDiff_wrapper(
42-
const double* d_a_new, const double* d_a_old,
43-
const double* d_b_new, const double* d_b_old,
44-
double* d_result, int n);
4539
#ifdef __cplusplus
4640
}
4741
#endif

0 commit comments

Comments
 (0)