Commit 6f235e5

Commit message: wip

1 parent 82a11ff
5 files changed: +92 −32 lines

check/TestPdlp.cpp

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {

 TEST_CASE("hi-pdlp", "[pdlp]") {
   std::string model =
-      "adlittle";  //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
+      "afiro";  //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
   std::string model_file =
       std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
   Highs h;
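For context, this hunk only retargets the "hi-pdlp" test from adlittle to the smaller afiro instance. The rest of the TEST_CASE body is not shown in the diff; a minimal sketch of how such a test presumably drives the PDLP solver through the public HiGHS API (readModel, setOptionValue("solver", "pdlp"), run — an assumption, not the actual test code) is:

  // Minimal sketch (assumption): load the chosen instance and solve it with
  // the PDLP first-order solver via the public HiGHS API.
  #include <string>
  #include "Highs.h"

  int run_hi_pdlp_sketch() {
    const std::string model = "afiro";  // instance this commit switches to
    const std::string model_file =
        std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
    Highs h;
    h.readModel(model_file);             // parse the MPS file
    h.setOptionValue("solver", "pdlp");  // select the PDLP solver
    const HighsStatus status = h.run();  // solve
    return status == HighsStatus::kOk ? 0 : 1;
  }

Here HIGHS_DIR is the build-time macro the check/ tests already use for locating instance files.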

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 76 additions & 24 deletions
@@ -518,6 +518,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     launchKernelUpdateX(0.0);
     CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
     CUDA_CHECK(cudaMemcpy(d_x_sum_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(d_x_avg_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
     linalgGpuAx(d_x_current_, d_ax_current_);
 #endif
@@ -570,6 +571,10 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     CUDA_CHECK(cudaMemcpy(ATy_cache_.data(), d_aty_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(x_sum_.data(), d_x_sum_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(y_sum_.data(), d_y_sum_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(y_avg_.data(), d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+
+
 #endif
     hipdlpTimerStart(kHipdlpClockAverageIterate);
     computeAverageIterate(Ax_avg, ATy_avg);
@@ -703,6 +708,11 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     debug_pdlp_data_.ax_norm = linalg::vector_norm(Ax_cache_);
     debug_pdlp_data_.aty_norm = linalg::vector_norm(ATy_cache_);

+#ifdef CUPDLP_GPU
+    CUDA_CHECK(cudaMemcpy(d_x_next_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(d_y_next_, d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
+#endif
+
     switch (params_.step_size_strategy) {
       case StepSizeStrategy::FIXED:
         updateIteratesFixed();
@@ -744,10 +754,22 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     updateAverageIterates(x_next_, y_next_, working_params, inner_iter);
     hipdlpTimerStop(kHipdlpClockAverageIterate);

+#ifdef CUPDLP_GPU
+    // Update average iterates on GPU
+    double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
+    launchKernelUpdateAverages(dMeanStepSize);
+    sum_weights_gpu_ += dMeanStepSize;
+#endif
+
     // --- 7. Prepare for next iteration ---
     x_current_ = x_next_;
     y_current_ = y_next_;
-    // iteration
+#ifdef CUPDLP_GPU
+    CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(d_y_current_, d_y_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(d_ax_current_, d_ax_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(d_aty_current_, d_aty_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
+#endif
   }

   // --- 8. Handle Max Iterations Reached ---
@@ -793,12 +815,6 @@ void PDLPSolver::initialize() {
   dSlackNeg_.resize(lp_.num_col_, 0.0);
   dSlackPosAvg_.resize(lp_.num_col_, 0.0);
   dSlackNegAvg_.resize(lp_.num_col_, 0.0);
-
-  CUDA_CHECK(cudaMemset(d_x_current_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_current_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
 }

 // Update primal weight
@@ -1483,19 +1499,39 @@ void PDLPSolver::updateIteratesFixed() {
   hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);

 #ifdef CUPDLP_GPU
-  launchKernelUpdateX(stepsize_.primal_step);
-  linalgGpuAx(d_x_next_, d_ax_next_);
-  launchKernelUpdateY(stepsize_.dual_step);
-  linalgGpuATy(d_y_next_, d_aty_next_);
+  // Add this check before the memcpy
+  if (d_x_next_ == nullptr) {
+    std::cerr << "Error1: d_x_next_ is null!" << std::endl;
+    return;
+  }
+  launchKernelUpdateX(stepsize_.primal_step);
+  CUDA_CHECK(cudaDeviceSynchronize());
+  linalgGpuAx(d_x_next_, d_ax_next_);
+  CUDA_CHECK(cudaDeviceSynchronize());
+  launchKernelUpdateY(stepsize_.dual_step);
+  CUDA_CHECK(cudaDeviceSynchronize());
+  linalgGpuATy(d_y_next_, d_aty_next_);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  // Add this check before the memcpy
+  if (d_x_next_ == nullptr) {
+    std::cerr << "Error2: d_x_next_ is null!" << std::endl;
+    return;
+  }

-  std::vector<double> x_next_gpu(a_num_cols_);
-  std::vector<double> y_next_gpu(a_num_rows_);
-  std::vector<double> ax_next_gpu(a_num_rows_);
-  std::vector<double> aty_next_gpu(a_num_cols_);
-  CUDA_CHECK(cudaMemcpy(x_next_gpu.data(), d_x_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(y_next_gpu.data(), d_y_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(ax_next_gpu.data(), d_ax_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(aty_next_gpu.data(), d_aty_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+  std::vector<double> x_next_gpu(a_num_cols_);
+  std::vector<double> y_next_gpu(a_num_rows_);
+  std::vector<double> ax_next_gpu(a_num_rows_);
+  std::vector<double> aty_next_gpu(a_num_cols_);
+  CUDA_CHECK(cudaMemcpy(x_next_gpu.data(), d_x_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(y_next_gpu.data(), d_y_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(ax_next_gpu.data(), d_ax_next_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(aty_next_gpu.data(), d_aty_next_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToHost));
+
+  bool x_match = vecDiff(x_next_gpu, x_next_, 1e-12, "UpdateIteratesFixed x");
+  bool y_match = vecDiff(y_next_gpu, y_next_, 1e-12, "UpdateIteratesFixed y");
+  bool ax_match = vecDiff(ax_next_gpu, Ax_next_, 1e-12, "UpdateIteratesFixed Ax");
+  bool aty_match = vecDiff(aty_next_gpu, ATy_next_, 1e-12, "UpdateIteratesFixed ATy");
 #endif
 }
@@ -1765,6 +1801,8 @@ void PDLPSolver::setupGpu(){
   CUDA_CHECK(cudaMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool)));
   CUDA_CHECK(cudaMalloc(&d_x_current_, a_num_cols_ * sizeof(double)));
   CUDA_CHECK(cudaMalloc(&d_y_current_, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_x_avg_, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
   CUDA_CHECK(cudaMalloc(&d_x_next_, a_num_cols_ * sizeof(double)));
   CUDA_CHECK(cudaMalloc(&d_y_next_, a_num_rows_ * sizeof(double)));
   CUDA_CHECK(cudaMalloc(&d_ax_current_, a_num_rows_ * sizeof(double)));
@@ -1816,6 +1854,19 @@ void PDLPSolver::setupGpu(){
   CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y));
   CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty));

+  CUDA_CHECK(cudaMemset(d_x_current_, 0, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_y_current_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_x_next_, 0, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
+  CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
+  sum_weights_gpu_ = 0.0;
+
   highsLogUser(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n");
 }
@@ -1902,11 +1953,12 @@ void PDLPSolver::launchKernelUpdateX(double primal_step) {
 }

 void PDLPSolver::launchKernelUpdateY(double dual_step) {
-  launchKernelUpdateY_wrapper(
-      d_y_next_, d_y_current_, d_ax_current_,
-      d_row_lower_, d_is_equality_row_,
-      dual_step, a_num_rows_);
-  CUDA_CHECK(cudaGetLastError());
+  launchKernelUpdateY_wrapper(
+      d_y_next_, d_y_current_,
+      d_ax_current_, d_ax_next_,
+      d_row_lower_, d_is_equality_row_,
+      dual_step, a_num_rows_);
+  CUDA_CHECK(cudaGetLastError());
 }

 void PDLPSolver::launchKernelUpdateAverages(double weight) {
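The new d_x_avg_/d_y_avg_ buffers and the sum_weights_gpu_ accumulator mirror the host-side averaging already done by updateAverageIterates: each inner iteration adds the iterate weighted by sqrt(primal_step * dual_step) to a running sum, and the averaged iterate is that sum divided by the accumulated weights. A host-side sketch of this arithmetic, as an assumption about what launchKernelUpdateAverages and computeAverageIterate compute (the kernel body is not part of this diff; the function names below are illustrative only):

  #include <cmath>
  #include <vector>

  // Weighted running sum of PDHG iterates, sketched on the host.
  // Assumption: the GPU kernel accumulates the same quantity with
  // weight = sqrt(primal_step * dual_step).
  void accumulateAverage(std::vector<double>& x_sum, double& sum_weights,
                         const std::vector<double>& x_next,
                         double primal_step, double dual_step) {
    const double w = std::sqrt(primal_step * dual_step);
    for (size_t i = 0; i < x_sum.size(); ++i) x_sum[i] += w * x_next[i];
    sum_weights += w;
  }

  // Recover the averaged iterate x_avg = x_sum / sum_weights.
  std::vector<double> averageIterate(const std::vector<double>& x_sum,
                                     double sum_weights) {
    std::vector<double> x_avg(x_sum.size());
    for (size_t i = 0; i < x_sum.size(); ++i)
      x_avg[i] = x_sum[i] / sum_weights;
    return x_avg;
  }

This also explains why setupGpu() now zeroes the sum buffers and resets sum_weights_gpu_: the running sums must start from zero before the first weighted accumulation.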

highs/pdlp/hipdlp/pdhg.cu

Lines changed: 7 additions & 5 deletions
@@ -34,7 +34,7 @@ __global__ void kernelUpdateX(
 // === KERNEL 2: Update Y (Dual Step) ===
 __global__ void kernelUpdateY(
     double* d_y_new, const double* d_y_old,
-    const double* d_ax_old, const double* d_ax_new,
+    const double* d_ax_old, const double* d_ax_new,
     const double* d_rhs, const bool* d_is_equality,
     double dual_step, int n_rows)
 {
@@ -80,11 +80,12 @@ void launchKernelUpdateX_wrapper(
       d_cost, d_lower, d_upper,
       primal_step, n_cols);

-  cudaGetLastError();  // or your error checking
+  cudaGetLastError();
 }
+
 void launchKernelUpdateY_wrapper(
     double* d_y_new, const double* d_y_old,
-    const double* d_ax_old, const double* d_ax_new,
+    const double* d_ax_old, const double* d_ax_new,
     const double* d_rhs, const bool* d_is_equality,
     double dual_step, int n_rows)
 {
@@ -97,8 +98,9 @@ void launchKernelUpdateY_wrapper(
       d_rhs, d_is_equality,
       dual_step, n_rows);

-  cudaGetLastError();  // or your error checking
+  cudaGetLastError();
 }
+
 void launchKernelUpdateAverages_wrapper(
     double* d_x_sum, double* d_y_sum,
     const double* d_x_next, const double* d_y_next,
@@ -111,6 +113,6 @@ void launchKernelUpdateAverages_wrapper(
       d_x_sum, d_y_sum,
       d_x_next, d_y_next,
       weight, n_cols, n_rows);
-  cudaGetLastError();  // or your error checking
+  cudaGetLastError();
 }
 }  // extern "C"
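The signature change from a single d_ax argument to the d_ax_old/d_ax_new pair (here and in launchKernelUpdateY in pdhg.cc) suggests the dual step now sees the extrapolated primal product 2·Ax_new − Ax_old, which is the standard PDHG dual update. A sketch of such a kernel body, as an assumption only — the actual kernelUpdateY implementation and its sign conventions are not visible in this diff:

  // Sketch (assumption): PDHG dual update with primal extrapolation,
  // given both the old and new primal products.
  __global__ void kernelUpdateYSketch(
      double* d_y_new, const double* d_y_old,
      const double* d_ax_old, const double* d_ax_new,
      const double* d_rhs, const bool* d_is_equality,
      double dual_step, int n_rows) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n_rows) return;
    // Extrapolated primal product: 2*Ax_{k+1} - Ax_k
    double ax_bar = 2.0 * d_ax_new[i] - d_ax_old[i];
    double y = d_y_old[i] + dual_step * (d_rhs[i] - ax_bar);
    // Equality rows keep an unconstrained multiplier; inequality rows are
    // projected to the nonnegative orthant.
    d_y_new[i] = d_is_equality[i] ? y : fmax(0.0, y);
  }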

highs/pdlp/hipdlp/pdhg.hpp

Lines changed: 5 additions & 0 deletions
@@ -220,6 +220,8 @@ class PDLPSolver {
   bool* d_is_equality_row_ = nullptr;
   double* d_x_current_ = nullptr;
   double* d_y_current_ = nullptr;
+  double* d_x_avg_ = nullptr;
+  double* d_y_avg_ = nullptr;
   double* d_x_next_ = nullptr;
   double* d_y_next_ = nullptr;
   double* d_ax_current_ = nullptr;  // Replaces host-side Ax_cache_
@@ -229,6 +231,9 @@ class PDLPSolver {
   double* d_x_sum_ = nullptr;
   double* d_y_sum_ = nullptr;

+  // States
+  double sum_weights_gpu_ = 0.0;
+
   // Temporary buffer for SpMV
   void* d_spmv_buffer_ax_ = nullptr;
   size_t spmv_buffer_size_ax_ = 0;

highs/pdlp/hipdlp/pdhg_kernels.h

Lines changed: 3 additions & 2 deletions
@@ -10,9 +10,10 @@ void launchKernelUpdateX_wrapper(
     double primal_step, int n_cols);

 void launchKernelUpdateY_wrapper(
-    double* d_y_new, const double* d_y_old, const double* d_ax,
+    double* d_y_new, const double* d_y_old,
+    const double* d_ax_old, const double* d_ax_new,
     const double* d_row_lower, const bool* d_is_equality,
-    double dual_step, int n_rows);
+    double dual_step, int n_rows);

 void launchKernelUpdateAverages_wrapper(
     double* d_x_sum, double* d_y_sum,
