@@ -518,6 +518,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
518518 launchKernelUpdateX (0.0 );
519519 CUDA_CHECK (cudaMemcpy (d_x_current_, d_x_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
520520 CUDA_CHECK (cudaMemcpy (d_x_sum_, d_x_current_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
521+ CUDA_CHECK (cudaMemcpy (d_x_avg_, d_x_current_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
521522 linalgGpuAx (d_x_current_, d_ax_current_);
522523#endif
523524
@@ -570,6 +571,10 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
570571 CUDA_CHECK (cudaMemcpy (ATy_cache_.data (), d_aty_current_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
571572 CUDA_CHECK (cudaMemcpy (x_sum_.data (), d_x_sum_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
572573 CUDA_CHECK (cudaMemcpy (y_sum_.data (), d_y_sum_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
574+ CUDA_CHECK (cudaMemcpy (x_avg_.data (), d_x_avg_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
575+ CUDA_CHECK (cudaMemcpy (y_avg_.data (), d_y_avg_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
576+
577+
573578#endif
574579 hipdlpTimerStart (kHipdlpClockAverageIterate );
575580 computeAverageIterate (Ax_avg, ATy_avg);
@@ -703,6 +708,11 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
703708 debug_pdlp_data_.ax_norm = linalg::vector_norm (Ax_cache_);
704709 debug_pdlp_data_.aty_norm = linalg::vector_norm (ATy_cache_);
705710
711+ #ifdef CUPDLP_GPU
712+ CUDA_CHECK (cudaMemcpy (d_x_next_, d_x_current_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
713+ CUDA_CHECK (cudaMemcpy (d_y_next_, d_y_current_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToDevice));
714+ #endif
715+
706716 switch (params_.step_size_strategy ) {
707717 case StepSizeStrategy::FIXED:
708718 updateIteratesFixed ();
@@ -744,10 +754,22 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
744754 updateAverageIterates (x_next_, y_next_, working_params, inner_iter);
745755 hipdlpTimerStop (kHipdlpClockAverageIterate );
746756
757+ #ifdef CUPDLP_GPU
758+ // Update average iterates on GPU
759+ double dMeanStepSize = std::sqrt (stepsize_.primal_step * stepsize_.dual_step );
760+ launchKernelUpdateAverages (dMeanStepSize);
761+ sum_weights_gpu_ += dMeanStepSize;
762+ #endif
763+
747764 // --- 7. Prepare for next iteration ---
748765 x_current_ = x_next_;
749766 y_current_ = y_next_;
750- // iteration
767+ #ifdef CUPDLP_GPU
768+ CUDA_CHECK (cudaMemcpy (d_x_current_, d_x_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
769+ CUDA_CHECK (cudaMemcpy (d_y_current_, d_y_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToDevice));
770+ CUDA_CHECK (cudaMemcpy (d_ax_current_, d_ax_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToDevice));
771+ CUDA_CHECK (cudaMemcpy (d_aty_current_, d_aty_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToDevice));
772+ #endif
751773 }
752774
753775 // --- 8. Handle Max Iterations Reached ---
@@ -793,12 +815,6 @@ void PDLPSolver::initialize() {
793815 dSlackNeg_.resize (lp_.num_col_ , 0.0 );
794816 dSlackPosAvg_.resize (lp_.num_col_ , 0.0 );
795817 dSlackNegAvg_.resize (lp_.num_col_ , 0.0 );
796-
797- CUDA_CHECK (cudaMemset (d_x_current_, 0 , a_num_cols_ * sizeof (double )));
798- CUDA_CHECK (cudaMemset (d_y_current_, 0 , a_num_rows_ * sizeof (double )));
799- CUDA_CHECK (cudaMemset (d_x_sum_, 0 , a_num_cols_ * sizeof (double )));
800- CUDA_CHECK (cudaMemset (d_y_sum_, 0 , a_num_rows_ * sizeof (double )));
801- CUDA_CHECK (cudaMemset (d_aty_current_, 0 , a_num_cols_ * sizeof (double )));
802818}
803819
804820// Update primal weight
@@ -1483,19 +1499,39 @@ void PDLPSolver::updateIteratesFixed() {
14831499 hipdlpTimerStop (kHipdlpClockMatrixTransposeMultiply );
14841500
14851501#ifdef CUPDLP_GPU
1486- launchKernelUpdateX (stepsize_.primal_step );
1487- linalgGpuAx (d_x_next_, d_ax_next_);
1488- launchKernelUpdateY (stepsize_.dual_step );
1489- linalgGpuATy (d_y_next_, d_aty_next_);
1502+ // Add this check before the memcpy
1503+ if (d_x_next_ == nullptr ) {
1504+ std::cerr << " Error1: d_x_next_ is null!" << std::endl;
1505+ return ;
1506+ }
1507+ launchKernelUpdateX (stepsize_.primal_step );
1508+ CUDA_CHECK (cudaDeviceSynchronize ());
1509+ linalgGpuAx (d_x_next_, d_ax_next_);
1510+ CUDA_CHECK (cudaDeviceSynchronize ());
1511+ launchKernelUpdateY (stepsize_.dual_step );
1512+ CUDA_CHECK (cudaDeviceSynchronize ());
1513+ linalgGpuATy (d_y_next_, d_aty_next_);
1514+ CUDA_CHECK (cudaDeviceSynchronize ());
1515+
1516+ // Add this check before the memcpy
1517+ if (d_x_next_ == nullptr ) {
1518+ std::cerr << " Error2: d_x_next_ is null!" << std::endl;
1519+ return ;
1520+ }
14901521
1491- std::vector<double > x_next_gpu (a_num_cols_);
1492- std::vector<double > y_next_gpu (a_num_rows_);
1493- std::vector<double > ax_next_gpu (a_num_rows_);
1494- std::vector<double > aty_next_gpu (a_num_cols_);
1495- CUDA_CHECK (cudaMemcpy (x_next_gpu.data (), d_x_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
1496- CUDA_CHECK (cudaMemcpy (y_next_gpu.data (), d_y_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
1497- CUDA_CHECK (cudaMemcpy (ax_next_gpu.data (), d_ax_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
1498- CUDA_CHECK (cudaMemcpy (aty_next_gpu.data (), d_aty_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
1522+ std::vector<double > x_next_gpu (a_num_cols_);
1523+ std::vector<double > y_next_gpu (a_num_rows_);
1524+ std::vector<double > ax_next_gpu (a_num_rows_);
1525+ std::vector<double > aty_next_gpu (a_num_cols_);
1526+ CUDA_CHECK (cudaMemcpy (x_next_gpu.data (), d_x_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
1527+ CUDA_CHECK (cudaMemcpy (y_next_gpu.data (), d_y_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
1528+ CUDA_CHECK (cudaMemcpy (ax_next_gpu.data (), d_ax_next_, a_num_rows_ * sizeof (double ), cudaMemcpyDeviceToHost));
1529+ CUDA_CHECK (cudaMemcpy (aty_next_gpu.data (), d_aty_next_, a_num_cols_ * sizeof (double ), cudaMemcpyDeviceToHost));
1530+
1531+ bool x_match = vecDiff (x_next_gpu, x_next_, 1e-12 , " UpdateIteratesFixed x" );
1532+ bool y_match = vecDiff (y_next_gpu, y_next_, 1e-12 , " UpdateIteratesFixed y" );
1533+ bool ax_match = vecDiff (ax_next_gpu, Ax_next_, 1e-12 , " UpdateIteratesFixed Ax" );
1534+ bool aty_match = vecDiff (aty_next_gpu, ATy_next_, 1e-12 , " UpdateIteratesFixed ATy" );
14991535#endif
15001536}
15011537
@@ -1765,6 +1801,8 @@ void PDLPSolver::setupGpu(){
17651801 CUDA_CHECK (cudaMalloc (&d_is_equality_row_, a_num_rows_ * sizeof (bool )));
17661802 CUDA_CHECK (cudaMalloc (&d_x_current_, a_num_cols_ * sizeof (double )));
17671803 CUDA_CHECK (cudaMalloc (&d_y_current_, a_num_rows_ * sizeof (double )));
1804+ CUDA_CHECK (cudaMalloc (&d_x_avg_, a_num_cols_ * sizeof (double )));
1805+ CUDA_CHECK (cudaMalloc (&d_y_avg_, a_num_rows_ * sizeof (double )));
17681806 CUDA_CHECK (cudaMalloc (&d_x_next_, a_num_cols_ * sizeof (double )));
17691807 CUDA_CHECK (cudaMalloc (&d_y_next_, a_num_rows_ * sizeof (double )));
17701808 CUDA_CHECK (cudaMalloc (&d_ax_current_, a_num_rows_ * sizeof (double )));
@@ -1816,6 +1854,19 @@ void PDLPSolver::setupGpu(){
18161854 CUSPARSE_CHECK (cusparseDestroyDnVec (vec_y));
18171855 CUSPARSE_CHECK (cusparseDestroyDnVec (vec_aty));
18181856
1857+ CUDA_CHECK (cudaMemset (d_x_current_, 0 , a_num_cols_ * sizeof (double )));
1858+ CUDA_CHECK (cudaMemset (d_y_current_, 0 , a_num_rows_ * sizeof (double )));
1859+ CUDA_CHECK (cudaMemset (d_x_avg_, 0 , a_num_cols_ * sizeof (double )));
1860+ CUDA_CHECK (cudaMemset (d_y_avg_, 0 , a_num_rows_ * sizeof (double )));
1861+ CUDA_CHECK (cudaMemset (d_x_next_, 0 , a_num_cols_ * sizeof (double )));
1862+ CUDA_CHECK (cudaMemset (d_y_next_, 0 , a_num_rows_ * sizeof (double )));
1863+ CUDA_CHECK (cudaMemset (d_ax_current_, 0 , a_num_rows_ * sizeof (double )));
1864+ CUDA_CHECK (cudaMemset (d_ax_next_, 0 , a_num_rows_ * sizeof (double )));
1865+ CUDA_CHECK (cudaMemset (d_x_sum_, 0 , a_num_cols_ * sizeof (double )));
1866+ CUDA_CHECK (cudaMemset (d_y_sum_, 0 , a_num_rows_ * sizeof (double )));
1867+ CUDA_CHECK (cudaMemset (d_aty_current_, 0 , a_num_cols_ * sizeof (double )));
1868+ sum_weights_gpu_ = 0.0 ;
1869+
18191870 highsLogUser (params_.log_options_ , HighsLogType::kInfo , " GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred to device.\n " );
18201871}
18211872
@@ -1902,11 +1953,12 @@ void PDLPSolver::launchKernelUpdateX(double primal_step) {
19021953}
19031954
19041955void PDLPSolver::launchKernelUpdateY (double dual_step) {
1905- launchKernelUpdateY_wrapper (
1906- d_y_next_, d_y_current_, d_ax_current_,
1907- d_row_lower_, d_is_equality_row_,
1908- dual_step, a_num_rows_);
1909- CUDA_CHECK (cudaGetLastError ());
1956+ launchKernelUpdateY_wrapper (
1957+ d_y_next_, d_y_current_,
1958+ d_ax_current_, d_ax_next_,
1959+ d_row_lower_, d_is_equality_row_,
1960+ dual_step, a_num_rows_);
1961+ CUDA_CHECK (cudaGetLastError ());
19101962}
19111963
19121964void PDLPSolver::launchKernelUpdateAverages (double weight) {
0 commit comments