computeMovementGpu and computeNonlinearityGpu

Yanyu000 · Yanyu000 · commit 21fa43ff6947 · 2025-12-02T11:55:04.000Z
diff --git a/check/TestPdlp.cpp b/check/TestPdlp.cpp
@@ -342,7 +342,7 @@ TEST_CASE("pdlp-restart-add-row", "[pdlp]") {
 
 TEST_CASE("hi-pdlp", "[pdlp]") {
   std::string model =
-      "shell";  //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p
+      "avgas";  //"adlittle";//"afiro";// shell// stair //25fv47 //fit2p //avgas
   std::string model_file =
       std::string(HIGHS_DIR) + "/check/instances/" + model + ".mps";
   Highs h;
@@ -354,8 +354,8 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
 
   HighsInt pdlp_features_off = 0 +
        kPdlpScalingOff +
-       kPdlpRestartOff +
-       kPdlpAdaptiveStepSizeOff
+       kPdlpRestartOff 
+       //kPdlpAdaptiveStepSizeOff
       ;
   h.setOptionValue("pdlp_features_off", pdlp_features_off);
 
@@ -366,7 +366,7 @@ TEST_CASE("hi-pdlp", "[pdlp]") {
   h.setOptionValue("pdlp_scaling_mode", pdlp_scaling);
   h.setOptionValue("pdlp_step_size_strategy", 1);
   h.setOptionValue("pdlp_restart_strategy", 2);
-  //h.setOptionValue("pdlp_iteration_limit", 100);
+  //h.setOptionValue("pdlp_iteration_limit", 10);
   //  h.setOptionValue("log_dev_level", kHighsLogDevLevelVerbose);
   auto start_hipdlp = std::chrono::high_resolution_clock::now();
   HighsStatus run_status = h.run();
diff --git a/highs/pdlp/hipdlp/pdhg.cc b/highs/pdlp/hipdlp/pdhg.cc
@@ -715,8 +715,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
         // Perform the primal weight update using z^{n,0} and z^{n-1,0}
 #ifdef CUPDLP_GPU
         computeStepSizeRatioGpu(working_params);
-#endif
+        double cpu_beta = stepsize_.beta;
+        double cpu_primal_step = stepsize_.primal_step;
+        double cpu_dual_step = stepsize_.dual_step;
+        double cpu_eta = working_params.eta;
+        double cpu_omega = working_params.omega;
+#else
         computeStepSizeRatio(working_params);
+#endif
         current_eta_ = working_params.eta;
         restart_scheme_.passParams(&working_params);
 
@@ -1641,6 +1647,25 @@ void PDLPSolver::updateIteratesAdaptive() {
     double primal_step_update = dStepSizeUpdate / std::sqrt(stepsize_.beta);
     double dual_step_update = dStepSizeUpdate * std::sqrt(stepsize_.beta);
 
+#ifdef CUPDLP_GPU
+    launchKernelUpdateX_wrapper(
+        d_x_next_,          // Output (Trial)
+        d_x_current_,       // Input (Base)
+        d_aty_current_,     // Input (Base ATy)
+        d_col_cost_, d_col_lower_, d_col_upper_,
+        primal_step_update, a_num_cols_);
+    linalgGpuAx(d_x_next_, d_ax_next_);
+    launchKernelUpdateY_wrapper(
+        d_y_next_,          // Output (Trial)
+        d_y_current_,       // Input (Base)
+        d_ax_current_,      // Input (Base Ax)
+        d_ax_next_,         // Input (Trial Ax)
+        d_row_lower_, d_is_equality_row_,
+        dual_step_update, a_num_rows_);
+    linalgGpuATy(d_y_next_, d_aty_next_);   
+    double movement = computeMovementGpu(d_x_next_, d_x_current_, d_y_next_, d_y_current_);
+    double nonlinearity = computeNonlinearityGpu(d_x_next_, d_x_current_, d_aty_next_, d_aty_current_);
+#else
     // Primal update
     hipdlpTimerStart(kHipdlpClockProjectX);
     xupdate = updateX(x_candidate, aty_candidate, primal_step_update);
@@ -1686,6 +1711,7 @@ void PDLPSolver::updateIteratesAdaptive() {
     // Compute movement and nonlinearity
     double movement = computeMovement(delta_x, delta_y);
     double nonlinearity = computeNonlinearity(delta_x, delta_aty);
+#endif
     // Compute step size limit
     double step_size_limit = (nonlinearity != 0.0)
                                  ? (movement / std::fabs(nonlinearity))
@@ -2184,4 +2210,34 @@ void PDLPSolver::computeAverageIterateGpu() {
   linalgGpuAx(d_x_avg_, d_ax_avg_);
   linalgGpuATy(d_y_avg_, d_aty_avg_);
 }
+
+// Weighted squared distance between the iterates (x_old, y_old) and (x_new, y_new):
+//   0.5 * w * ||x_new - x_old||^2 + (0.5 / w) * ||y_new - y_old||^2,  w = sqrt(stepsize_.beta).
+// Each squared norm is produced on the device in d_x_temp_diff_norm_result_ and read back
+// to the host before that scratch value is reused for the next one.
+double PDLPSolver::computeMovementGpu(const double* d_x_new, const double* d_x_old,
+                                      const double* d_y_new, const double* d_y_old) {
+  double dx_norm_sq;
+  launchKernelDiffTwoNormSquared_wrapper(d_x_new, d_x_old, d_x_temp_diff_norm_result_, a_num_cols_);
+  CUDA_CHECK(cudaMemcpy(&dx_norm_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
+
+  double dy_norm_sq;
+  launchKernelDiffTwoNormSquared_wrapper(d_y_new, d_y_old, d_x_temp_diff_norm_result_, a_num_rows_);
+  CUDA_CHECK(cudaMemcpy(&dy_norm_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
+
+  const double weight = std::sqrt(stepsize_.beta);
+  return (0.5 * weight * dx_norm_sq) + (0.5 / weight) * dy_norm_sq;
+}
+
+// Signed cross term dot(x_new - x_old, aty_new - aty_old), summed over a_num_cols_ entries on
+// the device and read back to the host. No absolute value is taken here, mirroring cupdlp;
+// the caller applies std::fabs when it forms the step-size limit.
+double PDLPSolver::computeNonlinearityGpu(const double* d_x_new, const double* d_x_old,
+                                          const double* d_aty_new, const double* d_aty_old) {
+  launchKernelDiffDotDiff_wrapper(d_x_new, d_x_old, d_aty_new, d_aty_old,
+                                  d_x_temp_diff_norm_result_, a_num_cols_);
+  double cross_term;
+  CUDA_CHECK(cudaMemcpy(&cross_term, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
+  return cross_term;
+}
 #endif
diff --git a/highs/pdlp/hipdlp/pdhg.cu b/highs/pdlp/hipdlp/pdhg.cu
@@ -194,6 +194,20 @@ __global__ void kernelDiffTwoNormSquared(
   atomicAdd(result, local_diff_sq);
 }
 
+// Adds sum_i (a_new[i] - a_old[i]) * (b_new[i] - b_old[i]) to *result.
+// Every thread totals the products for the indices CUDA_GRID_STRIDE_LOOP gives it and then
+// issues one atomicAdd; *result is only added to here, so it must hold the desired starting
+// value (the host wrapper zeroes it) before the launch.
+__global__ void kernelDiffDotDiff(
+    const double* a_new, const double* a_old,
+    const double* b_new, const double* b_old,
+    double* result, int n) {
+  double partial = 0.0;
+  CUDA_GRID_STRIDE_LOOP(idx, n) {
+    partial += (a_new[idx] - a_old[idx]) * (b_new[idx] - b_old[idx]);
+  }
+  atomicAdd(result, partial);
+}
 
 // Add C++ wrapper functions to launch the kernels
 extern "C" {
@@ -302,4 +316,18 @@ void launchKernelDiffTwoNormSquared_wrapper(
     kernelDiffTwoNormSquared<<<config.x, block_size>>>(d_a, d_b, d_result, n);
     cudaGetLastError();
 }
+
+// Host launcher for kernelDiffDotDiff: zeroes *d_result, then launches the kernel over n
+// entries with 256 threads per block. NOTE(review): the statuses returned by cudaMemset and
+// cudaGetLastError are discarded, so a failed memset or launch goes unreported here.
+void launchKernelDiffDotDiff_wrapper(const double* d_a_new, const double* d_a_old,
+                                     const double* d_b_new, const double* d_b_old,
+                                     double* d_result, int n) {
+    cudaMemset(d_result, 0, sizeof(double));
+    const int threads_per_block = 256;
+    const dim3 grid = GetLaunchConfig(n, threads_per_block);
+    kernelDiffDotDiff<<<grid.x, threads_per_block>>>(
+        d_a_new, d_a_old, d_b_new, d_b_old, d_result, n);
+    cudaGetLastError();
+}
 } // extern "C"
diff --git a/highs/pdlp/hipdlp/pdhg.hpp b/highs/pdlp/hipdlp/pdhg.hpp
@@ -265,6 +265,11 @@ class PDLPSolver {
   void computeStepSizeRatioGpu(PrimalDualParams& working_params);
   void updateAverageIteratesGpu(int inner_iter);
   void computeAverageIterateGpu();
+  double computeMovementGpu(const double* d_x_new, const double* d_x_old,
+                            const double* d_y_new, const double* d_y_old);  // weighted ||dx||^2 and ||dy||^2
+  // computeNonlinearityGpu: signed dot((x_new - x_old), (aty_new - aty_old)) copied back to the host.
+  double computeNonlinearityGpu(const double* d_x_new, const double* d_x_old,
+                                const double* d_aty_new, const double* d_aty_old);
 #endif
 };
 
diff --git a/highs/pdlp/hipdlp/pdhg_kernels.h b/highs/pdlp/hipdlp/pdhg_kernels.h
@@ -37,6 +37,11 @@ void launchCheckConvergenceKernels_wrapper(
 
 void launchKernelDiffTwoNormSquared_wrapper(
     const double* d_a, const double* d_b, double* d_result, int n);
+// Zeroes *d_result, then launches the kernel that adds sum_i (a_new[i]-a_old[i])*(b_new[i]-b_old[i]).
+void launchKernelDiffDotDiff_wrapper(
+    const double* d_a_new, const double* d_a_old,
+    const double* d_b_new, const double* d_b_old,
+    double* d_result, int n);
 #ifdef __cplusplus
 }
 #endif