Skip to content

Commit 5676802

Browse files
committed
restart partially on GPU
1 parent 8305291 commit 5676802

File tree

4 files changed

+96
-0
lines changed

4 files changed

+96
-0
lines changed

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,13 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
691691

692692
Ax_cache_ = Ax_avg;
693693
ATy_cache_ = ATy_avg;
694+
695+
#ifdef CUPDLP_GPU
696+
CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_avg_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
697+
CUDA_CHECK(cudaMemcpy(d_y_current_, d_y_avg_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
698+
linalgGpuAx(d_x_current_, d_ax_current_);
699+
linalgGpuATy(d_y_current_, d_aty_current_);
700+
#endif
694701
} else {
695702
restart_scheme_.primal_feas_last_restart_ =
696703
current_results.primal_feasibility;
@@ -701,6 +708,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
701708
}
702709

703710
// Perform the primal weight update using z^{n,0} and z^{n-1,0}
711+
#ifdef CUPDLP_GPU
712+
computeStepSizeRatioGpu(working_params);
713+
#endif
704714
computeStepSizeRatio(working_params);
705715
current_eta_ = working_params.eta;
706716
restart_scheme_.passParams(&working_params);
@@ -712,6 +722,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
712722
std::fill(y_sum_.begin(), y_sum_.end(), 0.0);
713723
sum_weights_ = 0.0;
714724

725+
#ifdef CUPDLP_GPU
726+
CUDA_CHECK(cudaMemcpy(d_x_at_last_restart_, d_x_current_, a_num_cols_ * sizeof(double), cudaMemcpyDeviceToDevice));
727+
CUDA_CHECK(cudaMemcpy(d_y_at_last_restart_, d_y_current_, a_num_rows_ * sizeof(double), cudaMemcpyDeviceToDevice));
728+
CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
729+
CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
730+
sum_weights_gpu_ = 0.0;
731+
#endif
732+
715733
restart_scheme_.last_restart_iter_ = iter;
716734
// Recompute Ax and ATy for the restarted iterates
717735
hipdlpTimerStart(kHipdlpClockMatrixMultiply);
@@ -1851,6 +1869,10 @@ void PDLPSolver::setupGpu(){
18511869
CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
18521870
CUDA_CHECK(cudaMalloc(&d_x_next_, a_num_cols_ * sizeof(double)));
18531871
CUDA_CHECK(cudaMalloc(&d_y_next_, a_num_rows_ * sizeof(double)));
1872+
CUDA_CHECK(cudaMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double)));
1873+
CUDA_CHECK(cudaMalloc(&d_y_at_last_restart_, a_num_rows_ * sizeof(double)));
1874+
CUDA_CHECK(cudaMalloc(&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof(double)));
1875+
CUDA_CHECK(cudaMalloc(&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof(double)));
18541876
CUDA_CHECK(cudaMalloc(&d_ax_current_, a_num_rows_ * sizeof(double)));
18551877
CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
18561878
CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
@@ -1951,6 +1973,10 @@ void PDLPSolver::cleanupGpu(){
19511973
CUDA_CHECK(cudaFree(d_col_upper_));
19521974
CUDA_CHECK(cudaFree(d_row_lower_));
19531975
CUDA_CHECK(cudaFree(d_is_equality_row_));
1976+
CUDA_CHECK(cudaFree(d_x_at_last_restart_));
1977+
CUDA_CHECK(cudaFree(d_y_at_last_restart_));
1978+
CUDA_CHECK(cudaFree(d_x_temp_diff_norm_result_));
1979+
CUDA_CHECK(cudaFree(d_y_temp_diff_norm_result_));
19541980
CUDA_CHECK(cudaFree(d_x_current_));
19551981
CUDA_CHECK(cudaFree(d_y_current_));
19561982
CUDA_CHECK(cudaFree(d_x_next_));
@@ -2096,4 +2122,38 @@ bool PDLPSolver::checkConvergenceGpu(
20962122

20972123
return primal_feasible && dual_feasible && gap_small;
20982124

2125+
}
2126+
void PDLPSolver::computeStepSizeRatioGpu(PrimalDualParams& working_params) {
2127+
// 1. Compute ||x_last - x_current||^2 on GPU
2128+
launchKernelDiffTwoNormSquared_wrapper(d_x_at_last_restart_, d_x_current_, d_x_temp_diff_norm_result_, a_num_cols_);
2129+
2130+
double primal_diff_sq;
2131+
CUDA_CHECK(cudaMemcpy(&primal_diff_sq, d_x_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2132+
double primal_diff_norm = std::sqrt(primal_diff_sq);
2133+
2134+
// 2. Compute ||y_last - y_current||^2 on GPU
2135+
launchKernelDiffTwoNormSquared_wrapper(d_y_at_last_restart_, d_y_current_, d_y_temp_diff_norm_result_, a_num_rows_);
2136+
2137+
double dual_diff_sq;
2138+
CUDA_CHECK(cudaMemcpy(&dual_diff_sq, d_y_temp_diff_norm_result_, sizeof(double), cudaMemcpyDeviceToHost));
2139+
double dual_diff_norm = std::sqrt(dual_diff_sq);
2140+
2141+
double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
2142+
2143+
// 3. Compute new beta
2144+
if (std::min(primal_diff_norm, dual_diff_norm) > 1e-10) {
2145+
double beta_update_ratio = dual_diff_norm / primal_diff_norm;
2146+
double old_beta = stepsize_.beta;
2147+
2148+
double dLogBetaUpdate =
2149+
0.5 * std::log(beta_update_ratio) + 0.5 * std::log(std::sqrt(old_beta));
2150+
stepsize_.beta = std::exp(2.0 * dLogBetaUpdate);
2151+
}
2152+
2153+
// Update steps
2154+
stepsize_.primal_step = dMeanStepSize / std::sqrt(stepsize_.beta);
2155+
stepsize_.dual_step = stepsize_.primal_step * stepsize_.beta;
2156+
working_params.eta = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
2157+
working_params.omega = std::sqrt(stepsize_.beta);
2158+
restart_scheme_.UpdateBeta(stepsize_.beta);
20992159
}

highs/pdlp/hipdlp/pdhg.cu

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,19 @@ __global__ void kernelCheckDual(
182182
atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
183183
}
184184

185+
__global__ void kernelDiffTwoNormSquared(
186+
const double* a, const double* b,
187+
double* result, int n){
188+
double local_diff_sq = 0.0;
189+
CUDA_GRID_STRIDE_LOOP(i, n){
190+
double diff = a[i] - b[i];
191+
local_diff_sq += diff * diff;
192+
}
193+
194+
atomicAdd(result, local_diff_sq);
195+
}
196+
197+
185198
// Add C++ wrapper functions to launch the kernels
186199
extern "C" {
187200
void launchKernelUpdateX_wrapper(
@@ -277,4 +290,16 @@ void launchCheckConvergenceKernels_wrapper(
277290

278291
cudaGetLastError();
279292
}
293+
294+
void launchKernelDiffTwoNormSquared_wrapper(
295+
const double* d_a, const double* d_b, double* d_result, int n) {
296+
297+
// Reset result on device first
298+
cudaMemset(d_result, 0, sizeof(double));
299+
300+
const int block_size = 256;
301+
dim3 config = GetLaunchConfig(n, block_size);
302+
kernelDiffTwoNormSquared<<<config.x, block_size>>>(d_a, d_b, d_result, n);
303+
cudaGetLastError();
304+
}
280305
} // extern "C"

highs/pdlp/hipdlp/pdhg.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,10 @@ class PDLPSolver {
224224
double* d_y_avg_ = nullptr;
225225
double* d_x_next_ = nullptr;
226226
double* d_y_next_ = nullptr;
227+
double* d_x_at_last_restart_ = nullptr;
228+
double* d_y_at_last_restart_ = nullptr;
229+
double* d_x_temp_diff_norm_result_ = nullptr;
230+
double* d_y_temp_diff_norm_result_ = nullptr; // Temporary buffer for reduction result
227231
double* d_ax_current_ = nullptr; // Replaces host-side Ax_cache_
228232
double* d_aty_current_ = nullptr; // Replaces host-side ATy_cache_
229233
double* d_ax_next_ = nullptr;
@@ -242,6 +246,7 @@ class PDLPSolver {
242246
double* d_dSlackNegAvg_ = nullptr;
243247
double* d_col_scale_ = nullptr;
244248
double* d_row_scale_ = nullptr;
249+
double* d_reduction_result_ = nullptr; //size 1
245250
bool checkConvergenceGpu(const int iter,
246251
const double* d_x, const double* d_y,
247252
const double* d_ax, const double* d_aty,
@@ -257,6 +262,9 @@ class PDLPSolver {
257262
void launchKernelUpdateY(double dual_step);
258263
void launchKernelUpdateAverages(double weight);
259264
void launchKernelScaleVector(double* d_out, const double* d_in, double scale, int n);
265+
void computeStepSizeRatioGpu(PrimalDualParams& working_params);
266+
void updateAverageIteratesGpu(int inner_iter);
267+
void computeAverageIterateGpu();
260268
};
261269

262270
#endif

highs/pdlp/hipdlp/pdhg_kernels.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ void launchCheckConvergenceKernels_wrapper(
3434
const bool* d_is_equality,
3535
const double* d_col_scale, const double* d_row_scale,
3636
int n_cols, int n_rows);
37+
38+
void launchKernelDiffTwoNormSquared_wrapper(
39+
const double* d_a, const double* d_b, double* d_result, int n);
3740
#ifdef __cplusplus
3841
}
3942
#endif

0 commit comments

Comments
 (0)