Skip to content

Commit a76b9ab

Browse files
committed
refactor: update y_obj computation for improved clarity and performance
1 parent f06cb47 commit a76b9ab

File tree

2 files changed

+47
-25
lines changed

2 files changed

+47
-25
lines changed

src/algorithm.jl

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -528,9 +528,6 @@ function allocate_workspace_gpu(lp::LP_info_gpu, scaling_info::Scaling_info_gpu,
528528
ws.y_bar .= ws.y
529529
ws.last_y .= ws.y
530530

531-
# Compute y_obj based on y_bar
532-
compute_y_obj_gpu!(ws.y_obj, ws.y_bar, ws.AL, ws.AU, m)
533-
534531
# Compute z_bar = c - AT * y_bar
535532
CUDA.CUSPARSE.cusparseSpMV(ws.spmv_AT.handle, ws.spmv_AT.operator, ws.spmv_AT.alpha,
536533
ws.spmv_AT.desc_AT, ws.spmv_AT.desc_y_bar, ws.spmv_AT.beta, ws.spmv_AT.desc_ATy,
@@ -610,17 +607,6 @@ function allocate_workspace_cpu(lp::LP_info_cpu, scaling_info::Scaling_info_cpu,
610607
ws.y_bar .= scaled_y
611608
ws.last_y .= scaled_y
612609

613-
# Compute y_obj based on y_bar
614-
for i in 1:m
615-
y_bar_val = ws.y_bar[i]
616-
if y_bar_val > 0.0
617-
ws.y_obj[i] = ws.AL[i]
618-
elseif y_bar_val < 0.0
619-
ws.y_obj[i] = ws.AU[i]
620-
# else keep y_obj[i] as 0
621-
end
622-
end
623-
624610
# Compute z_bar = c - AT * y_bar
625611
mul!(ws.ATy, ws.AT, ws.y_bar)
626612
ws.z_bar .= ws.c .- ws.ATy
@@ -1138,6 +1124,23 @@ function solve(model::LP_info_cpu, params::HPRLP_parameters)
11381124
# Power iteration to estimate lambda_max
11391125
power_time = compute_maximum_eigenvalue!(lp, ws, params)
11401126

1127+
# Compute y_obj if initial_y was provided (now that lambda_max is known)
1128+
if params.initial_y !== nothing
1129+
fact1 = ws.lambda_max * ws.sigma
1130+
if params.use_gpu
1131+
# Compute Ax for initial x (or use zeros if no initial x)
1132+
CUDA.CUSPARSE.cusparseSpMV(ws.spmv_A.handle, ws.spmv_A.operator, ws.spmv_A.alpha,
1133+
ws.spmv_A.desc_A, ws.spmv_A.desc_x_bar, ws.spmv_A.beta, ws.spmv_A.desc_Ax,
1134+
ws.spmv_A.compute_type, ws.spmv_A.alg, ws.spmv_A.buf)
1135+
compute_y_obj_gpu!(ws.y_obj, ws.y, ws.AL, ws.AU, ws.Ax, fact1, ws.m)
1136+
else
1137+
# Compute Ax for initial x (or use zeros if no initial x)
1138+
mul!(ws.Ax, ws.A, ws.x_bar)
1139+
compute_y_obj_cpu!(ws.y_obj, ws.y, ws.AL, ws.AU, ws.Ax, fact1)
1140+
end
1141+
end
1142+
1143+
11411144
if params.verbose
11421145
println(" iter errRp errRd p_obj d_obj gap sigma time")
11431146
end

src/kernels.jl

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,29 +11,48 @@ function axpby_gpu!(a::Float64, x::CuVector{Float64}, b::Float64, y::CuVector{Fl
1111
@cuda threads = 256 blocks = ceil(Int, n / 256) axpby_kernel!(a, x, b, y, z, n)
1212
end
1313

14-
# kernel to compute y_obj from y_bar for initialization
14+
# kernel to compute y_obj from y for initialization
15+
# Computes y_obj as the projection of v onto [AL, AU] where v = Ax - fact1 * y
1516
function compute_y_obj_kernel!(y_obj::CuDeviceVector{Float64},
16-
y_bar::CuDeviceVector{Float64},
17+
y::CuDeviceVector{Float64},
1718
AL::CuDeviceVector{Float64},
1819
AU::CuDeviceVector{Float64},
20+
Ax::CuDeviceVector{Float64},
21+
fact1::Float64,
1922
m::Int)
2023
i = threadIdx().x + (blockDim().x * (blockIdx().x - 1))
2124
if i <= m
2225
@inbounds begin
23-
y_bar_val = y_bar[i]
24-
if y_bar_val > 0.0
25-
y_obj[i] = AL[i]
26-
elseif y_bar_val < 0.0
27-
y_obj[i] = AU[i]
28-
# else keep y_obj[i] unchanged (already initialized to 0)
29-
end
26+
yi = y[i]
27+
ai = Ax[i]
28+
li = AL[i]
29+
ui = AU[i]
30+
v = ai - fact1 * yi
31+
# Branchless projection: clamp(v, li, ui)
32+
d = max(li - v, min(ui - v, 0.0))
33+
y_obj[i] = v + d
3034
end
3135
end
3236
return
3337
end
3438

35-
function compute_y_obj_gpu!(y_obj::CuVector{Float64}, y_bar::CuVector{Float64}, AL::CuVector{Float64}, AU::CuVector{Float64}, m::Int)
36-
@cuda threads = 256 blocks = ceil(Int, m / 256) compute_y_obj_kernel!(y_obj, y_bar, AL, AU, m)
39+
function compute_y_obj_gpu!(y_obj::CuVector{Float64}, y::CuVector{Float64}, AL::CuVector{Float64}, AU::CuVector{Float64}, Ax::CuVector{Float64}, fact1::Float64, m::Int)
40+
@cuda threads = 256 blocks = ceil(Int, m / 256) compute_y_obj_kernel!(y_obj, y, AL, AU, Ax, fact1, m)
41+
end
42+
43+
function compute_y_obj_cpu!(y_obj::Vector{Float64}, y::Vector{Float64}, AL::Vector{Float64}, AU::Vector{Float64}, Ax::Vector{Float64}, fact1::Float64)
44+
@simd for i in eachindex(y_obj)
45+
@inbounds begin
46+
yi = y[i]
47+
ai = Ax[i]
48+
li = AL[i]
49+
ui = AU[i]
50+
v = ai - fact1 * yi
51+
# Branchless projection: clamp(v, li, ui)
52+
d = max(li - v, min(ui - v, 0.0))
53+
y_obj[i] = v + d
54+
end
55+
end
3756
end
3857

3958
function combined_kernel_x_z_1!(dx::CuDeviceVector{Float64},

0 commit comments

Comments
 (0)