refactor: compute y_obj as the projection of Ax - fact1*y onto [AL, AU] for improved clarity and performance

ChenKaihuang · ChenKaihuang · commit a76b9abbb012 · 2025-12-09T22:38:29.000+08:00
diff --git a/src/algorithm.jl b/src/algorithm.jl
@@ -528,9 +528,6 @@ function allocate_workspace_gpu(lp::LP_info_gpu, scaling_info::Scaling_info_gpu,
         ws.y_bar .= ws.y
         ws.last_y .= ws.y
 
-        # Compute y_obj based on y_bar
-        compute_y_obj_gpu!(ws.y_obj, ws.y_bar, ws.AL, ws.AU, m)
-
         # Compute z_bar = c - AT * y_bar
         CUDA.CUSPARSE.cusparseSpMV(ws.spmv_AT.handle, ws.spmv_AT.operator, ws.spmv_AT.alpha,
             ws.spmv_AT.desc_AT, ws.spmv_AT.desc_y_bar, ws.spmv_AT.beta, ws.spmv_AT.desc_ATy,
@@ -610,17 +607,6 @@ function allocate_workspace_cpu(lp::LP_info_cpu, scaling_info::Scaling_info_cpu,
         ws.y_bar .= scaled_y
         ws.last_y .= scaled_y
 
-        # Compute y_obj based on y_bar
-        for i in 1:m
-            y_bar_val = ws.y_bar[i]
-            if y_bar_val > 0.0
-                ws.y_obj[i] = ws.AL[i]
-            elseif y_bar_val < 0.0
-                ws.y_obj[i] = ws.AU[i]
-                # else keep y_obj[i] as 0
-            end
-        end
-
         # Compute z_bar = c - AT * y_bar
         mul!(ws.ATy, ws.AT, ws.y_bar)
         ws.z_bar .= ws.c .- ws.ATy
@@ -1138,6 +1124,23 @@ function solve(model::LP_info_cpu, params::HPRLP_parameters)
     # Power iteration to estimate lambda_max
     power_time = compute_maximum_eigenvalue!(lp, ws, params)
 
+    # Compute y_obj if initial_y was provided (now that lambda_max is known)
+    if params.initial_y !== nothing
+        fact1 = ws.lambda_max * ws.sigma
+        if params.use_gpu
+            # Compute Ax from x_bar via cuSPARSE SpMV (x_bar is the initial x; presumably zeros if none was provided)
+            CUDA.CUSPARSE.cusparseSpMV(ws.spmv_A.handle, ws.spmv_A.operator, ws.spmv_A.alpha,
+                ws.spmv_A.desc_A, ws.spmv_A.desc_x_bar, ws.spmv_A.beta, ws.spmv_A.desc_Ax,
+                ws.spmv_A.compute_type, ws.spmv_A.alg, ws.spmv_A.buf)
+            compute_y_obj_gpu!(ws.y_obj, ws.y, ws.AL, ws.AU, ws.Ax, fact1, ws.m)
+        else
+            # Compute Ax = A * x_bar (x_bar is the initial x; presumably zeros if none was provided)
+            mul!(ws.Ax, ws.A, ws.x_bar)
+            compute_y_obj_cpu!(ws.y_obj, ws.y, ws.AL, ws.AU, ws.Ax, fact1)
+        end
+    end
+
+
     if params.verbose
         println(" iter     errRp        errRd         p_obj            d_obj          gap         sigma       time")
     end
diff --git a/src/kernels.jl b/src/kernels.jl
@@ -11,29 +11,48 @@ function axpby_gpu!(a::Float64, x::CuVector{Float64}, b::Float64, y::CuVector{Fl
     @cuda threads = 256 blocks = ceil(Int, n / 256) axpby_kernel!(a, x, b, y, z, n)
 end
 
-# kernel to compute y_obj from y_bar for initialization
+# kernel to compute y_obj from y for initialization
+# Computes y_obj as the projection of v onto [AL, AU] where v = Ax - fact1 * y
 function compute_y_obj_kernel!(y_obj::CuDeviceVector{Float64},
-    y_bar::CuDeviceVector{Float64},
+    y::CuDeviceVector{Float64},
     AL::CuDeviceVector{Float64},
     AU::CuDeviceVector{Float64},
+    Ax::CuDeviceVector{Float64},
+    fact1::Float64,
     m::Int)
     i = threadIdx().x + (blockDim().x * (blockIdx().x - 1))
     if i <= m
         @inbounds begin
-            y_bar_val = y_bar[i]
-            if y_bar_val > 0.0
-                y_obj[i] = AL[i]
-            elseif y_bar_val < 0.0
-                y_obj[i] = AU[i]
-            # else keep y_obj[i] unchanged (already initialized to 0)
-            end
+            yi = y[i]
+            ai = Ax[i]
+            li = AL[i]
+            ui = AU[i]
+            v = ai - fact1 * yi
+            # Branchless projection: clamp(v, li, ui)
+            d = max(li - v, min(ui - v, 0.0))
+            y_obj[i] = v + d
         end
     end
     return
 end
 
-function compute_y_obj_gpu!(y_obj::CuVector{Float64}, y_bar::CuVector{Float64}, AL::CuVector{Float64}, AU::CuVector{Float64}, m::Int)
-    @cuda threads = 256 blocks = ceil(Int, m / 256) compute_y_obj_kernel!(y_obj, y_bar, AL, AU, m)
+function compute_y_obj_gpu!(y_obj::CuVector{Float64}, y::CuVector{Float64}, AL::CuVector{Float64}, AU::CuVector{Float64}, Ax::CuVector{Float64}, fact1::Float64, m::Int)
+    @cuda threads = 256 blocks = ceil(Int, m / 256) compute_y_obj_kernel!(y_obj, y, AL, AU, Ax, fact1, m)
+end
+
+function compute_y_obj_cpu!(y_obj::Vector{Float64}, y::Vector{Float64}, AL::Vector{Float64}, AU::Vector{Float64}, Ax::Vector{Float64}, fact1::Float64)
+    @simd for i in eachindex(y_obj)
+        @inbounds begin
+            yi = y[i]
+            ai = Ax[i]
+            li = AL[i]
+            ui = AU[i]
+            v = ai - fact1 * yi
+            # Branchless projection: clamp(v, li, ui)
+            d = max(li - v, min(ui - v, 0.0))
+            y_obj[i] = v + d
+        end
+    end
 end
 
 function combined_kernel_x_z_1!(dx::CuDeviceVector{Float64},