Skip to content

Commit cb95955

Browse files
committed
Updates in agentic examples
1 parent 0f1b4ed commit cb95955

File tree

7 files changed

+165
-101
lines changed

7 files changed

+165
-101
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
2424
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2525

2626
[sources]
27-
Dagger = {rev = "master", url = "https://github.com/JuliaParallel/Dagger.jl"}
27+
Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
2828

2929
[compat]
3030
BSON = "0.3"
33 KB
Binary file not shown.

examples/agentic/generate-cpu-linear-solver/solver.jl

Lines changed: 35 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,44 @@
1-
function proposed_fn(A::SparseMatrixCSC, b::AbstractVector)
2-
@assert size(A,1) == size(A,2) "A must be square"
3-
n = length(b)
4-
@assert size(A,2) == n "Dimensions of A and b must agree"
5-
6-
niters = 4
7-
8-
# Convert sparse matrix to dense double for accurate residual computation
9-
# and to dense single for fast factorization/solves with multithreaded BLAS.
10-
Ad64 = Array(A) # dense Float64
11-
Ad32 = Array{Float32}(undef, n, n)
12-
@inbounds for j in 1:n
13-
for i in 1:n
14-
Ad32[i,j] = Float32(Ad64[i,j])
15-
end
1+
function proposed_fn(A, b)
2+
# Cache Float32 factorizations and work buffers per matrix identity
3+
if !isdefined(@__MODULE__, :LU32_CACHE)
4+
global LU32_CACHE = IdDict{UInt64, Tuple{Any, Vector{Float64}, Vector{Float32}, Vector{Float32}}}()
165
end
176

18-
# Convert rhs to Float32 once
19-
b32 = Vector{Float32}(undef, n)
20-
@inbounds @simd for i in 1:n
21-
b32[i] = Float32(b[i])
22-
end
23-
24-
# Factorize dense single-precision matrix (uses LAPACK/BLAS and is multithreaded)
25-
F32 = lu(Ad32)
26-
27-
# Initial solve in single precision, in-place if possible
28-
x32 = copy(b32)
29-
try
30-
LinearAlgebra.ldiv!(F32, x32) # in-place: x32 <- Ad32 \ b32
31-
catch
32-
x32 = F32 \ b32 # fallback
7+
# Ensure b as Float64 vector (avoid copy if already Float64)
8+
b64 = eltype(b) === Float64 ? b : Vector{Float64}(b)
9+
n = length(b64)
10+
11+
key = objectid(A)
12+
F32, r64, work32, bf32 = get!(LU32_CACHE, key) do
13+
# Build a single-precision copy of the numeric values (structure reuse)
14+
nz32 = Float32.(A.nzval)
15+
Af = SparseMatrixCSC{Float32, Int}(size(A,1), size(A,2),
16+
copy(A.colptr), copy(A.rowval),
17+
nz32)
18+
F32_local = lu(Af) # single-precision sparse LU
19+
r64_local = Vector{Float64}(undef, n) # residual buffer (double)
20+
work32_local = Vector{Float32}(undef, n) # temp residual in single
21+
bf32_local = Vector{Float32}(undef, n) # temp right-hand side in single
22+
return (F32_local, r64_local, work32_local, bf32_local)
3323
end
3424

35-
# Promote to double precision for accumulation and residual computation
36-
x = Vector{Float64}(undef, n)
37-
@inbounds @simd for i in 1:n
38-
x[i] = Float64(x32[i])
25+
# Initial solve in single precision, accumulate in double
26+
@inbounds for i = 1:n
27+
bf32[i] = Float32(b64[i])
3928
end
40-
41-
# Preallocate working vectors
42-
r = similar(b) # Float64 residual
43-
r32 = Vector{Float32}(undef, n) # single-precision correction (in-place)
44-
45-
for iter in 1:niters
46-
# r = b - Ad64 * x (use BLAS for dense matvec)
47-
mul!(r, Ad64, x) # r = Ad64 * x
48-
@inbounds @simd for i in 1:n
49-
r[i] = b[i] - r[i]
50-
r32[i] = Float32(r[i])
29+
xf32 = F32 \ bf32
30+
x = Float64.(xf32)
31+
32+
# Iterative refinement: compute residual in double, solve correction in single, update double solution
33+
for _ = 1:5
34+
mul!(r64, A, x) # r64 = A * x (double)
35+
@inbounds for i = 1:n
36+
r64[i] = b64[i] - r64[i] # r64 = b - A*x
37+
work32[i] = Float32(r64[i]) # convert residual to single
5138
end
52-
53-
# Solve correction in single precision using the LU factorization
54-
try
55-
LinearAlgebra.ldiv!(F32, r32) # r32 <- Ad32 \ r32 (in-place)
56-
catch
57-
r32 = F32 \ r32 # fallback
58-
end
59-
60-
# Update double-precision solution
61-
@inbounds @simd for i in 1:n
62-
x[i] += Float64(r32[i])
39+
d32 = F32 \ work32
40+
@inbounds for i = 1:n
41+
x[i] += Float64(d32[i]) # update solution in double
6342
end
6443
end
6544

examples/agentic/generate-dagger-linear-solver/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ SmartSolve = "4fbb3a3c-2fa1-4c19-8d57-bae8bc1e16ac"
88
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
99

1010
[sources]
11-
Dagger = {rev = "master", url = "https://github.com/JuliaParallel/Dagger.jl"}
11+
Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
1212
SmartSolve = {path = "../../.."}

examples/agentic/generate-dagger-linear-solver/generate.jl

Lines changed: 88 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,97 @@ using BenchmarkTools
66
using Dagger
77

88
prompt = """
9-
Generate a high-performance Dagger.jl (https://juliaparallel.org/Dagger.jl/dev/) implementation in Julia of a linear solver for sparse matrices
10-
based on LU with iterative refinement (at least 5 refinement iterations), using the following
11-
reference: https://nhigham.com/2023/03/13/what-is-iterative-refinement
9+
- Task: Write a high-performance Julia implementation of a linear solver for sparse matrices, using Dagger.jl on the GPU. The solver must be based on Cholesky factorization with iterative refinement.
10+
11+
- Requirements
12+
13+
1) Libraries and references
14+
15+
1.1) Use Dagger.jl as documented here:
16+
https://juliaparallel.org/Dagger.jl/dev/
17+
18+
1.2) Follow the iterative refinement algorithm described here:
19+
https://nhigham.com/2023/03/13/what-is-iterative-refinement/
20+
21+
2) Dagger.jl + Cholesky
22+
23+
2.1) Dagger.jl already has a Cholesky routine that can be used for distributed linear solves: cholesky(A_d) where A_d is a distributed matrix (DMatrix). This implementation extends cholesky from LinearAlgebra.jl. See https://github.com/JuliaParallel/Dagger.jl/blob/master/src/array/cholesky.jl.
24+
25+
2.2) Use that Cholesky implementation within Dagger.jl (do not re-implement Cholesky from scratch).
26+
27+
2.3) The computation must be fully on GPU, using Dagger's GPU support.
28+
29+
2.4) Do not move data back to the CPU for intermediate computations.
30+
31+
2.5) All linear algebra operations (factorization, forward/back substitution, residual computation, refinement updates) must be performed on GPU-resident Dagger arrays.
32+
33+
3) Function API
34+
35+
3.1) Implement exactly one Julia function with the following signature:
36+
function proposed_fn(A_d, b_d)
37+
# your code here
38+
end
39+
A_d: distributed sparse matrix (Dagger-distributed, GPU-resident).
40+
b_d: distributed vector (Dagger-distributed, GPU-resident).
41+
x: the solution vector to be returned (Dagger-distributed, GPU-resident). You may treat x as an initial guess and overwrite it with the final refined solution.
42+
43+
3.2) The function must return the final solution x (and anything else you consider useful, e.g., a residual norm, but the first return value must be the solution).
44+
45+
4) Iterative refinement details
46+
47+
4.1) Use Cholesky factorization of A_d to compute an initial solution x₀.
48+
49+
4.2) Then apply iterative refinement:
50+
51+
4.2.1) At each iteration k, compute residual r_k = b_d - A_d * x_k on the GPU.
52+
4.2.2) Solve A_d d_k = r_k using the Cholesky factors (on GPU).
53+
4.2.3) Update x_k+1 = x_k + d_k on GPU.
54+
55+
4.3) Perform at least 5 refinement iterations (you can use a loop with a fixed number of iterations ≥ 5; optional extra stopping criteria are allowed but not required).
56+
57+
5) Performance and style constraints
58+
59+
5.1) Use Dagger tasks / computation graphs appropriately so that the Cholesky factorization and solves are executed in parallel where possible.
60+
61+
5.2) Avoid unnecessary data movement or conversions.
62+
63+
5.3) Do not use CPU-only arrays or operations (no Array, no collect to CPU, etc.).
64+
65+
5.4) Assume that using LinearAlgebra, using SparseArrays, and using Dagger have already been executed.
66+
67+
5.5) Focus on clarity and correctness first, but structure the code with performance in mind (e.g., reuse the Cholesky factors, avoid recomputing them each iteration).
68+
69+
6) Output format
70+
71+
6.1) Output only the Julia code for the function:
72+
73+
function proposed_fn(A_d, b_d)
74+
...
75+
end
76+
77+
6.2) Do not include any explanation, comments, or text outside the function definition.
78+
1279
"""
1380

1481
secret_key = ENV["OPENAI_API_KEY"]
15-
solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 5)
82+
solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 50)
1683

1784
println("Generated Code:\n")
1885
println(solver)
19-
write("solver.jl", solver)
86+
write("solver.jl", solver)
87+
88+
89+
90+
91+
# using Dagger, CUDA, LinearAlgebra
92+
# N = 2000
93+
# A = rand(N, N)
94+
# A = A * A'
95+
# A[diagind(A)] .+= size(A, 1)
96+
# A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
97+
# view(A, Blocks(500, 500))
98+
# end
99+
# b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
100+
# randn(Blocks(500), N)
101+
# end
102+
# cholesky(A_d) \ b_d

src/Agentic.jl

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ end
2222
proposed_fn(x) = x
2323
evaluator(x) = (true, "")
2424
function generate_default_code(prompt, secret_key, checker_filename;
25-
model = "gpt-5-mini", dev_prompt_fn = dev_prompt_maker, max_iters = 3)
25+
model = "gpt-5-mini",
26+
dev_prompt_fn = dev_prompt_maker,
27+
max_iters = 3)
2628
"""
2729
- checker_fn: proposed_fn -> check : Bool, performance_description : String
2830
"""
@@ -42,6 +44,8 @@ function generate_default_code(prompt, secret_key, checker_filename;
4244

4345
println("Iteration $iters")
4446

47+
println("Code:\n $gen_code")
48+
4549
# println(gen_code)
4650
push!(chat_history, Dict("role" => "assistant", "content" => gen_code))
4751
try
@@ -53,14 +57,15 @@ function generate_default_code(prompt, secret_key, checker_filename;
5357

5458
next_prompt = description_prompt_maker(check, performance_description)
5559
push!(chat_history, Dict("role" => "user", "content" => next_prompt))
56-
converged = ~(iters == max_iters)
60+
converged = ~(iters == max_iters)
5761
check && break
5862
catch e
5963
error_msg = sprint(showerror, e)
6064
st = sprint((io,v) -> show(io, "text/plain", v), stacktrace(catch_backtrace()))
6165

6266
next_prompt = error_prompt_maker(error_msg * "\n" * st)
6367
push!(chat_history, Dict("role" => "user", "content" => next_prompt))
68+
println("error: $error_msg\n$st")
6469
end
6570
converged = ~(iters == max_iters)
6671
end
@@ -84,14 +89,13 @@ function ls_cuda_dev_prompt_maker(fn_str)
8489
end
8590

8691
function ls_dagger_dev_prompt_maker(fn_str)
87-
return "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using CUDA." *
92+
return "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using Dagger.jl." *
8893
" The user will ask you to generate a function and use the following code to check if your solution is accurate and fast." *
8994
" Make sure the code you produce uses Dagger." *
9095
" Here is the code: \n" * fn_str * "\nOnly return the function. Make sure the function name is proposed_fn. Do not return extra text." *
9196
" Assume that LinearAlgebra and SparseArrays are already imported." *
9297
" Assume that Dagger is already imported." *
93-
" Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/" *
94-
" Use the following Dagger.jl implementation of Cholesky as an example: https://github.com/JuliaParallel/Dagger.jl/blob/67211816781d59109d74940550ca2d80af96b13d/src/array/cholesky.jl"
98+
" Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/"
9599
end
96100

97101
src_dir = @__DIR__

src/test_performance_dagger.jl

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,48 @@
1-
test_matrices = []
2-
N = 2000#10_000
3-
# push!(test_matrices, randn(N, N))
4-
# push!(test_matrices, randn(N, N))
5-
# push!(test_matrices, randn(N, N))
6-
Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
7-
push!(test_matrices, randn(Blocks(N, N), N, N))
8-
push!(test_matrices, randn(Blocks(N, N), N, N))
9-
push!(test_matrices, randn(Blocks(N, N), N, N))
10-
end
11-
12-
function evaluator(proposed_fn;
13-
err_threshold::Float64 = 1.0,
14-
runtime_threshold::Float64 = 1.1,
15-
alloc_threshold::Float64 = 0.0)
16-
error_ratios = Float64[]
1+
function evaluator( proposed_fn;
2+
err_threshold::Float64 = 1.0,
3+
runtime_threshold::Float64 = 1.1,
4+
alloc_threshold::Float64 = 0.0)
5+
N = 2000 #10_000
6+
error_ratios = Float64[]
177
runtime_ratios = Float64[]
188
alloc_ratios = Float64[]
19-
for A_d in test_matrices
20-
# right-hand side on CPU
21-
A_dim2 = size(A_d, 2)
9+
for _ in 1:3
10+
11+
# SPD Matrix
12+
A = randn(N, N)
13+
A = A*A' + N*I
14+
15+
# Right-hand side
16+
b = randn(N)
17+
18+
# SPD Matrix and right hand side on GPU (CUDA)
19+
A_cuda = CuArray(A)
20+
b_cuda = CuArray(b)
21+
22+
# SPD Matrix and right-hand side on GPU (Dagger Distributed)
23+
A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
24+
distribute(A, Blocks(N÷4, N÷4))
25+
end
2226
b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
23-
randn(Blocks(A_dim2), A_dim2)
27+
distribute(b, Blocks(N÷4))
2428
end
25-
# move to GPU; here we use a dense GPU matrix
26-
# If you have a sparse GPU solver, you can switch to CuSparseMatrixCSR(A_cpu)
27-
A_cuda = CuArray(collect(A_d))
28-
b_cuda = CuArray(collect(b_d))
29+
2930
# --- Solve once to ensure kernels are compiled (warm-up) ---
30-
x_default = similar(b_cuda)
31-
CUSOLVER.gesv!(x_default, A_cuda, b_cuda, irs_precision = "R_32F")
32-
x_gen = similar(b_d)
33-
Base.invokelatest(proposed_fn, x_gen, A_d, b_d)
31+
x_default = cholesky(A_cuda) \ b_cuda
32+
x_gen = Base.invokelatest(proposed_fn, A_d, b_d)
3433
CUDA.synchronize()
34+
3535
# --- Error ratios (all on GPU, scalars on CPU) ---
3636
err_default = norm(A_cuda * x_default - b_cuda)
37-
err_gen = norm(A_d * x_gen - b_d)
37+
err_gen = norm(A_d * x_gen - b_d)
3838
push!(error_ratios, err_default / err_gen)
3939
# --- Runtime ratios (GPU) ---
4040
b_default = @benchmark begin
41-
x = similar($b_cuda)
42-
CUSOLVER.gesv!($x, $A_cuda, $b_cuda, irs_precision = "R_32F")
41+
cholesky($A_cuda) \ $b_cuda
4342
CUDA.synchronize()
4443
end
4544
b_gen = @benchmark begin
46-
x = similar($b_d)
47-
Base.invokelatest($proposed_fn, $x, $A_d, $b_d)
45+
Base.invokelatest($proposed_fn, $A_d, $b_d)
4846
end
4947
push!(runtime_ratios, median(b_default.times) / median(b_gen.times))
5048
push!(alloc_ratios, median(b_default.allocs) / median(b_gen.allocs))

0 commit comments

Comments
 (0)