Skip to content

Commit cb95955

Browse files
committed
Updates in agentic examples
1 parent 0f1b4ed commit cb95955

File tree

7 files changed

+165
-101
lines changed

7 files changed

+165
-101
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
2424
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2525

2626
[sources]
27-
Dagger = {rev = "master", url = "https://github.com/JuliaParallel/Dagger.jl"}
27+
Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
2828

2929
[compat]
3030
BSON = "0.3"
33 KB
Binary file not shown.

examples/agentic/generate-cpu-linear-solver/solver.jl

Lines changed: 35 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,44 @@
1-
function proposed_fn(A::SparseMatrixCSC, b::AbstractVector)
2-
@assert size(A,1) == size(A,2) "A must be square"
3-
n = length(b)
4-
@assert size(A,2) == n "Dimensions of A and b must agree"
5-
6-
niters = 4
7-
8-
# Convert sparse matrix to dense double for accurate residual computation
9-
# and to dense single for fast factorization/solves with multithreaded BLAS.
10-
Ad64 = Array(A) # dense Float64
11-
Ad32 = Array{Float32}(undef, n, n)
12-
@inbounds for j in 1:n
13-
for i in 1:n
14-
Ad32[i,j] = Float32(Ad64[i,j])
15-
end
1+
function proposed_fn(A, b)
2+
# Cache Float32 factorizations and work buffers per matrix identity
3+
if !isdefined(@__MODULE__, :LU32_CACHE)
4+
global LU32_CACHE = IdDict{UInt64, Tuple{Any, Vector{Float64}, Vector{Float32}, Vector{Float32}}}()
165
end
176

18-
# Convert rhs to Float32 once
19-
b32 = Vector{Float32}(undef, n)
20-
@inbounds @simd for i in 1:n
21-
b32[i] = Float32(b[i])
22-
end
23-
24-
# Factorize dense single-precision matrix (uses LAPACK/BLAS and is multithreaded)
25-
F32 = lu(Ad32)
26-
27-
# Initial solve in single precision, in-place if possible
28-
x32 = copy(b32)
29-
try
30-
LinearAlgebra.ldiv!(F32, x32) # in-place: x32 <- Ad32 \ b32
31-
catch
32-
x32 = F32 \ b32 # fallback
7+
# Ensure b as Float64 vector (avoid copy if already Float64)
8+
b64 = eltype(b) === Float64 ? b : Vector{Float64}(b)
9+
n = length(b64)
10+
11+
key = objectid(A)
12+
F32, r64, work32, bf32 = get!(LU32_CACHE, key) do
13+
# Build a single-precision copy of the numeric values (structure reuse)
14+
nz32 = Float32.(A.nzval)
15+
Af = SparseMatrixCSC{Float32, Int}(size(A,1), size(A,2),
16+
copy(A.colptr), copy(A.rowval),
17+
nz32)
18+
F32_local = lu(Af) # single-precision sparse LU
19+
r64_local = Vector{Float64}(undef, n) # residual buffer (double)
20+
work32_local = Vector{Float32}(undef, n) # temp residual in single
21+
bf32_local = Vector{Float32}(undef, n) # temp right-hand side in single
22+
return (F32_local, r64_local, work32_local, bf32_local)
3323
end
3424

35-
# Promote to double precision for accumulation and residual computation
36-
x = Vector{Float64}(undef, n)
37-
@inbounds @simd for i in 1:n
38-
x[i] = Float64(x32[i])
25+
# Initial solve in single precision, accumulate in double
26+
@inbounds for i = 1:n
27+
bf32[i] = Float32(b64[i])
3928
end
40-
41-
# Preallocate working vectors
42-
r = similar(b) # Float64 residual
43-
r32 = Vector{Float32}(undef, n) # single-precision correction (in-place)
44-
45-
for iter in 1:niters
46-
# r = b - Ad64 * x (use BLAS for dense matvec)
47-
mul!(r, Ad64, x) # r = Ad64 * x
48-
@inbounds @simd for i in 1:n
49-
r[i] = b[i] - r[i]
50-
r32[i] = Float32(r[i])
29+
xf32 = F32 \ bf32
30+
x = Float64.(xf32)
31+
32+
# Iterative refinement: compute residual in double, solve correction in single, update double solution
33+
for _ = 1:5
34+
mul!(r64, A, x) # r64 = A * x (double)
35+
@inbounds for i = 1:n
36+
r64[i] = b64[i] - r64[i] # r64 = b - A*x
37+
work32[i] = Float32(r64[i]) # convert residual to single
5138
end
52-
53-
# Solve correction in single precision using the LU factorization
54-
try
55-
LinearAlgebra.ldiv!(F32, r32) # r32 <- Ad32 \ r32 (in-place)
56-
catch
57-
r32 = F32 \ r32 # fallback
58-
end
59-
60-
# Update double-precision solution
61-
@inbounds @simd for i in 1:n
62-
x[i] += Float64(r32[i])
39+
d32 = F32 \ work32
40+
@inbounds for i = 1:n
41+
x[i] += Float64(d32[i]) # update solution in double
6342
end
6443
end
6544

examples/agentic/generate-dagger-linear-solver/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ SmartSolve = "4fbb3a3c-2fa1-4c19-8d57-bae8bc1e16ac"
88
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
99

1010
[sources]
11-
Dagger = {rev = "master", url = "https://github.com/JuliaParallel/Dagger.jl"}
11+
Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
1212
SmartSolve = {path = "../../.."}

examples/agentic/generate-dagger-linear-solver/generate.jl

Lines changed: 88 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,97 @@ using BenchmarkTools
66
using Dagger
77

88
prompt = """
9-
Generate a high-performance Dagger.jl (https://juliaparallel.org/Dagger.jl/dev/) implementation in Julia of a linear solver for sparse matrices
10-
based on LU with iterative refinement (at least 5 refinement iterations), using the following
11-
reference: https://nhigham.com/2023/03/13/what-is-iterative-refinement
9+
- Task: Write a high-performance Julia implementation of a linear solver for sparse matrices, using Dagger.jl on the GPU. The solver must be based on Cholesky factorization with iterative refinement.
10+
11+
- Requirements
12+
13+
1) Libraries and references
14+
15+
1.1) Use Dagger.jl as documented here:
16+
https://juliaparallel.org/Dagger.jl/dev/
17+
18+
1.2) Follow the iterative refinement algorithm described here:
19+
https://nhigham.com/2023/03/13/what-is-iterative-refinement/
20+
21+
2) Dagger.jl + Cholesky
22+
23+
2.1) Dagger.jl already has a Cholesky routine that can be used for distributed linear solves: cholesky(A_d) where A_d is a distributed matrix (DMatrix). This implementation extends cholesky from LinearAlgebra.jl. See https://github.com/JuliaParallel/Dagger.jl/blob/master/src/array/cholesky.jl.
24+
25+
2.2) Use that Cholesky implementation within Dagger.jl (do not re-implement Cholesky from scratch).
26+
27+
2.3) The computation must be fully on GPU, using Dagger's GPU support.
28+
29+
2.4) Do not move data back to the CPU for intermediate computations.
30+
31+
2.5) All linear algebra operations (factorization, forward/back substitution, residual computation, refinement updates) must be performed on GPU-resident Dagger arrays.
32+
33+
3) Function API
34+
35+
3.1) Implement exactly one Julia function with the following signature:
36+
function proposed_fn(A_d, b_d)
37+
# your code here
38+
end
39+
A_d: distributed sparse matrix (Dagger-distributed, GPU-resident).
40+
b_d: distributed vector (Dagger-distributed, GPU-resident).
41+
x: the solution vector to be returned (Dagger-distributed, GPU-resident). You may treat x as an initial guess and overwrite it with the final refined solution.
42+
43+
3.2) The function must return the final solution x (and anything else you consider useful, e.g., a residual norm, but the first return value must be the solution).
44+
45+
4) Iterative refinement details
46+
47+
4.1) Use Cholesky factorization of A_d to compute an initial solution x₀.
48+
49+
4.2) Then apply iterative refinement:
50+
51+
4.2.1) At each iteration k, compute residual r_k = b_d - A_d * x_k on the GPU.
52+
4.2.2) Solve A_d d_k = r_k using the Cholesky factors (on GPU).
53+
4.2.3) Update x_k+1 = x_k + d_k on GPU.
54+
55+
4.3) Perform at least 5 refinement iterations (you can use a loop with a fixed number of iterations ≥ 5; optional extra stopping criteria are allowed but not required).
56+
57+
5) Performance and style constraints
58+
59+
5.1) Use Dagger tasks / computation graphs appropriately so that the Cholesky factorization and solves are executed in parallel where possible.
60+
61+
5.2) Avoid unnecessary data movement or conversions.
62+
63+
5.3) Do not use CPU-only arrays or operations (no Array, no collect to CPU, etc.).
64+
65+
5.4) Assume that using LinearAlgebra, using SparseArrays, and using Dagger have already been executed.
66+
67+
5.5) Focus on clarity and correctness first, but structure the code with performance in mind (e.g., reuse the Cholesky factors, avoid recomputing them each iteration).
68+
69+
6) Output format
70+
71+
6.1) Output only the Julia code for the function:
72+
73+
function proposed_fn(A_d, b_d)
74+
...
75+
end
76+
77+
6.2) Do not include any explanation, comments, or text outside the function definition.
78+
1279
"""
1380

1481
secret_key = ENV["OPENAI_API_KEY"]
15-
solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 5)
82+
solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 50)
1683

1784
println("Generated Code:\n")
1885
println(solver)
19-
write("solver.jl", solver)
86+
write("solver.jl", solver)
87+
88+
89+
90+
91+
# using Dagger, CUDA, LinearAlgebra
92+
# N = 2000
93+
# A = rand(N, N)
94+
# A = A * A'
95+
# A[diagind(A)] .+= size(A, 1)
96+
# A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
97+
# view(A, Blocks(500, 500))
98+
# end
99+
# b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
100+
# randn(Blocks(500), N)
101+
# end
102+
# cholesky(A_d) \ b_d

src/Agentic.jl

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ end
2222
proposed_fn(x) = x
2323
evaluator(x) = (true, "")
2424
function generate_default_code(prompt, secret_key, checker_filename;
25-
model = "gpt-5-mini", dev_prompt_fn = dev_prompt_maker, max_iters = 3)
25+
model = "gpt-5-mini",
26+
dev_prompt_fn = dev_prompt_maker,
27+
max_iters = 3)
2628
"""
2729
- checker_fn: proposed_fn -> check : Bool, performance_description : String
2830
"""
@@ -42,6 +44,8 @@ function generate_default_code(prompt, secret_key, checker_filename;
4244

4345
println("Iteration $iters")
4446

47+
println("Code:\n $gen_code")
48+
4549
# println(gen_code)
4650
push!(chat_history, Dict("role" => "assistant", "content" => gen_code))
4751
try
@@ -53,14 +57,15 @@ function generate_default_code(prompt, secret_key, checker_filename;
5357

5458
next_prompt = description_prompt_maker(check, performance_description)
5559
push!(chat_history, Dict("role" => "user", "content" => next_prompt))
56-
converged = ~(iters == max_iters)
60+
converged = ~(iters == max_iters)
5761
check && break
5862
catch e
5963
error_msg = sprint(showerror, e)
6064
st = sprint((io,v) -> show(io, "text/plain", v), stacktrace(catch_backtrace()))
6165

6266
next_prompt = error_prompt_maker(error_msg * "\n" * st)
6367
push!(chat_history, Dict("role" => "user", "content" => next_prompt))
68+
println("error: $error_msg\n$st")
6469
end
6570
converged = ~(iters == max_iters)
6671
end
@@ -84,14 +89,13 @@ function ls_cuda_dev_prompt_maker(fn_str)
8489
end
8590

8691
function ls_dagger_dev_prompt_maker(fn_str)
87-
return "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using CUDA." *
92+
return "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using Dagger.jl." *
8893
" The user will ask you to generate a function and use the following code to check if your solution is accurate and fast." *
8994
" Make sure the code you produce uses Dagger." *
9095
" Here is the code: \n" * fn_str * "\nOnly return the function. Make sure the function name is proposed_fn. Do not return extra text." *
9196
" Assume that LinearAlgebra and SparseArrays are already imported." *
9297
" Assume that Dagger is already imported." *
93-
" Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/" *
94-
" Use the following Dagger.jl implementation of Cholesky as an example: https://github.com/JuliaParallel/Dagger.jl/blob/67211816781d59109d74940550ca2d80af96b13d/src/array/cholesky.jl"
98+
" Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/"
9599
end
96100

97101
src_dir = @__DIR__

src/test_performance_dagger.jl

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,48 @@
1-
test_matrices = []
2-
N = 2000#10_000
3-
# push!(test_matrices, randn(N, N))
4-
# push!(test_matrices, randn(N, N))
5-
# push!(test_matrices, randn(N, N))
6-
Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
7-
push!(test_matrices, randn(Blocks(N, N), N, N))
8-
push!(test_matrices, randn(Blocks(N, N), N, N))
9-
push!(test_matrices, randn(Blocks(N, N), N, N))
10-
end
11-
12-
function evaluator(proposed_fn;
13-
err_threshold::Float64 = 1.0,
14-
runtime_threshold::Float64 = 1.1,
15-
alloc_threshold::Float64 = 0.0)
16-
error_ratios = Float64[]
1+
function evaluator( proposed_fn;
2+
err_threshold::Float64 = 1.0,
3+
runtime_threshold::Float64 = 1.1,
4+
alloc_threshold::Float64 = 0.0)
5+
N = 2000 #10_000
6+
error_ratios = Float64[]
177
runtime_ratios = Float64[]
188
alloc_ratios = Float64[]
19-
for A_d in test_matrices
20-
# right-hand side on CPU
21-
A_dim2 = size(A_d, 2)
9+
for _ in 1:3
10+
11+
# SPD Matrix
12+
A = randn(N, N)
13+
A = A*A' + N*I
14+
15+
# Right-hand side
16+
b = randn(N)
17+
18+
# SPD Matrix and right hand side on GPU (CUDA)
19+
A_cuda = CuArray(A)
20+
b_cuda = CuArray(b)
21+
22+
# SPD Matrix and right-hand side on GPU (Dagger Distributed)
23+
A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
24+
distribute(A, Blocks(N÷4, N÷4))
25+
end
2226
b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
23-
randn(Blocks(A_dim2), A_dim2)
27+
distribute(b, Blocks(N÷4))
2428
end
25-
# move to GPU; here we use a dense GPU matrix
26-
# If you have a sparse GPU solver, you can switch to CuSparseMatrixCSR(A_cpu)
27-
A_cuda = CuArray(collect(A_d))
28-
b_cuda = CuArray(collect(b_d))
29+
2930
# --- Solve once to ensure kernels are compiled (warm-up) ---
30-
x_default = similar(b_cuda)
31-
CUSOLVER.gesv!(x_default, A_cuda, b_cuda, irs_precision = "R_32F")
32-
x_gen = similar(b_d)
33-
Base.invokelatest(proposed_fn, x_gen, A_d, b_d)
31+
x_default = cholesky(A_cuda) \ b_cuda
32+
x_gen = Base.invokelatest(proposed_fn, A_d, b_d)
3433
CUDA.synchronize()
34+
3535
# --- Error ratios (all on GPU, scalars on CPU) ---
3636
err_default = norm(A_cuda * x_default - b_cuda)
37-
err_gen = norm(A_d * x_gen - b_d)
37+
err_gen = norm(A_d * x_gen - b_d)
3838
push!(error_ratios, err_default / err_gen)
3939
# --- Runtime ratios (GPU) ---
4040
b_default = @benchmark begin
41-
x = similar($b_cuda)
42-
CUSOLVER.gesv!($x, $A_cuda, $b_cuda, irs_precision = "R_32F")
41+
cholesky($A_cuda) \ $b_cuda
4342
CUDA.synchronize()
4443
end
4544
b_gen = @benchmark begin
46-
x = similar($b_d)
47-
Base.invokelatest($proposed_fn, $x, $A_d, $b_d)
45+
Base.invokelatest($proposed_fn, $A_d, $b_d)
4846
end
4947
push!(runtime_ratios, median(b_default.times) / median(b_gen.times))
5048
push!(alloc_ratios, median(b_default.allocs) / median(b_gen.allocs))

0 commit comments

Comments
 (0)