Skip to content

Commit 5352275

Browse files
Implement sparse + sparse matrix addition with transpose/adjoint support for CSC, CSR, and COO formats (#27)
* Initial plan * Add sparse + sparse matrix addition for CSC, CSR, and COO formats Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Fix scalar indexing issue and add sparse+sparse benchmarks Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Fix COO addition to merge duplicates and remove unnecessary test dependency Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Add transpose/adjoint support for sparse matrix addition Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Simplify transpose/adjoint addition by using existing conversion methods Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Consolidate transpose/adjoint tests into main addition testset using iterator pattern Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Replace allowed_getindex with only() for GPU compatibility Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Simplify test structure by merging conditionals Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> * Fix allowscalar issue --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: albertomercurio <61953577+albertomercurio@users.noreply.github.com> Co-authored-by: Alberto Mercurio <alberto.mercurio96@gmail.com>
1 parent 131b789 commit 5352275

File tree

11 files changed

+896
-0
lines changed

11 files changed

+896
-0
lines changed

benchmarks/matrix_benchmarks.jl

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,66 @@ function benchmark_sparse_dense_add!(
228228
return nothing
229229
end
230230

231+
"""
232+
benchmark_sparse_sparse_add!(SUITE, array_constructor, array_type_name; N=10000, T=Float64)
233+
234+
Benchmark sparse + sparse matrix addition for CSC, CSR, and COO formats.
235+
236+
# Arguments
237+
- `SUITE`: The BenchmarkGroup to add benchmarks to
238+
- `array_constructor`: Function to construct arrays (e.g., `Array`, `JLArray`)
239+
- `array_type_name`: String name for the array type (for display)
240+
241+
# Keyword Arguments
242+
- `N`: Size of the matrix (default: 10000)
243+
- `T`: Element type (default: Float64)
244+
"""
245+
function benchmark_sparse_sparse_add!(
246+
SUITE,
247+
array_constructor,
248+
array_type_name;
249+
N = 10000,
250+
T = Float64,
251+
)
252+
# Create two sparse matrices with 1% density
253+
sm_a_csc_std = sprand(T, N, N, 0.01)
254+
sm_b_csc_std = sprand(T, N, N, 0.01)
255+
256+
# Convert to different formats
257+
sm_a_csc = DeviceSparseMatrixCSC(sm_a_csc_std)
258+
sm_b_csc = DeviceSparseMatrixCSC(sm_b_csc_std)
259+
sm_a_csr = DeviceSparseMatrixCSR(sm_a_csc_std)
260+
sm_b_csr = DeviceSparseMatrixCSR(sm_b_csc_std)
261+
sm_a_coo = DeviceSparseMatrixCOO(sm_a_csc_std)
262+
sm_b_coo = DeviceSparseMatrixCOO(sm_b_csc_std)
263+
264+
# Adapt to device
265+
dsm_a_csc = adapt(array_constructor, sm_a_csc)
266+
dsm_b_csc = adapt(array_constructor, sm_b_csc)
267+
dsm_a_csr = adapt(array_constructor, sm_a_csr)
268+
dsm_b_csr = adapt(array_constructor, sm_b_csr)
269+
dsm_a_coo = adapt(array_constructor, sm_a_coo)
270+
dsm_b_coo = adapt(array_constructor, sm_b_coo)
271+
272+
# Level 3: Format (CSC, CSR, COO - will be plotted together)
273+
SUITE["Sparse + Sparse Addition"][array_type_name]["CSC"] = @benchmarkable begin
274+
$dsm_a_csc + $dsm_b_csc
275+
_synchronize_backend($dsm_a_csc)
276+
end
277+
278+
SUITE["Sparse + Sparse Addition"][array_type_name]["CSR"] = @benchmarkable begin
279+
$dsm_a_csr + $dsm_b_csr
280+
_synchronize_backend($dsm_a_csr)
281+
end
282+
283+
SUITE["Sparse + Sparse Addition"][array_type_name]["COO"] = @benchmarkable begin
284+
$dsm_a_coo + $dsm_b_coo
285+
_synchronize_backend($dsm_a_coo)
286+
end
287+
288+
return nothing
289+
end
290+
231291
"""
232292
benchmark_kron!(SUITE, array_constructor, array_type_name; N=100, T=Float64)
233293

benchmarks/runbenchmarks.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ benchmark_matrix_vector_mul!(SUITE, Array, "Array")
2626
benchmark_matrix_matrix_mul!(SUITE, Array, "Array")
2727
benchmark_three_arg_dot!(SUITE, Array, "Array")
2828
benchmark_sparse_dense_add!(SUITE, Array, "Array")
29+
benchmark_sparse_sparse_add!(SUITE, Array, "Array")
2930
benchmark_kron!(SUITE, Array, "Array")
3031
benchmark_conversions!(SUITE, Array, "Array")
3132

@@ -37,6 +38,7 @@ benchmark_matrix_vector_mul!(SUITE, JLArray, "JLArray")
3738
benchmark_matrix_matrix_mul!(SUITE, JLArray, "JLArray")
3839
benchmark_three_arg_dot!(SUITE, JLArray, "JLArray")
3940
benchmark_sparse_dense_add!(SUITE, JLArray, "JLArray")
41+
benchmark_sparse_sparse_add!(SUITE, JLArray, "JLArray")
4042
benchmark_kron!(SUITE, JLArray, "JLArray")
4143
benchmark_conversions!(SUITE, JLArray, "JLArray")
4244

src/matrix_coo/matrix_coo.jl

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,216 @@ function _add_sparse_to_dense!(C::DenseMatrix, A::DeviceSparseMatrixCOO)
309309
return C
310310
end
311311

312+
"""
313+
+(A::DeviceSparseMatrixCOO, B::DeviceSparseMatrixCOO)
314+
315+
Add two sparse matrices in COO format. Both matrices must have the same dimensions
316+
and be on the same backend (device).
317+
318+
The result is a COO matrix with entries from both A and B properly merged,
319+
with duplicate entries (same row and column) combined by summing their values.
320+
321+
# Examples
322+
```jldoctest
323+
julia> using DeviceSparseArrays, SparseArrays
324+
325+
julia> A = DeviceSparseMatrixCOO(sparse([1, 2], [1, 2], [1.0, 2.0], 2, 2));
326+
327+
julia> B = DeviceSparseMatrixCOO(sparse([1, 2], [2, 1], [3.0, 4.0], 2, 2));
328+
329+
julia> C = A + B;
330+
331+
julia> collect(C)
332+
2×2 Matrix{Float64}:
333+
1.0 3.0
334+
4.0 2.0
335+
```
336+
"""
337+
function Base.:+(A::DeviceSparseMatrixCOO, B::DeviceSparseMatrixCOO)
338+
size(A) == size(B) || throw(
339+
DimensionMismatch(
340+
"dimensions must match: A has dims $(size(A)), B has dims $(size(B))",
341+
),
342+
)
343+
344+
backend_A = get_backend(A)
345+
backend_B = get_backend(B)
346+
backend_A == backend_B ||
347+
throw(ArgumentError("Both matrices must have the same backend"))
348+
349+
m, n = size(A)
350+
Ti = eltype(getrowind(A))
351+
Tv = promote_type(eltype(nonzeros(A)), eltype(nonzeros(B)))
352+
353+
# Concatenate the coordinate arrays
354+
nnz_A = nnz(A)
355+
nnz_B = nnz(B)
356+
nnz_concat = nnz_A + nnz_B
357+
358+
# Allocate concatenated arrays
359+
rowind_concat = similar(getrowind(A), nnz_concat)
360+
colind_concat = similar(getcolind(A), nnz_concat)
361+
nzval_concat = similar(nonzeros(A), Tv, nnz_concat)
362+
363+
# Copy entries from A and B
364+
rowind_concat[1:nnz_A] .= getrowind(A)
365+
colind_concat[1:nnz_A] .= getcolind(A)
366+
nzval_concat[1:nnz_A] .= nonzeros(A)
367+
rowind_concat[(nnz_A+1):end] .= getrowind(B)
368+
colind_concat[(nnz_A+1):end] .= getcolind(B)
369+
nzval_concat[(nnz_A+1):end] .= nonzeros(B)
370+
371+
# Sort by (row, col) using keys similar to COO->CSC conversion
372+
backend = backend_A
373+
keys = similar(rowind_concat, Ti, nnz_concat)
374+
kernel_make_keys! = kernel_make_csc_keys!(backend)
375+
kernel_make_keys!(keys, rowind_concat, colind_concat, m; ndrange = (nnz_concat,))
376+
377+
# Sort using AcceleratedKernels
378+
perm = _sortperm_AK(keys)
379+
380+
# Apply permutation to get sorted arrays
381+
rowind_sorted = rowind_concat[perm]
382+
colind_sorted = colind_concat[perm]
383+
nzval_sorted = nzval_concat[perm]
384+
385+
# Mark unique entries (first occurrence of each (row, col) pair)
386+
keep_mask = similar(rowind_sorted, Bool, nnz_concat)
387+
kernel_mark! = kernel_mark_unique_coo!(backend)
388+
kernel_mark!(keep_mask, rowind_sorted, colind_sorted, nnz_concat; ndrange = (nnz_concat,))
389+
390+
# Compute write indices using cumsum
391+
write_indices = _cumsum_AK(keep_mask)
392+
nnz_final = @allowscalar write_indices[nnz_concat]
393+
394+
# Allocate final arrays
395+
rowind_C = similar(getrowind(A), nnz_final)
396+
colind_C = similar(getcolind(A), nnz_final)
397+
nzval_C = similar(nonzeros(A), Tv, nnz_final)
398+
399+
# Compact: merge duplicates by summing
400+
kernel_compact! = kernel_compact_coo!(backend)
401+
kernel_compact!(
402+
rowind_C,
403+
colind_C,
404+
nzval_C,
405+
rowind_sorted,
406+
colind_sorted,
407+
nzval_sorted,
408+
write_indices,
409+
nnz_concat;
410+
ndrange = (nnz_concat,),
411+
)
412+
413+
return DeviceSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
414+
end
415+
416+
# Addition with transpose/adjoint support
417+
for (wrapa, transa, conja, unwrapa, whereT1) in trans_adj_wrappers(:DeviceSparseMatrixCOO)
418+
for (wrapb, transb, conjb, unwrapb, whereT2) in trans_adj_wrappers(:DeviceSparseMatrixCOO)
419+
# Skip the case where both are not transposed (already handled above)
420+
(transa == false && transb == false) && continue
421+
422+
TypeA = wrapa(:(T1))
423+
TypeB = wrapb(:(T2))
424+
425+
@eval function Base.:+(A::$TypeA, B::$TypeB) where {$(whereT1(:T1)),$(whereT2(:T2))}
426+
size(A) == size(B) || throw(
427+
DimensionMismatch(
428+
"dimensions must match: A has dims $(size(A)), B has dims $(size(B))",
429+
),
430+
)
431+
432+
_A = $(unwrapa(:A))
433+
_B = $(unwrapb(:B))
434+
435+
backend_A = get_backend(_A)
436+
backend_B = get_backend(_B)
437+
backend_A == backend_B ||
438+
throw(ArgumentError("Both matrices must have the same backend"))
439+
440+
m, n = size(A)
441+
Ti = eltype(getrowind(_A))
442+
Tv = promote_type(eltype(nonzeros(_A)), eltype(nonzeros(_B)))
443+
444+
# For transposed COO, swap row and column indices
445+
nnz_A = nnz(_A)
446+
nnz_B = nnz(_B)
447+
nnz_concat = nnz_A + nnz_B
448+
449+
# Allocate concatenated arrays
450+
rowind_concat = similar(getrowind(_A), nnz_concat)
451+
colind_concat = similar(getcolind(_A), nnz_concat)
452+
nzval_concat = similar(nonzeros(_A), Tv, nnz_concat)
453+
454+
# Copy entries from A (potentially swapping row/col for transpose)
455+
if $transa
456+
rowind_concat[1:nnz_A] .= getcolind(_A) # Swap for transpose
457+
colind_concat[1:nnz_A] .= getrowind(_A)
458+
else
459+
rowind_concat[1:nnz_A] .= getrowind(_A)
460+
colind_concat[1:nnz_A] .= getcolind(_A)
461+
end
462+
if $conja
463+
nzval_concat[1:nnz_A] .= conj.(nonzeros(_A))
464+
else
465+
nzval_concat[1:nnz_A] .= nonzeros(_A)
466+
end
467+
468+
# Copy entries from B (potentially swapping row/col for transpose)
469+
if $transb
470+
rowind_concat[(nnz_A+1):end] .= getcolind(_B) # Swap for transpose
471+
colind_concat[(nnz_A+1):end] .= getrowind(_B)
472+
else
473+
rowind_concat[(nnz_A+1):end] .= getrowind(_B)
474+
colind_concat[(nnz_A+1):end] .= getcolind(_B)
475+
end
476+
if $conjb
477+
nzval_concat[(nnz_A+1):end] .= conj.(nonzeros(_B))
478+
else
479+
nzval_concat[(nnz_A+1):end] .= nonzeros(_B)
480+
end
481+
482+
# Sort and compact (same as before)
483+
backend = backend_A
484+
keys = similar(rowind_concat, Ti, nnz_concat)
485+
kernel_make_keys! = kernel_make_csc_keys!(backend)
486+
kernel_make_keys!(keys, rowind_concat, colind_concat, m; ndrange = (nnz_concat,))
487+
488+
perm = _sortperm_AK(keys)
489+
rowind_sorted = rowind_concat[perm]
490+
colind_sorted = colind_concat[perm]
491+
nzval_sorted = nzval_concat[perm]
492+
493+
keep_mask = similar(rowind_sorted, Bool, nnz_concat)
494+
kernel_mark! = kernel_mark_unique_coo!(backend)
495+
kernel_mark!(keep_mask, rowind_sorted, colind_sorted, nnz_concat; ndrange = (nnz_concat,))
496+
497+
write_indices = _cumsum_AK(keep_mask)
498+
nnz_final = @allowscalar write_indices[nnz_concat]
499+
500+
rowind_C = similar(getrowind(_A), nnz_final)
501+
colind_C = similar(getcolind(_A), nnz_final)
502+
nzval_C = similar(nonzeros(_A), Tv, nnz_final)
503+
504+
kernel_compact! = kernel_compact_coo!(backend)
505+
kernel_compact!(
506+
rowind_C,
507+
colind_C,
508+
nzval_C,
509+
rowind_sorted,
510+
colind_sorted,
511+
nzval_sorted,
512+
write_indices,
513+
nnz_concat;
514+
ndrange = (nnz_concat,),
515+
)
516+
517+
return DeviceSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
518+
end
519+
end
520+
end
521+
312522
"""
313523
kron(A::DeviceSparseMatrixCOO, B::DeviceSparseMatrixCOO)
314524

src/matrix_coo/matrix_coo_kernels.jl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,55 @@ end
181181
nzval_C[idx] = val_A * val_B
182182
end
183183
end
184+
185+
# Kernel for marking duplicate entries in sorted COO format
186+
# Returns a mask where mask[i] = true if entry i should be kept (first occurrence or sum)
187+
@kernel inbounds=true function kernel_mark_unique_coo!(
188+
keep_mask,
189+
@Const(rowind),
190+
@Const(colind),
191+
@Const(nnz_total),
192+
)
193+
i = @index(Global)
194+
195+
if i == 1
196+
# Always keep the first entry
197+
keep_mask[i] = true
198+
elseif i <= nnz_total
199+
# Keep if different from previous entry
200+
keep_mask[i] = (rowind[i] != rowind[i-1] || colind[i] != colind[i-1])
201+
end
202+
end
203+
204+
# Kernel for compacting COO by summing duplicate entries
205+
@kernel inbounds=true function kernel_compact_coo!(
206+
rowind_out,
207+
colind_out,
208+
nzval_out,
209+
@Const(rowind_in),
210+
@Const(colind_in),
211+
@Const(nzval_in),
212+
@Const(write_indices),
213+
@Const(nnz_in),
214+
)
215+
i = @index(Global)
216+
217+
if i <= nnz_in
218+
out_idx = write_indices[i]
219+
220+
# If this is a new entry (or first of duplicates), write it
221+
if i == 1 || (rowind_in[i] != rowind_in[i-1] || colind_in[i] != colind_in[i-1])
222+
rowind_out[out_idx] = rowind_in[i]
223+
colind_out[out_idx] = colind_in[i]
224+
225+
# Sum all duplicates
226+
val_sum = nzval_in[i]
227+
j = i + 1
228+
while j <= nnz_in && rowind_in[j] == rowind_in[i] && colind_in[j] == colind_in[i]
229+
val_sum += nzval_in[j]
230+
j += 1
231+
end
232+
nzval_out[out_idx] = val_sum
233+
end
234+
end
235+
end

0 commit comments

Comments
 (0)