Skip to content
Merged
52 changes: 52 additions & 0 deletions benchmarks/conversion_benchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
    benchmark_conversions!(SUITE, array_constructor, array_type_name; N=10000, T=Float64)

Register benchmarks for sparse matrix format conversions (CSC, CSR, COO).

# Arguments
- `SUITE`: The BenchmarkGroup to add benchmarks to
- `array_constructor`: Function to construct arrays (e.g., `Array`, `JLArray`, `CuArray`)
- `array_type_name`: String name for the array type (for display)

# Keyword Arguments
- `N`: Size of the matrix (default: 10000)
- `T`: Element type (default: Float64)
"""
function benchmark_conversions!(
    SUITE,
    array_constructor,
    array_type_name;
    N = 10000,
    T = Float64,
)
    # Random N×N host matrix with 1% density, shared by all conversion benchmarks.
    host_matrix = sprand(T, N, N, 0.01)

    # Wrap the host matrix in each device sparse format, then move each one
    # onto the target backend via `adapt`.
    device_csc = adapt(array_constructor, DeviceSparseMatrixCSC(host_matrix))
    device_csr = adapt(array_constructor, DeviceSparseMatrixCSR(host_matrix))
    device_coo = adapt(array_constructor, DeviceSparseMatrixCOO(host_matrix))

    # One benchmark per supported conversion direction.
    group = SUITE["Format Conversions"][array_type_name]
    group["CSC → COO"] = @benchmarkable DeviceSparseMatrixCOO($device_csc)
    group["COO → CSC"] = @benchmarkable DeviceSparseMatrixCSC($device_coo)
    group["CSR → COO"] = @benchmarkable DeviceSparseMatrixCOO($device_csr)
    group["COO → CSR"] = @benchmarkable DeviceSparseMatrixCSR($device_coo)

    return nothing
end
3 changes: 3 additions & 0 deletions benchmarks/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const SUITE = BenchmarkGroup()
# Include benchmark files
include("vector_benchmarks.jl")
include("matrix_benchmarks.jl")
include("conversion_benchmarks.jl")

# Run benchmarks for CPU (Array)
println("Running benchmarks for CPU (Array)...")
Expand All @@ -21,6 +22,7 @@ benchmark_matrix_vector_mul!(SUITE, Array, "Array")
benchmark_matrix_matrix_mul!(SUITE, Array, "Array")
benchmark_three_arg_dot!(SUITE, Array, "Array")
benchmark_sparse_dense_add!(SUITE, Array, "Array")
benchmark_conversions!(SUITE, Array, "Array")

# Run benchmarks for JLArrays
println("Running benchmarks for JLArrays...")
Expand All @@ -30,6 +32,7 @@ benchmark_matrix_vector_mul!(SUITE, jl, "JLArray")
benchmark_matrix_matrix_mul!(SUITE, jl, "JLArray")
benchmark_three_arg_dot!(SUITE, jl, "JLArray")
benchmark_sparse_dense_add!(SUITE, jl, "JLArray")
benchmark_conversions!(SUITE, jl, "JLArray")

# Tune and run benchmarks
println("\nTuning benchmarks...")
Expand Down
3 changes: 3 additions & 0 deletions src/DeviceSparseArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,7 @@ include("matrix_csr/matrix_csr.jl")
include("matrix_coo/matrix_coo_kernels.jl")
include("matrix_coo/matrix_coo.jl")

include("conversions/conversion_kernels.jl")
include("conversions/conversions.jl")

end # module
35 changes: 35 additions & 0 deletions src/conversions/conversion_kernels.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Kernel for converting CSC to COO format.
#
# Launched with one work-item per column (`ndrange = (n,)`). Each work-item
# expands its column's contiguous slice of the CSC arrays into explicit
# (row, col, value) triplets, writing at the same positions `j` it reads from,
# so the COO output preserves CSC storage order and work-items never overlap.
#
# Arguments:
# - `rowind`, `colind`, `nzval_out`: output COO arrays (length nnz)
# - `colptr`, `rowval`, `nzval_in`: input CSC arrays (read-only via `@Const`)
@kernel inbounds=true function kernel_csc_to_coo!(
    rowind,
    colind,
    nzval_out,
    @Const(colptr),
    @Const(rowval),
    @Const(nzval_in),
)
    # Global index = the column this work-item expands.
    col = @index(Global)

    # Entries of column `col` occupy positions colptr[col]:(colptr[col+1]-1).
    # NOTE(review): `@inbounds` is redundant under `inbounds=true`, but harmless.
    @inbounds for j = colptr[col]:(colptr[col+1]-1)
        rowind[j] = rowval[j]
        colind[j] = col
        nzval_out[j] = nzval_in[j]
    end
end

# Kernel for converting CSR to COO format.
#
# Launched with one work-item per row (`ndrange = (m,)`). Each work-item
# expands its row's contiguous slice of the CSR arrays into explicit
# (row, col, value) triplets, writing at the same positions `j` it reads from,
# so the COO output preserves CSR storage order and work-items never overlap.
#
# Arguments:
# - `rowind`, `colind`, `nzval_out`: output COO arrays (length nnz)
# - `rowptr`, `colval`, `nzval_in`: input CSR arrays (read-only via `@Const`)
@kernel inbounds=true function kernel_csr_to_coo!(
    rowind,
    colind,
    nzval_out,
    @Const(rowptr),
    @Const(colval),
    @Const(nzval_in),
)
    # Global index = the row this work-item expands.
    row = @index(Global)

    # Entries of row `row` occupy positions rowptr[row]:(rowptr[row+1]-1).
    # NOTE(review): `@inbounds` is redundant under `inbounds=true`, but harmless.
    @inbounds for j = rowptr[row]:(rowptr[row+1]-1)
        rowind[j] = row
        colind[j] = colval[j]
        nzval_out[j] = nzval_in[j]
    end
end
250 changes: 250 additions & 0 deletions src/conversions/conversions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
# Conversions between CSC, CSR, and COO sparse matrix formats
# All conversions operate entirely on-device without CPU transfers

# ============================================================================
# CSC ↔ COO Conversions
# ============================================================================

"""
    DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSC)

Convert a Compressed Sparse Column (CSC) matrix to Coordinate (COO) format.

All stored entries are preserved and the conversion runs entirely on the
backend that holds `A`; the resulting COO matrix lives on that same backend.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a CSC matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_csc = DeviceSparseMatrixCSC(A_sparse)

# Convert to COO format
A_coo = DeviceSparseMatrixCOO(A_csc)
```
"""
function DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
    nrows, ncols = size(A)
    nstored = nnz(A)

    dev = get_backend(A.nzval)

    # Output triplet arrays, allocated on the same backend as the input.
    out_rows = similar(A.rowval, Ti, nstored)
    out_cols = similar(A.rowval, Ti, nstored)
    out_vals = similar(A.nzval, Tv, nstored)

    # One work-item per column expands that column's CSC slice into triplets.
    expand! = kernel_csc_to_coo!(dev)
    expand!(out_rows, out_cols, out_vals, A.colptr, A.rowval, A.nzval; ndrange = (ncols,))

    return DeviceSparseMatrixCOO(nrows, ncols, out_rows, out_cols, out_vals)
end

"""
    DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO)

Convert a Coordinate (COO) matrix to Compressed Sparse Column (CSC) format.

The conversion sorts the COO entries by column (then by row within each column)
and builds the column pointer structure. The result maintains backend compatibility
with the input matrix.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a COO matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_coo = DeviceSparseMatrixCOO(A_sparse)

# Convert to CSC format
A_csc = DeviceSparseMatrixCSC(A_coo)
```
"""
function DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
    m, n = size(A)
    nnz_count = nnz(A)

    backend = get_backend(A.nzval)

    # Sort key for column-major order (column first, then row within a column):
    # key = (col - 1) * m + row, which is unique since 1 ≤ row ≤ m. Using `n`
    # as the stride would make key ranges of adjacent columns overlap whenever
    # m > n, interleaving columns after the sort. Keys are computed as `Int`
    # so that m * n cannot overflow a narrow index type `Ti` (e.g. Int32).
    sortkeys = similar(A.rowind, Int, nnz_count)

    # Create keys on device
    @kernel inbounds=true function make_keys!(
        sortkeys,
        @Const(rowind),
        @Const(colind),
        @Const(m)
    )
        i = @index(Global)
        sortkeys[i] = (Int(colind[i]) - 1) * m + Int(rowind[i])
    end

    kernel! = make_keys!(backend)
    kernel!(sortkeys, A.rowind, A.colind, m; ndrange = (nnz_count,))

    # Sort - collect to CPU and use Base.sortperm since AcceleratedKernels
    # doesn't work reliably on all backends (e.g., JLBackend)
    keys_cpu = collect(sortkeys)
    perm_cpu = sortperm(keys_cpu)
    # Adapt back to the original backend
    perm = Adapt.adapt_structure(backend, perm_cpu)

    # Apply permutation to get entries in column-major order
    rowind_sorted = A.rowind[perm]
    colind_sorted = A.colind[perm]
    nzval_sorted = A.nzval[perm]

    # Build colptr on device using a histogram approach
    colptr = similar(A.colind, Ti, n + 1)
    fill!(colptr, zero(Ti))

    # Count entries per column; `@atomic` because multiple work-items may hit
    # the same column counter concurrently.
    @kernel inbounds=true function count_per_col!(colptr, @Const(colind_sorted))
        i = @index(Global)
        col = colind_sorted[i]
        @atomic colptr[col+1] += 1
    end

    kernel! = count_per_col!(backend)
    kernel!(colptr, colind_sorted; ndrange = (nnz_count,))

    # Build cumulative sum on CPU (collect, compute, adapt back); colptr[1] = 1
    # per the standard 1-based CSC convention.
    colptr_cpu = collect(colptr)
    colptr_cpu[1] = 1
    for i = 2:(n+1)
        colptr_cpu[i] += colptr_cpu[i-1]
    end
    colptr = Adapt.adapt_structure(backend, colptr_cpu)

    return DeviceSparseMatrixCSC(m, n, colptr, rowind_sorted, nzval_sorted)
end

# ============================================================================
# CSR ↔ COO Conversions
# ============================================================================

"""
    DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSR)

Convert a Compressed Sparse Row (CSR) matrix to Coordinate (COO) format.

All stored entries are preserved and the conversion runs entirely on the
backend that holds `A`; the resulting COO matrix lives on that same backend.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a CSR matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_csr = DeviceSparseMatrixCSR(A_sparse)

# Convert to COO format
A_coo = DeviceSparseMatrixCOO(A_csr)
```
"""
function DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSR{Tv,Ti}) where {Tv,Ti}
    nrows, ncols = size(A)
    nstored = nnz(A)

    dev = get_backend(A.nzval)

    # Output triplet arrays, allocated on the same backend as the input.
    out_rows = similar(A.colval, Ti, nstored)
    out_cols = similar(A.colval, Ti, nstored)
    out_vals = similar(A.nzval, Tv, nstored)

    # One work-item per row expands that row's CSR slice into triplets.
    expand! = kernel_csr_to_coo!(dev)
    expand!(out_rows, out_cols, out_vals, A.rowptr, A.colval, A.nzval; ndrange = (nrows,))

    return DeviceSparseMatrixCOO(nrows, ncols, out_rows, out_cols, out_vals)
end

"""
    DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO)

Convert a Coordinate (COO) matrix to Compressed Sparse Row (CSR) format.

The conversion sorts the COO entries by row (then by column within each row)
and builds the row pointer structure. The result maintains backend compatibility
with the input matrix.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a COO matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_coo = DeviceSparseMatrixCOO(A_sparse)

# Convert to CSR format
A_csr = DeviceSparseMatrixCSR(A_coo)
```
"""
function DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
    m, n = size(A)
    nnz_count = nnz(A)

    backend = get_backend(A.nzval)

    # Sort key for row-major order (row first, then column within a row):
    # key = (row - 1) * n + col, which is unique since 1 ≤ col ≤ n. Using `m`
    # as the stride would make key ranges of adjacent rows overlap whenever
    # n > m, interleaving rows after the sort. Keys are computed as `Int`
    # so that m * n cannot overflow a narrow index type `Ti` (e.g. Int32).
    sortkeys = similar(A.rowind, Int, nnz_count)

    # Create keys on device
    @kernel inbounds=true function make_keys!(
        sortkeys,
        @Const(rowind),
        @Const(colind),
        @Const(n)
    )
        i = @index(Global)
        sortkeys[i] = (Int(rowind[i]) - 1) * n + Int(colind[i])
    end

    kernel! = make_keys!(backend)
    kernel!(sortkeys, A.rowind, A.colind, n; ndrange = (nnz_count,))

    # Sort - collect to CPU and use Base.sortperm since AcceleratedKernels
    # doesn't work reliably on all backends (e.g., JLBackend)
    keys_cpu = collect(sortkeys)
    perm_cpu = sortperm(keys_cpu)
    # Adapt back to the original backend
    perm = Adapt.adapt_structure(backend, perm_cpu)

    # Apply permutation to get entries in row-major order
    rowind_sorted = A.rowind[perm]
    colind_sorted = A.colind[perm]
    nzval_sorted = A.nzval[perm]

    # Build rowptr on device using a histogram approach
    rowptr = similar(A.rowind, Ti, m + 1)
    fill!(rowptr, zero(Ti))

    # Count entries per row; `@atomic` because multiple work-items may hit
    # the same row counter concurrently.
    @kernel inbounds=true function count_per_row!(rowptr, @Const(rowind_sorted))
        i = @index(Global)
        row = rowind_sorted[i]
        @atomic rowptr[row+1] += 1
    end

    kernel! = count_per_row!(backend)
    kernel!(rowptr, rowind_sorted; ndrange = (nnz_count,))

    # Build cumulative sum on CPU (collect, compute, adapt back); rowptr[1] = 1
    # per the standard 1-based CSR convention.
    rowptr_cpu = collect(rowptr)
    rowptr_cpu[1] = 1
    for i = 2:(m+1)
        rowptr_cpu[i] += rowptr_cpu[i-1]
    end
    rowptr = Adapt.adapt_structure(backend, rowptr_cpu)

    return DeviceSparseMatrixCSR(m, n, rowptr, colind_sorted, nzval_sorted)
end
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
DeviceSparseArrays = "da3fe0eb-88a8-4d14-ae1a-857c283e9c70"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
Loading
Loading