Skip to content
Merged
52 changes: 52 additions & 0 deletions benchmarks/conversion_benchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
    benchmark_conversions!(SUITE, array_constructor, array_type_name; N=10000, T=Float64)

Register benchmarks for sparse matrix format conversions (CSC, CSR, COO).

# Arguments
- `SUITE`: The BenchmarkGroup to add benchmarks to
- `array_constructor`: Function to construct arrays (e.g., `Array`, `JLArray`, `CuArray`)
- `array_type_name`: String name for the array type (for display)

# Keyword Arguments
- `N`: Size of the matrix (default: 10000)
- `T`: Element type (default: Float64)
"""
function benchmark_conversions!(
    SUITE,
    array_constructor,
    array_type_name;
    N = 10000,
    T = Float64,
)
    # Random N×N host matrix with 1% density, shared by all conversion benchmarks.
    host_matrix = sprand(T, N, N, 0.01)

    # Wrap the host matrix in each device sparse format, then move each one
    # onto the target backend via `adapt`.
    device_csc = adapt(array_constructor, DeviceSparseMatrixCSC(host_matrix))
    device_csr = adapt(array_constructor, DeviceSparseMatrixCSR(host_matrix))
    device_coo = adapt(array_constructor, DeviceSparseMatrixCOO(host_matrix))

    # One benchmark per supported conversion direction.
    group = SUITE["Format Conversions"][array_type_name]
    group["CSC → COO"] = @benchmarkable DeviceSparseMatrixCOO($device_csc)
    group["COO → CSC"] = @benchmarkable DeviceSparseMatrixCSC($device_coo)
    group["CSR → COO"] = @benchmarkable DeviceSparseMatrixCOO($device_csr)
    group["COO → CSR"] = @benchmarkable DeviceSparseMatrixCSR($device_coo)

    return nothing
end
3 changes: 3 additions & 0 deletions benchmarks/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const SUITE = BenchmarkGroup()
# Include benchmark files
include("vector_benchmarks.jl")
include("matrix_benchmarks.jl")
include("conversion_benchmarks.jl")

# Run benchmarks for CPU (Array)
println("Running benchmarks for CPU (Array)...")
Expand All @@ -21,6 +22,7 @@ benchmark_matrix_vector_mul!(SUITE, Array, "Array")
benchmark_matrix_matrix_mul!(SUITE, Array, "Array")
benchmark_three_arg_dot!(SUITE, Array, "Array")
benchmark_sparse_dense_add!(SUITE, Array, "Array")
benchmark_conversions!(SUITE, Array, "Array")

# Run benchmarks for JLArrays
println("Running benchmarks for JLArrays...")
Expand All @@ -30,6 +32,7 @@ benchmark_matrix_vector_mul!(SUITE, jl, "JLArray")
benchmark_matrix_matrix_mul!(SUITE, jl, "JLArray")
benchmark_three_arg_dot!(SUITE, jl, "JLArray")
benchmark_sparse_dense_add!(SUITE, jl, "JLArray")
benchmark_conversions!(SUITE, jl, "JLArray")

# Tune and run benchmarks
println("\nTuning benchmarks...")
Expand Down
3 changes: 3 additions & 0 deletions src/DeviceSparseArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,7 @@ include("matrix_csr/matrix_csr.jl")
include("matrix_coo/matrix_coo_kernels.jl")
include("matrix_coo/matrix_coo.jl")

include("conversions/conversion_kernels.jl")
include("conversions/conversions.jl")

end # module
35 changes: 35 additions & 0 deletions src/conversions/conversion_kernels.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Kernel for converting CSC to COO format.
#
# Launched with one work-item per column (`ndrange = (n,)`). Each work-item
# expands its column's contiguous slice of the CSC arrays into explicit
# (row, col, value) triplets, writing at the same positions `j` it reads from,
# so the COO output preserves CSC storage order and work-items never overlap.
#
# Arguments:
# - `rowind`, `colind`, `nzval_out`: output COO arrays (length nnz)
# - `colptr`, `rowval`, `nzval_in`: input CSC arrays (read-only via `@Const`)
@kernel inbounds=true function kernel_csc_to_coo!(
    rowind,
    colind,
    nzval_out,
    @Const(colptr),
    @Const(rowval),
    @Const(nzval_in),
)
    # Global index = the column this work-item expands.
    col = @index(Global)

    # Entries of column `col` occupy positions colptr[col]:(colptr[col+1]-1).
    # NOTE(review): `@inbounds` is redundant under `inbounds=true`, but harmless.
    @inbounds for j = colptr[col]:(colptr[col+1]-1)
        rowind[j] = rowval[j]
        colind[j] = col
        nzval_out[j] = nzval_in[j]
    end
end

# Kernel for converting CSR to COO format.
#
# Launched with one work-item per row (`ndrange = (m,)`). Each work-item
# expands its row's contiguous slice of the CSR arrays into explicit
# (row, col, value) triplets, writing at the same positions `j` it reads from,
# so the COO output preserves CSR storage order and work-items never overlap.
#
# Arguments:
# - `rowind`, `colind`, `nzval_out`: output COO arrays (length nnz)
# - `rowptr`, `colval`, `nzval_in`: input CSR arrays (read-only via `@Const`)
@kernel inbounds=true function kernel_csr_to_coo!(
    rowind,
    colind,
    nzval_out,
    @Const(rowptr),
    @Const(colval),
    @Const(nzval_in),
)
    # Global index = the row this work-item expands.
    row = @index(Global)

    # Entries of row `row` occupy positions rowptr[row]:(rowptr[row+1]-1).
    # NOTE(review): `@inbounds` is redundant under `inbounds=true`, but harmless.
    @inbounds for j = rowptr[row]:(rowptr[row+1]-1)
        rowind[j] = row
        colind[j] = colval[j]
        nzval_out[j] = nzval_in[j]
    end
end
250 changes: 250 additions & 0 deletions src/conversions/conversions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
# Conversions between CSC, CSR, and COO sparse matrix formats
# All conversions operate entirely on-device without CPU transfers

# ============================================================================
# CSC ↔ COO Conversions
# ============================================================================

"""
    DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSC)

Convert a Compressed Sparse Column (CSC) matrix to Coordinate (COO) format.

All stored entries are preserved and the conversion runs entirely on the
backend that holds `A`; the resulting COO matrix lives on that same backend.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a CSC matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_csc = DeviceSparseMatrixCSC(A_sparse)

# Convert to COO format
A_coo = DeviceSparseMatrixCOO(A_csc)
```
"""
function DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
    nrows, ncols = size(A)
    nstored = nnz(A)

    dev = get_backend(A.nzval)

    # Output triplet arrays, allocated on the same backend as the input.
    out_rows = similar(A.rowval, Ti, nstored)
    out_cols = similar(A.rowval, Ti, nstored)
    out_vals = similar(A.nzval, Tv, nstored)

    # One work-item per column expands that column's CSC slice into triplets.
    expand! = kernel_csc_to_coo!(dev)
    expand!(out_rows, out_cols, out_vals, A.colptr, A.rowval, A.nzval; ndrange = (ncols,))

    return DeviceSparseMatrixCOO(nrows, ncols, out_rows, out_cols, out_vals)
end

"""
    DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO)

Convert a Coordinate (COO) matrix to Compressed Sparse Column (CSC) format.

The conversion sorts the COO entries by column (then by row within each column)
and builds the column pointer structure. The result maintains backend compatibility
with the input matrix.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a COO matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_coo = DeviceSparseMatrixCOO(A_sparse)

# Convert to CSC format
A_csc = DeviceSparseMatrixCSC(A_coo)
```
"""
function DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
    m, n = size(A)
    nnz_count = nnz(A)

    backend = get_backend(A.nzval)

    # Sort key for column-major order (column first, then row within a column):
    # key = (col - 1) * m + row, which is unique since 1 ≤ row ≤ m. Using `n`
    # as the stride would make key ranges of adjacent columns overlap whenever
    # m > n, interleaving columns after the sort. Keys are computed as `Int`
    # so that m * n cannot overflow a narrow index type `Ti` (e.g. Int32).
    sortkeys = similar(A.rowind, Int, nnz_count)

    # Create keys on device
    @kernel inbounds=true function make_keys!(
        sortkeys,
        @Const(rowind),
        @Const(colind),
        @Const(m)
    )
        i = @index(Global)
        sortkeys[i] = (Int(colind[i]) - 1) * m + Int(rowind[i])
    end

    kernel! = make_keys!(backend)
    kernel!(sortkeys, A.rowind, A.colind, m; ndrange = (nnz_count,))

    # Sort - collect to CPU and use Base.sortperm since AcceleratedKernels
    # doesn't work reliably on all backends (e.g., JLBackend)
    keys_cpu = collect(sortkeys)
    perm_cpu = sortperm(keys_cpu)
    # Adapt back to the original backend
    perm = Adapt.adapt_structure(backend, perm_cpu)

    # Apply permutation to get entries in column-major order
    rowind_sorted = A.rowind[perm]
    colind_sorted = A.colind[perm]
    nzval_sorted = A.nzval[perm]

    # Build colptr on device using a histogram approach
    colptr = similar(A.colind, Ti, n + 1)
    fill!(colptr, zero(Ti))

    # Count entries per column; `@atomic` because multiple work-items may hit
    # the same column counter concurrently.
    @kernel inbounds=true function count_per_col!(colptr, @Const(colind_sorted))
        i = @index(Global)
        col = colind_sorted[i]
        @atomic colptr[col+1] += 1
    end

    kernel! = count_per_col!(backend)
    kernel!(colptr, colind_sorted; ndrange = (nnz_count,))

    # Build cumulative sum on CPU (collect, compute, adapt back); colptr[1] = 1
    # per the standard 1-based CSC convention.
    colptr_cpu = collect(colptr)
    colptr_cpu[1] = 1
    for i = 2:(n+1)
        colptr_cpu[i] += colptr_cpu[i-1]
    end
    colptr = Adapt.adapt_structure(backend, colptr_cpu)

    return DeviceSparseMatrixCSC(m, n, colptr, rowind_sorted, nzval_sorted)
end

# ============================================================================
# CSR ↔ COO Conversions
# ============================================================================

"""
    DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSR)

Convert a Compressed Sparse Row (CSR) matrix to Coordinate (COO) format.

All stored entries are preserved and the conversion runs entirely on the
backend that holds `A`; the resulting COO matrix lives on that same backend.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a CSR matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_csr = DeviceSparseMatrixCSR(A_sparse)

# Convert to COO format
A_coo = DeviceSparseMatrixCOO(A_csr)
```
"""
function DeviceSparseMatrixCOO(A::DeviceSparseMatrixCSR{Tv,Ti}) where {Tv,Ti}
    nrows, ncols = size(A)
    nstored = nnz(A)

    dev = get_backend(A.nzval)

    # Output triplet arrays, allocated on the same backend as the input.
    out_rows = similar(A.colval, Ti, nstored)
    out_cols = similar(A.colval, Ti, nstored)
    out_vals = similar(A.nzval, Tv, nstored)

    # One work-item per row expands that row's CSR slice into triplets.
    expand! = kernel_csr_to_coo!(dev)
    expand!(out_rows, out_cols, out_vals, A.rowptr, A.colval, A.nzval; ndrange = (nrows,))

    return DeviceSparseMatrixCOO(nrows, ncols, out_rows, out_cols, out_vals)
end

"""
    DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO)

Convert a Coordinate (COO) matrix to Compressed Sparse Row (CSR) format.

The conversion sorts the COO entries by row (then by column within each row)
and builds the row pointer structure. The result maintains backend compatibility
with the input matrix.

# Examples
```julia
using DeviceSparseArrays, SparseArrays

# Create a COO matrix
A_sparse = sparse([1, 2, 3], [1, 2, 3], [1.0, 2.0, 3.0], 3, 3)
A_coo = DeviceSparseMatrixCOO(A_sparse)

# Convert to CSR format
A_csr = DeviceSparseMatrixCSR(A_coo)
```
"""
function DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
    m, n = size(A)
    nnz_count = nnz(A)

    backend = get_backend(A.nzval)

    # Sort key for row-major order (row first, then column within a row):
    # key = (row - 1) * n + col, which is unique since 1 ≤ col ≤ n. Using `m`
    # as the stride would make key ranges of adjacent rows overlap whenever
    # n > m, interleaving rows after the sort. Keys are computed as `Int`
    # so that m * n cannot overflow a narrow index type `Ti` (e.g. Int32).
    sortkeys = similar(A.rowind, Int, nnz_count)

    # Create keys on device
    @kernel inbounds=true function make_keys!(
        sortkeys,
        @Const(rowind),
        @Const(colind),
        @Const(n)
    )
        i = @index(Global)
        sortkeys[i] = (Int(rowind[i]) - 1) * n + Int(colind[i])
    end

    kernel! = make_keys!(backend)
    kernel!(sortkeys, A.rowind, A.colind, n; ndrange = (nnz_count,))

    # Sort - collect to CPU and use Base.sortperm since AcceleratedKernels
    # doesn't work reliably on all backends (e.g., JLBackend)
    keys_cpu = collect(sortkeys)
    perm_cpu = sortperm(keys_cpu)
    # Adapt back to the original backend
    perm = Adapt.adapt_structure(backend, perm_cpu)

    # Apply permutation to get entries in row-major order
    rowind_sorted = A.rowind[perm]
    colind_sorted = A.colind[perm]
    nzval_sorted = A.nzval[perm]

    # Build rowptr on device using a histogram approach
    rowptr = similar(A.rowind, Ti, m + 1)
    fill!(rowptr, zero(Ti))

    # Count entries per row; `@atomic` because multiple work-items may hit
    # the same row counter concurrently.
    @kernel inbounds=true function count_per_row!(rowptr, @Const(rowind_sorted))
        i = @index(Global)
        row = rowind_sorted[i]
        @atomic rowptr[row+1] += 1
    end

    kernel! = count_per_row!(backend)
    kernel!(rowptr, rowind_sorted; ndrange = (nnz_count,))

    # Build cumulative sum on CPU (collect, compute, adapt back); rowptr[1] = 1
    # per the standard 1-based CSR convention.
    rowptr_cpu = collect(rowptr)
    rowptr_cpu[1] = 1
    for i = 2:(m+1)
        rowptr_cpu[i] += rowptr_cpu[i-1]
    end
    rowptr = Adapt.adapt_structure(backend, rowptr_cpu)

    return DeviceSparseMatrixCSR(m, n, rowptr, colind_sorted, nzval_sorted)
end
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
DeviceSparseArrays = "da3fe0eb-88a8-4d14-ae1a-857c283e9c70"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
Loading
Loading