
Commit 41830c2

Remove JLArray dispatch and fix scalar indexing issue
1 parent ebf8d88 commit 41830c2

7 files changed: +128 −139 lines

src/conversions/conversion_kernels.jl

Lines changed: 2 additions & 2 deletions
@@ -60,12 +60,12 @@ end
 @kernel inbounds=true function kernel_count_per_col!(colptr, @Const(colind_sorted))
     i = @index(Global)
     col = colind_sorted[i]
-    @atomic colptr[col + 1] += 1
+    @atomic colptr[col+1] += 1
 end

 # Kernel for counting entries per row (for COO → CSR)
 @kernel inbounds=true function kernel_count_per_row!(rowptr, @Const(rowind_sorted))
     i = @index(Global)
     row = rowind_sorted[i]
-    @atomic rowptr[row + 1] += 1
+    @atomic rowptr[row+1] += 1
 end
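
These two kernels are per-entry histograms: each work-item reads one sorted COO index and atomically bumps the counter for its column (or row), offset by one slot so that a later cumulative sum can turn the counts into a pointer array. A serial Julia sketch of the same computation (illustration only, not code from this package):

function count_per_col_serial(colind_sorted::AbstractVector{<:Integer}, n::Integer)
    # counts[c + 1] holds the number of nonzeros in column c; counts[1] is
    # left free so a cumulative sum yields CSC column pointers directly.
    counts = zeros(Int, n + 1)
    for i in eachindex(colind_sorted)    # on device: i = @index(Global)
        col = colind_sorted[i]
        counts[col + 1] += 1             # on device: @atomic, since many
    end                                  # work-items may hit the same column
    return counts
end

count_per_col_serial([1, 1, 2, 3, 3], 3)   # => [0, 2, 1, 2]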

src/conversions/conversions.jl

Lines changed: 10 additions & 52 deletions
@@ -1,8 +1,5 @@
 # Conversions between CSC, CSR, and COO sparse matrix formats
-# All conversions operate on-device, with CPU fallback only for JLBackend
-
-# Helper function to check if backend is JLBackend (which doesn't support AcceleratedKernels)
-_is_jlbackend(backend) = string(typeof(backend)) == "JLBackend"
+# All conversions operate on-device

 # ============================================================================
 # CSC ↔ COO Conversions
@@ -80,16 +77,7 @@ function DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
     kernel! = kernel_make_csc_keys!(backend)
     kernel!(keys, A.rowind, A.colind, n; ndrange = (nnz_count,))

-    # Sort - use AcceleratedKernels for GPU, CPU fallback for JLBackend
-    if _is_jlbackend(backend)
-        # JLBackend doesn't support AcceleratedKernels - use CPU fallback
-        keys_cpu = collect(keys)
-        perm_cpu = sortperm(keys_cpu)
-        perm = Adapt.adapt_structure(backend, perm_cpu)
-    else
-        # Use AcceleratedKernels for GPU and standard CPU backends
-        perm = AcceleratedKernels.sortperm(keys)
-    end
+    perm = AcceleratedKernels.sortperm(keys)

     # Apply permutation to get sorted arrays
     rowind_sorted = A.rowind[perm]
@@ -104,20 +92,9 @@ function DeviceSparseMatrixCSC(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
     kernel! = kernel_count_per_col!(backend)
     kernel!(colptr, colind_sorted; ndrange = (nnz_count,))

-    # Compute cumulative sum - use CPU fallback for JLBackend
-    if _is_jlbackend(backend) || backend isa KernelAbstractions.CPU
-        # For CPU-like backends, use CPU cumsum
-        colptr_cpu = collect(colptr)
-        colptr_cpu[1] = 1
-        for i = 2:(n + 1)
-            colptr_cpu[i] += colptr_cpu[i - 1]
-        end
-        colptr = Adapt.adapt_structure(backend, colptr_cpu)
-    else
-        # For GPU backends, use AcceleratedKernels scan
-        colptr[1] = 1
-        colptr[2:end] .= AcceleratedKernels.cumsum(colptr[2:end]) .+ 1
-    end
+    # Compute cumulative sum
+    allowed_setindex!(colptr, 1, 1) # TODO: Is there a better way to do this?
+    colptr[2:end] .= AcceleratedKernels.cumsum(colptr[2:end]) .+ 1

     return DeviceSparseMatrixCSC(m, n, colptr, rowind_sorted, nzval_sorted)
 end
@@ -198,16 +175,8 @@ function DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
     kernel! = kernel_make_csr_keys!(backend)
     kernel!(keys, A.rowind, A.colind, m; ndrange = (nnz_count,))

-    # Sort - use AcceleratedKernels for GPU, CPU fallback for JLBackend
-    if _is_jlbackend(backend)
-        # JLBackend doesn't support AcceleratedKernels - use CPU fallback
-        keys_cpu = collect(keys)
-        perm_cpu = sortperm(keys_cpu)
-        perm = Adapt.adapt_structure(backend, perm_cpu)
-    else
-        # Use AcceleratedKernels for GPU and standard CPU backends
-        perm = AcceleratedKernels.sortperm(keys)
-    end
+    # Sort - use AcceleratedKernels
+    perm = AcceleratedKernels.sortperm(keys)

     # Apply permutation to get sorted arrays
     rowind_sorted = A.rowind[perm]
@@ -222,20 +191,9 @@ function DeviceSparseMatrixCSR(A::DeviceSparseMatrixCOO{Tv,Ti}) where {Tv,Ti}
     kernel! = kernel_count_per_row!(backend)
     kernel!(rowptr, rowind_sorted; ndrange = (nnz_count,))

-    # Compute cumulative sum - use CPU fallback for JLBackend
-    if _is_jlbackend(backend) || backend isa KernelAbstractions.CPU
-        # For CPU-like backends, use CPU cumsum
-        rowptr_cpu = collect(rowptr)
-        rowptr_cpu[1] = 1
-        for i = 2:(m + 1)
-            rowptr_cpu[i] += rowptr_cpu[i - 1]
-        end
-        rowptr = Adapt.adapt_structure(backend, rowptr_cpu)
-    else
-        # For GPU backends, use AcceleratedKernels scan
-        rowptr[1] = 1
-        rowptr[2:end] .= AcceleratedKernels.cumsum(rowptr[2:end]) .+ 1
-    end
+    # Compute cumulative sum
+    allowed_setindex!(rowptr, 1, 1) # TODO: Is there a better way to do this?
+    rowptr[2:end] .= AcceleratedKernels.cumsum(rowptr[2:end]) .+ 1

     return DeviceSparseMatrixCSR(m, n, rowptr, colind_sorted, nzval_sorted)
 end
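
The rewritten pointer construction is the standard counts-to-offsets step: after the counting kernel, colptr[k + 1] holds the number of entries in column k, and a cumulative sum converts those counts into CSC column offsets. Writing the leading 1 is a single scalar setindex!, which GPU array types reject by default (the scalar indexing issue named in the commit title), so the code routes that one write through allowed_setindex!. A worked example on a plain Vector, with hypothetical counts; on device the first assignment is the allowed_setindex! call and the broadcast uses AcceleratedKernels.cumsum:

# Suppose a 3-column matrix whose columns hold 2, 1, and 2 nonzeros;
# the counting kernel then leaves colptr = [0, 2, 1, 2].
colptr = [0, 2, 1, 2]
colptr[1] = 1                                # pointers are 1-based
colptr[2:end] .= cumsum(colptr[2:end]) .+ 1  # => [1, 3, 4, 6]
# Column j now occupies nzval[colptr[j]:colptr[j+1]-1]:
# column 1 -> 1:2, column 2 -> 3:3, column 3 -> 4:5.
@assert colptr == [1, 3, 4, 6]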

test/Project.toml

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-DeviceSparseArrays = "da3fe0eb-88a8-4d14-ae1a-857c283e9c70"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

test/cuda/cuda.jl

Lines changed: 7 additions & 0 deletions
@@ -27,4 +27,11 @@
         (Float32, Float64),
         (ComplexF32, ComplexF64),
     )
+    shared_test_conversions(
+        CuArray,
+        "CUDA",
+        (Int32, Int64),
+        (Float32, Float64),
+        (ComplexF32, ComplexF64),
+    )
 end

test/metal/metal.jl

Lines changed: 1 addition & 0 deletions
@@ -3,4 +3,5 @@
     shared_test_matrix_csc(MtlArray, "Metal", (Int32,), (Float32,), (ComplexF32,))
     shared_test_matrix_csr(MtlArray, "Metal", (Int32,), (Float32,), (ComplexF32,))
     shared_test_matrix_coo(MtlArray, "Metal", (Int32,), (Float32,), (ComplexF32,))
+    shared_test_conversions(MtlArray, "Metal", (Int32,), (Float32,), (ComplexF32,))
 end

test/reactant/reactant.jl

Lines changed: 7 additions & 0 deletions
@@ -27,4 +27,11 @@
         (Float32, Float64),
         (ComplexF32, ComplexF64),
     )
+    shared_test_conversions(
+        Reactant.ConcreteRArray,
+        "Reactant",
+        (Int32, Int64),
+        (Float32, Float64),
+        (ComplexF32, ComplexF64),
+    )
 end

test/shared/conversions.jl

Lines changed: 101 additions & 84 deletions
@@ -6,90 +6,107 @@ function shared_test_conversions(
     complex_types::Tuple,
 )
     @testset "Format Conversions $array_type" verbose=true begin
-        # Test CSC → COO → CSC round-trip
-        @testset "CSC ↔ COO" begin
-            A = sparse([1, 2, 3, 1, 2], [1, 2, 3, 2, 3], float_types[end][1.0, 2.0, 3.0, 4.0, 5.0], 3, 3)
-
-            # CSC → COO
-            A_csc = adapt(op, DeviceSparseMatrixCSC(A))
-            A_coo_from_csc = DeviceSparseMatrixCOO(A_csc)
-            @test collect(SparseMatrixCSC(A_coo_from_csc)) ≈ collect(A)
-
-            # COO → CSC
-            A_coo = adapt(op, DeviceSparseMatrixCOO(A))
-            A_csc_from_coo = DeviceSparseMatrixCSC(A_coo)
-            @test collect(SparseMatrixCSC(A_csc_from_coo)) ≈ collect(A)
-
-            # Round-trip
-            A_csc_roundtrip = DeviceSparseMatrixCSC(DeviceSparseMatrixCOO(A_csc))
-            @test collect(SparseMatrixCSC(A_csc_roundtrip)) ≈ collect(A)
-        end
-
-        # Test CSR → COO → CSR round-trip
-        @testset "CSR ↔ COO" begin
-            A = sparse([1, 2, 3, 1, 2], [1, 2, 3, 2, 3], float_types[end][1.0, 2.0, 3.0, 4.0, 5.0], 3, 3)
-
-            # CSR → COO
-            A_csr = adapt(op, DeviceSparseMatrixCSR(A))
-            A_coo_from_csr = DeviceSparseMatrixCOO(A_csr)
-            @test collect(SparseMatrixCSC(A_coo_from_csr)) ≈ collect(A)
-
-            # COO → CSR
-            A_coo = adapt(op, DeviceSparseMatrixCOO(A))
-            A_csr_from_coo = DeviceSparseMatrixCSR(A_coo)
-            @test collect(SparseMatrixCSC(A_csr_from_coo)) ≈ collect(A)
-
-            # Round-trip
-            A_csr_roundtrip = DeviceSparseMatrixCSR(DeviceSparseMatrixCOO(A_csr))
-            @test collect(SparseMatrixCSC(A_csr_roundtrip)) ≈ collect(A)
-        end
-
-        # Test with different data types
-        @testset "Different Types" begin
-            # Test with Float32
-            A_f32 = sparse([1, 2], [1, 2], float_types[1][1.0f0, 2.0f0], 2, 2)
-            A_csc_f32 = adapt(op, DeviceSparseMatrixCSC(A_f32))
-            A_coo_f32 = DeviceSparseMatrixCOO(A_csc_f32)
-            @test collect(SparseMatrixCSC(A_coo_f32)) ≈ collect(A_f32)
-
-            # Test with ComplexF64
-            A_c64 = sparse([1, 2], [1, 2], complex_types[end][1.0+im, 2.0-im], 2, 2)
-            A_csr_c64 = adapt(op, DeviceSparseMatrixCSR(A_c64))
-            A_coo_c64 = DeviceSparseMatrixCOO(A_csr_c64)
-            @test collect(SparseMatrixCSC(A_coo_c64)) ≈ collect(A_c64)
-        end
-
-        # Test with empty matrices
-        @testset "Edge Cases" begin
-            # Empty matrix
-            A_empty = spzeros(float_types[end], 3, 3)
-            A_csc_empty = adapt(op, DeviceSparseMatrixCSC(A_empty))
-            A_coo_empty = DeviceSparseMatrixCOO(A_csc_empty)
-            @test nnz(A_coo_empty) == 0
-            @test size(A_coo_empty) == (3, 3)
-
-            # Single element
-            A_single = sparse([1], [1], float_types[end][42.0], 1, 1)
-            A_csr_single = adapt(op, DeviceSparseMatrixCSR(A_single))
-            A_coo_single = DeviceSparseMatrixCOO(A_csr_single)
-            @test collect(SparseMatrixCSC(A_coo_single)) ≈ collect(A_single)
-        end
-
-        # Test large matrix conversion
-        @testset "Large Matrix" begin
-            A_large = sprand(float_types[end], 100, 100, 0.05)
-
-            # CSC → COO → CSC
-            A_csc_large = adapt(op, DeviceSparseMatrixCSC(A_large))
-            A_coo_large = DeviceSparseMatrixCOO(A_csc_large)
-            A_csc_back = DeviceSparseMatrixCSC(A_coo_large)
-            @test collect(SparseMatrixCSC(A_csc_back)) ≈ collect(A_large)
-
-            # CSR → COO → CSR
-            A_csr_large = adapt(op, DeviceSparseMatrixCSR(A_large))
-            A_coo_large2 = DeviceSparseMatrixCOO(A_csr_large)
-            A_csr_back = DeviceSparseMatrixCSR(A_coo_large2)
-            @test collect(SparseMatrixCSC(A_csr_back)) ≈ collect(A_large)
+        # Many conversion functions rely on AcceleratedKernels sortperm
+        # which is not supported on JLBackend. Therefore, we skip conversion
+        # tests for JLArray.
+        if array_type != "JLArray"
+            # Test CSC → COO → CSC round-trip
+            @testset "CSC ↔ COO" begin
+                A = sparse(
+                    [1, 2, 3, 1, 2],
+                    [1, 2, 3, 2, 3],
+                    float_types[end][1.0, 2.0, 3.0, 4.0, 5.0],
+                    3,
+                    3,
+                )
+
+                # CSC → COO
+                A_csc = adapt(op, DeviceSparseMatrixCSC(A))
+                A_coo_from_csc = DeviceSparseMatrixCOO(A_csc)
+                @test collect(SparseMatrixCSC(A_coo_from_csc)) ≈ collect(A)
+
+                # COO → CSC
+                A_coo = adapt(op, DeviceSparseMatrixCOO(A))
+                A_csc_from_coo = DeviceSparseMatrixCSC(A_coo)
+                @test collect(SparseMatrixCSC(A_csc_from_coo)) ≈ collect(A)
+
+                # Round-trip
+                A_csc_roundtrip = DeviceSparseMatrixCSC(DeviceSparseMatrixCOO(A_csc))
+                @test collect(SparseMatrixCSC(A_csc_roundtrip)) ≈ collect(A)
+            end
+
+            # Test CSR → COO → CSR round-trip
+            @testset "CSR ↔ COO" begin
+                A = sparse(
+                    [1, 2, 3, 1, 2],
+                    [1, 2, 3, 2, 3],
+                    float_types[end][1.0, 2.0, 3.0, 4.0, 5.0],
+                    3,
+                    3,
+                )
+
+                # CSR → COO
+                A_csr = adapt(op, DeviceSparseMatrixCSR(A))
+                A_coo_from_csr = DeviceSparseMatrixCOO(A_csr)
+                @test collect(SparseMatrixCSC(A_coo_from_csr)) ≈ collect(A)
+
+                # COO → CSR
+                A_coo = adapt(op, DeviceSparseMatrixCOO(A))
+                A_csr_from_coo = DeviceSparseMatrixCSR(A_coo)
+                @test collect(SparseMatrixCSC(A_csr_from_coo)) ≈ collect(A)
+
+                # Round-trip
+                A_csr_roundtrip = DeviceSparseMatrixCSR(DeviceSparseMatrixCOO(A_csr))
+                @test collect(SparseMatrixCSC(A_csr_roundtrip)) ≈ collect(A)
+            end
+
+            # Test with different data types
+            @testset "Different Types" begin
+                # Test with Float32
+                A_f32 = sparse([1, 2], [1, 2], float_types[1][1.0f0, 2.0f0], 2, 2)
+                A_csc_f32 = adapt(op, DeviceSparseMatrixCSC(A_f32))
+                A_coo_f32 = DeviceSparseMatrixCOO(A_csc_f32)
+                @test collect(SparseMatrixCSC(A_coo_f32)) ≈ collect(A_f32)
+
+                # Test with ComplexF64
+                A_c64 = sparse([1, 2], [1, 2], complex_types[end][1.0+im, 2.0-im], 2, 2)
+                A_csr_c64 = adapt(op, DeviceSparseMatrixCSR(A_c64))
+                A_coo_c64 = DeviceSparseMatrixCOO(A_csr_c64)
+                @test collect(SparseMatrixCSC(A_coo_c64)) ≈ collect(A_c64)
+            end
+
+            # Test with empty matrices
+            @testset "Edge Cases" begin
+                # Empty matrix
+                A_empty = spzeros(float_types[end], 3, 3)
+                A_csc_empty = adapt(op, DeviceSparseMatrixCSC(A_empty))
+                A_coo_empty = DeviceSparseMatrixCOO(A_csc_empty)
+                @test nnz(A_coo_empty) == 0
+                @test size(A_coo_empty) == (3, 3)
+
+                # Single element
+                A_single = sparse([1], [1], float_types[end][42.0], 1, 1)
+                A_csr_single = adapt(op, DeviceSparseMatrixCSR(A_single))
+                A_coo_single = DeviceSparseMatrixCOO(A_csr_single)
+                @test collect(SparseMatrixCSC(A_coo_single)) ≈ collect(A_single)
+            end
+
+            # Test large matrix conversion
+            @testset "Large Matrix" begin
+                A_large = sprand(float_types[end], 100, 100, 0.05)
+
+                # CSC → COO → CSC
+                A_csc_large = adapt(op, DeviceSparseMatrixCSC(A_large))
+                A_coo_large = DeviceSparseMatrixCOO(A_csc_large)
+                A_csc_back = DeviceSparseMatrixCSC(A_coo_large)
+                @test collect(SparseMatrixCSC(A_csc_back)) ≈ collect(A_large)
+
+                # CSR → COO → CSR
+                A_csr_large = adapt(op, DeviceSparseMatrixCSR(A_large))
+                A_coo_large2 = DeviceSparseMatrixCOO(A_csr_large)
+                A_csr_back = DeviceSparseMatrixCSR(A_coo_large2)
+                @test collect(SparseMatrixCSC(A_csr_back)) ≈ collect(A_large)
+            end
         end
     end
 end
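
Each backend test suite opts into the shared conversion tests with a single call, as the CUDA, Metal, and Reactant files above show. A new backend would be wired in the same way; a sketch patterned on test/metal/metal.jl, where oneArray and "oneAPI" are hypothetical placeholders and not part of this commit:

# Hypothetical backend test file; oneArray / "oneAPI" are illustrative only.
shared_test_matrix_csc(oneArray, "oneAPI", (Int32,), (Float32,), (ComplexF32,))
shared_test_matrix_csr(oneArray, "oneAPI", (Int32,), (Float32,), (ComplexF32,))
shared_test_matrix_coo(oneArray, "oneAPI", (Int32,), (Float32,), (ComplexF32,))
shared_test_conversions(oneArray, "oneAPI", (Int32,), (Float32,), (ComplexF32,))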
