
Commit 9bb1350

Add GPU backend synchronization to benchmarks for accurate timing (#24)

1 parent a011ca4 commit 9bb1350

File tree

7 files changed: +155 −41 lines changed

benchmarks/Project.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,5 +3,6 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DeviceSparseArrays = "da3fe0eb-88a8-4d14-ae1a-857c283e9c70"
 JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
```

benchmarks/README.md

Lines changed: 37 additions & 4 deletions
````diff
@@ -6,8 +6,10 @@ This directory contains benchmark tracking for the DeviceSparseArrays.jl package
 
 - `Project.toml`: Dependencies for running benchmarks
 - `runbenchmarks.jl`: Main script that runs all benchmarks
+- `benchmark_utils.jl`: Utility functions for benchmarking (synchronization helpers)
 - `vector_benchmarks.jl`: Benchmarks for sparse vector operations
 - `matrix_benchmarks.jl`: Benchmarks for sparse matrix operations
+- `conversion_benchmarks.jl`: Benchmarks for format conversion operations
 
 ## Benchmarks Tracked
 
@@ -23,6 +25,11 @@ All matrix operations are benchmarked for CSC, CSR, and COO formats to compare t
 - **Matrix-Vector Multiplication**: `mul!(y, A, x)` for sparse matrix A and dense vectors x, y
 - **Matrix-Matrix Multiplication**: `mul!(C, A, B)` for sparse matrix A and dense matrix B
 - **Three-argument dot**: `dot(x, A, y)` for sparse matrix A and dense vectors x, y
+- **Sparse + Dense Addition**: `A + B` for sparse matrix A and dense matrix B
+
+### Format Conversions
+- **CSC ↔ COO**: Conversions between Compressed Sparse Column and Coordinate formats
+- **CSR ↔ COO**: Conversions between Compressed Sparse Row and Coordinate formats
 
 ## Array Types
 
@@ -88,20 +95,46 @@ To add new benchmarks:
         SUITE[group_name] = BenchmarkGroup()
     end
 
-    SUITE[group_name]["Test Case [$array_type_name]"] =
-        @benchmarkable operation($adapted_data)
+    # IMPORTANT: Wrap operations with synchronization for accurate GPU timing
+    SUITE[group_name]["Test Case [$array_type_name]"] = @benchmarkable begin
+        operation($adapted_data)
+        _synchronize_backend($adapted_data)
+    end
 
     return nothing
 end
 ```
 3. Call your function in `runbenchmarks.jl` for each array type
 4. Test locally with `make benchmark`
 
+## GPU Synchronization
+
+All benchmarks include backend synchronization to ensure accurate timing on GPU backends. GPU operations are often asynchronous, meaning they may return before the computation completes. Without synchronization, benchmarks would underestimate the actual execution time.
+
+The `_synchronize_backend(arr)` helper function:
+- Calls `KernelAbstractions.synchronize(get_backend(arr))` for arrays supporting KernelAbstractions
+- Is a no-op for CPU arrays and arrays without KernelAbstractions support
+- Safely handles any array type, even those without `get_backend` defined
+
+This approach works for:
+- **CPU arrays**: No synchronization needed (no-op)
+- **GPU arrays with KernelAbstractions**: Proper synchronization
+- **Other array types**: Gracefully degrades to no-op
+
+All benchmarks follow the pattern:
+```julia
+@benchmarkable begin
+    my_operation(...)
+    _synchronize_backend($some_array)
+end
+```
+
 ## Notes
 
-- Benchmarks use `BLAS.set_num_threads(1)` to ensure consistent results
-- Default parameters: N=10000, T=Float64, 5% sparsity
+- Benchmarks use `BLAS.set_num_threads(2)` to ensure consistent results
+- Default parameters: N=10000, T=Float64, 1% sparsity
 - Parameters can be customized via keyword arguments
 - Array types are detected automatically (JLArrays is optional)
 - Results are saved in JSON format compatible with github-action-benchmark
 - CUDA benchmarks are not included as GitHub Actions runners don't have GPU support
+- All benchmarks include backend synchronization for accurate GPU timing (see "GPU Synchronization" section)
````
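The asynchrony pitfall described in the "GPU Synchronization" section can be illustrated in plain Julia without a GPU. In this hypothetical sketch, a `Task` stands in for an asynchronous kernel launch and `wait` plays the role of `_synchronize_backend`; neither function below is part of the benchmark suite.

```julia
# Hypothetical illustration: a Task stands in for an async GPU kernel launch,
# and wait() plays the role of KernelAbstractions.synchronize.
function elapsed_without_sync()
    t0 = time_ns()
    task = @async sleep(0.2)          # "launch" returns almost immediately
    dt = (time_ns() - t0) / 1e9       # timer stops before the work finishes
    wait(task)                        # clean up outside the timed region
    return dt
end

function elapsed_with_sync()
    t0 = time_ns()
    task = @async sleep(0.2)
    wait(task)                        # barrier, like _synchronize_backend(arr)
    return (time_ns() - t0) / 1e9
end

println(elapsed_without_sync())       # tiny: measures only the launch cost
println(elapsed_with_sync())          # ≈ 0.2 s: launch plus completion
```

Without the barrier, the "benchmark" reports only launch overhead, which is exactly the underestimate the synchronization wrappers guard against.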

benchmarks/benchmark_utils.jl

Lines changed: 39 additions & 0 deletions
````diff
@@ -0,0 +1,39 @@
+"""
+    _synchronize_backend(arr)
+
+Synchronize the backend associated with array `arr` to ensure all operations
+have completed before benchmarking continues. This is essential for accurate
+GPU timing.
+
+# Implementation
+This function uses multiple dispatch to handle different array types:
+- For arrays with KernelAbstractions backends, it calls `synchronize` on the backend
+- For other array types, it is a no-op (fallback method)
+- New array types can extend this function by adding methods for specific types
+
+# Examples
+```julia
+# GPU array with KernelAbstractions - will synchronize
+gpu_arr = adapt(CuArray, DeviceSparseVector(...))
+_synchronize_backend(gpu_arr)
+
+# CPU array or arrays without KernelAbstractions - no-op
+cpu_arr = DeviceSparseVector(...)
+_synchronize_backend(cpu_arr)
+
+# Extend for custom array types:
+# _synchronize_backend(arr::MyCustomArray) = my_custom_sync(arr)
+```
+"""
+_synchronize_backend(arr) = nothing # Fallback: no-op for arrays without KernelAbstractions
+
+"""
+    _synchronize_backend(arr::AbstractDeviceSparseArray)
+
+Synchronize KernelAbstractions backend for DeviceSparseArray types.
+"""
+function _synchronize_backend(arr::AbstractDeviceSparseArray)
+    backend = KernelAbstractions.get_backend(arr)
+    KernelAbstractions.synchronize(backend)
+    return nothing
+end
````
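The fallback-plus-specific-method design in `benchmark_utils.jl` can be demonstrated standalone. `MockDeviceArray` and `_sync_backend` below are hypothetical stand-ins (returning symbols so the dispatch is observable), not the package's actual helper, which returns `nothing` in both cases.

```julia
# Standalone sketch of the no-op-fallback dispatch pattern.
# MockDeviceArray is a hypothetical stand-in for a GPU-backed array type.
_sync_backend(arr) = :noop            # fallback: safe no-op for any array

struct MockDeviceArray
    data::Vector{Float64}
end

# Specific method, analogous to the AbstractDeviceSparseArray method above
_sync_backend(arr::MockDeviceArray) = :synchronized

@assert _sync_backend(rand(3)) === :noop                       # plain Array hits the fallback
@assert _sync_backend(MockDeviceArray(rand(3))) === :synchronized
```

Because the fallback accepts any argument, calling the helper is always safe, and new backends opt in simply by adding a method for their own type.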

benchmarks/conversion_benchmarks.jl

Lines changed: 16 additions & 8 deletions
```diff
@@ -33,20 +33,28 @@ function benchmark_conversions!(
     dsm_coo = adapt(array_constructor, sm_coo)
 
     # CSC → COO conversion
-    SUITE["Format Conversions"][array_type_name]["CSC → COO"] =
-        @benchmarkable DeviceSparseMatrixCOO($dsm_csc)
+    SUITE["Format Conversions"][array_type_name]["CSC → COO"] = @benchmarkable begin
+        DeviceSparseMatrixCOO($dsm_csc)
+        _synchronize_backend($dsm_csc)
+    end
 
     # COO → CSC conversion
-    SUITE["Format Conversions"][array_type_name]["COO → CSC"] =
-        @benchmarkable DeviceSparseMatrixCSC($dsm_coo)
+    SUITE["Format Conversions"][array_type_name]["COO → CSC"] = @benchmarkable begin
+        DeviceSparseMatrixCSC($dsm_coo)
+        _synchronize_backend($dsm_coo)
+    end
 
     # CSR → COO conversion
-    SUITE["Format Conversions"][array_type_name]["CSR → COO"] =
-        @benchmarkable DeviceSparseMatrixCOO($dsm_csr)
+    SUITE["Format Conversions"][array_type_name]["CSR → COO"] = @benchmarkable begin
+        DeviceSparseMatrixCOO($dsm_csr)
+        _synchronize_backend($dsm_csr)
+    end
 
     # COO → CSR conversion
-    SUITE["Format Conversions"][array_type_name]["COO → CSR"] =
-        @benchmarkable DeviceSparseMatrixCSR($dsm_coo)
+    SUITE["Format Conversions"][array_type_name]["COO → CSR"] = @benchmarkable begin
+        DeviceSparseMatrixCSR($dsm_coo)
+        _synchronize_backend($dsm_coo)
+    end
 
     return nothing
 end
```
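For reference, a COO → CSC conversion like the one timed here amounts to a counting sort on column indices. The function below is a minimal pure-Julia sketch assuming 1-based, in-bounds indices; it is not the package's kernel-based implementation, and it keeps entries in input order within each column rather than sorting by row.

```julia
# Build 1-based CSC arrays (colptr, rowval, nzval) from COO triplets
# over an n-column matrix.
function coo_to_csc(n, rows, cols, vals)
    colptr = zeros(Int, n + 1)
    for c in cols                     # count entries per column
        colptr[c + 1] += 1
    end
    colptr[1] = 1
    for j in 2:(n + 1)                # prefix sum -> column start offsets
        colptr[j] += colptr[j - 1]
    end
    rowval = similar(rows)
    nzval = similar(vals)
    next = copy(colptr)               # next free slot in each column
    for k in eachindex(vals)          # scatter entries into column order
        p = next[cols[k]]
        rowval[p] = rows[k]
        nzval[p] = vals[k]
        next[cols[k]] += 1
    end
    return colptr, rowval, nzval
end

# Entries (1,1)=10, (2,1)=20, (1,2)=30 of a 2×2 matrix:
colptr, rowval, nzval = coo_to_csc(2, [1, 2, 1], [1, 1, 2], [10.0, 20.0, 30.0])
# colptr == [1, 3, 4]; rowval == [1, 2, 1]; nzval == [10.0, 20.0, 30.0]
```

The cost is O(nnz + n), which is why these conversions are cheap enough to benchmark alongside the arithmetic kernels.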

benchmarks/matrix_benchmarks.jl

Lines changed: 50 additions & 26 deletions
```diff
@@ -37,14 +37,20 @@ function benchmark_matrix_vector_mul!(
     x_vec = adapt(array_constructor, randn(T, N))
 
     # Level 3: Format (CSC, CSR, COO - will be plotted together)
-    SUITE["Matrix-Vector Multiplication"][array_type_name]["CSC"] =
-        @benchmarkable mul!($vec, $dsm_csc, $x_vec)
+    SUITE["Matrix-Vector Multiplication"][array_type_name]["CSC"] = @benchmarkable begin
+        mul!($vec, $dsm_csc, $x_vec)
+        _synchronize_backend($dsm_csc)
+    end
 
-    SUITE["Matrix-Vector Multiplication"][array_type_name]["CSR"] =
-        @benchmarkable mul!($vec, $dsm_csr, $x_vec)
+    SUITE["Matrix-Vector Multiplication"][array_type_name]["CSR"] = @benchmarkable begin
+        mul!($vec, $dsm_csr, $x_vec)
+        _synchronize_backend($dsm_csr)
+    end
 
-    SUITE["Matrix-Vector Multiplication"][array_type_name]["COO"] =
-        @benchmarkable mul!($vec, $dsm_coo, $x_vec)
+    SUITE["Matrix-Vector Multiplication"][array_type_name]["COO"] = @benchmarkable begin
+        mul!($vec, $dsm_coo, $x_vec)
+        _synchronize_backend($dsm_coo)
+    end
 
     return nothing
 end
@@ -91,14 +97,20 @@ function benchmark_matrix_matrix_mul!(
     result_mat = adapt(array_constructor, zeros(T, N, M))
 
     # Level 3: Format (CSC, CSR, COO - will be plotted together)
-    SUITE["Matrix-Matrix Multiplication"][array_type_name]["CSC"] =
-        @benchmarkable mul!($result_mat, $dsm_csc, $mat)
+    SUITE["Matrix-Matrix Multiplication"][array_type_name]["CSC"] = @benchmarkable begin
+        mul!($result_mat, $dsm_csc, $mat)
+        _synchronize_backend($dsm_csc)
+    end
 
-    SUITE["Matrix-Matrix Multiplication"][array_type_name]["CSR"] =
-        @benchmarkable mul!($result_mat, $dsm_csr, $mat)
+    SUITE["Matrix-Matrix Multiplication"][array_type_name]["CSR"] = @benchmarkable begin
+        mul!($result_mat, $dsm_csr, $mat)
+        _synchronize_backend($dsm_csr)
+    end
 
-    SUITE["Matrix-Matrix Multiplication"][array_type_name]["COO"] =
-        @benchmarkable mul!($result_mat, $dsm_coo, $mat)
+    SUITE["Matrix-Matrix Multiplication"][array_type_name]["COO"] = @benchmarkable begin
+        mul!($result_mat, $dsm_coo, $mat)
+        _synchronize_backend($dsm_coo)
+    end
 
     return nothing
 end
@@ -142,14 +154,20 @@ function benchmark_three_arg_dot!(
     y_vec = adapt(array_constructor, randn(T, N))
 
     # Level 3: Format (CSC, CSR, COO - will be plotted together)
-    SUITE["Three-argument dot"][array_type_name]["CSC"] =
-        @benchmarkable dot($x_vec, $dsm_csc, $y_vec)
+    SUITE["Three-argument dot"][array_type_name]["CSC"] = @benchmarkable begin
+        dot($x_vec, $dsm_csc, $y_vec)
+        _synchronize_backend($dsm_csc)
+    end
 
-    SUITE["Three-argument dot"][array_type_name]["CSR"] =
-        @benchmarkable dot($x_vec, $dsm_csr, $y_vec)
+    SUITE["Three-argument dot"][array_type_name]["CSR"] = @benchmarkable begin
+        dot($x_vec, $dsm_csr, $y_vec)
+        _synchronize_backend($dsm_csr)
+    end
 
-    SUITE["Three-argument dot"][array_type_name]["COO"] =
-        @benchmarkable dot($x_vec, $dsm_coo, $y_vec)
+    SUITE["Three-argument dot"][array_type_name]["COO"] = @benchmarkable begin
+        dot($x_vec, $dsm_coo, $y_vec)
+        _synchronize_backend($dsm_coo)
+    end
 
     return nothing
 end
@@ -192,14 +210,20 @@ function benchmark_sparse_dense_add!(
     dense_mat = adapt(array_constructor, randn(T, N, N))
 
     # Level 3: Format (CSC, CSR, COO - will be plotted together)
-    SUITE["Sparse + Dense Addition"][array_type_name]["CSC"] =
-        @benchmarkable $dsm_csc + $dense_mat
-
-    SUITE["Sparse + Dense Addition"][array_type_name]["CSR"] =
-        @benchmarkable $dsm_csr + $dense_mat
-
-    SUITE["Sparse + Dense Addition"][array_type_name]["COO"] =
-        @benchmarkable $dsm_coo + $dense_mat
+    SUITE["Sparse + Dense Addition"][array_type_name]["CSC"] = @benchmarkable begin
+        $dsm_csc + $dense_mat
+        _synchronize_backend($dsm_csc)
+    end
+
+    SUITE["Sparse + Dense Addition"][array_type_name]["CSR"] = @benchmarkable begin
+        $dsm_csr + $dense_mat
+        _synchronize_backend($dsm_csr)
+    end
+
+    SUITE["Sparse + Dense Addition"][array_type_name]["COO"] = @benchmarkable begin
+        $dsm_coo + $dense_mat
+        _synchronize_backend($dsm_coo)
+    end
 
     return nothing
 end
```
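The `mul!` operation timed above follows the classic column-oriented traversal when the matrix is stored in CSC form. Below is a pure-CPU sketch of that algorithm (a simplification for illustration, not the package's kernel-based GPU code path), operating on raw CSC arrays with 1-based indices.

```julia
# y = A * x for a CSC matrix given as (colptr, rowval, nzval).
function csc_matvec!(y, colptr, rowval, nzval, x)
    fill!(y, zero(eltype(y)))
    for j in eachindex(x)                       # walk columns of A
        xj = x[j]
        for p in colptr[j]:(colptr[j + 1] - 1)  # nonzeros of column j
            y[rowval[p]] += nzval[p] * xj       # scatter A[i,j] * x[j] into y[i]
        end
    end
    return y
end

# A = [10 30; 20 0] in CSC form, x = [1, 2]
y = csc_matvec!(zeros(2), [1, 3, 4], [1, 2, 1], [10.0, 20.0, 30.0], [1.0, 2.0])
# y == [70.0, 20.0]
```

The scatter into `y[rowval[p]]` is what makes CSC mat-vec awkward to parallelize (multiple columns may update the same row), which is one reason the benchmarks compare CSC against CSR and COO on each backend.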

benchmarks/runbenchmarks.jl

Lines changed: 4 additions & 0 deletions
```diff
@@ -4,11 +4,15 @@ using SparseArrays
 using DeviceSparseArrays
 using Adapt
 using JLArrays
+using KernelAbstractions
 
 BLAS.set_num_threads(2)
 
 const SUITE = BenchmarkGroup()
 
+# Include utility functions
+include("benchmark_utils.jl")
+
 # Include benchmark files
 include("vector_benchmarks.jl")
 include("matrix_benchmarks.jl")
```

benchmarks/vector_benchmarks.jl

Lines changed: 8 additions & 3 deletions
```diff
@@ -24,7 +24,10 @@ function benchmark_vector_sum!(
     dsv = adapt(array_constructor, DeviceSparseVector(sv))
 
     # Level 3: Specific operation (will be plotted together)
-    SUITE["Sparse Vector"][array_type_name]["Sum"] = @benchmarkable sum($dsv)
+    SUITE["Sparse Vector"][array_type_name]["Sum"] = @benchmarkable begin
+        sum($dsv)
+        _synchronize_backend($dsv)
+    end
 
     return nothing
 end
@@ -58,8 +61,10 @@ function benchmark_vector_sparse_dense_dot!(
     dense_vec = adapt(array_constructor, randn(T, N))
 
     # Level 3: Specific operation (will be plotted together)
-    SUITE["Sparse Vector"][array_type_name]["Sparse-Dense dot"] =
-        @benchmarkable dot($dsv, $dense_vec)
+    SUITE["Sparse Vector"][array_type_name]["Sparse-Dense dot"] = @benchmarkable begin
+        dot($dsv, $dense_vec)
+        _synchronize_backend($dsv)
+    end
 
     return nothing
 end
```

0 commit comments
