Skip to content

Commit f6ff8e3

Browse files
committed
Add benchmarks for CUDA sparse propagate copy_xj
1 parent 56f9c5e commit f6ff8e3

File tree

2 files changed

+52
-0
lines changed

2 files changed

+52
-0
lines changed

GraphNeuralNetworks/perf/Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
[deps]
22
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
33
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
4+
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
5+
GNNGraphs = "aed8fd31-079b-4b5a-b342-a13352159b8c"
6+
GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48"
47
GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
8+
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
59
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
610
LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# # Activate the perf environment
2+
# using Pkg
3+
# Pkg.activate(@__DIR__)
4+
# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs"))
5+
# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNlib"))
6+
# Pkg.develop(path=joinpath(@__DIR__, ".."))
7+
# Pkg.instantiate()
8+
using SparseArrays
9+
using GraphNeuralNetworks
10+
using BenchmarkTools
11+
import Random: seed!
12+
using LinearAlgebra
13+
using Flux, CUDA
14+
15+
# ENV["JULIA_DEBUG"] = "GraphNeuralNetworks,GNNlib,GNNlibCUDAExt,GNNGraphs,GNNGraphsCUDAExt,CUDA" # packages with debugging enabled, don't put a whitespace between the package names
16+
17+
"""
    prop_copy_xj(graph_type, sp_p, n, feat_size; dev = gpu_device())

Benchmark `propagate(copy_xj, g, +)` on a random `n × n` sparse graph with
edge density `sp_p` and dense node features of size `feat_size`, comparing
the specialized path (spmm for `:sparse` graphs) against the generic
gather/scatter path. Results are printed via `@btime`; returns `nothing`.

# Arguments
- `graph_type`: `GNNGraph` storage backend to benchmark (e.g. `:sparse`, `:coo`).
- `sp_p`: sparsity (density) passed to `sprand`.
- `n`: number of nodes.
- `feat_size`: feature dimension of the node-feature matrix `B`.

# Keywords
- `dev`: device mover (defaults to `gpu_device()`); previously this was read
  from a non-const global, which hid the dependency and hurt inference.
"""
function prop_copy_xj(graph_type, sp_p, n, feat_size; dev = gpu_device())
    A = sprand(n, n, sp_p)    # random sparse adjacency with density sp_p
    b = rand(1, n)            # scalar per-node feature (kept for parity with other perf scripts)
    B = rand(feat_size, n)    # dense node-feature matrix (feat_size × n, column per node)
    g = GNNGraph(A,
                 ndata = (; b = b, B = B),
                 edata = (; A = reshape(A.nzval, 1, :)),  # edge weights = nonzeros of A
                 graph_type = graph_type) |> dev
    printstyled("propagate copy_xj for graph type: $graph_type", "\n", color=:yellow)
    # Warm-up call so compilation time is excluded from the measurement below.
    CUDA.@sync propagate(copy_xj, g, +; xj = g.ndata.B)
    @btime CUDA.@sync propagate($copy_xj, $g, +; xj = $g.ndata.B) # using spmm for :sparse
    printstyled("gather/scatter propagate copy_xj for graph type: $graph_type", "\n", color=:yellow)
    # An anonymous message function defeats the copy_xj specialization,
    # forcing the generic gather/scatter implementation. Warm up first.
    CUDA.@sync propagate((xi, xj, e) -> xj, g, +; xj = g.ndata.B)
    @btime CUDA.@sync propagate((xi, xj, e) -> xj, $g, +; xj = $g.ndata.B) # using gather/scatter
    return nothing
end
36+
37+
# Deterministic inputs for sprand/rand across runs.
seed!(0)

dev = gpu_device()
println("Device: ", dev)

feat_size = 128

# Sweep node counts and densities for the :sparse graph_type.
for n in (32, 128, 1024), sp_p in (0.01, 0.1, 0.9)
    printstyled("n = $n, feat_size = $feat_size, sparsity = $sp_p\n", color=:blue)
    prop_copy_xj(:sparse, sp_p, n, feat_size)
    println()
end

0 commit comments

Comments
 (0)