Commit 92d3163

Remove CUDA dependence in favor of extension (#318)
* cuda extension
* fix
1 parent: f59ce44 · commit: 92d3163

14 files changed, +84 -94 lines
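
What the commit does: CUDA (and cuDNN) move out of the package's hard dependencies, and all CUDA-specific code moves into a package extension, GraphNeuralNetworksCUDAExt, which Julia (>= 1.9) loads automatically only when CUDA is also loaded. From the user's side this behaves roughly as follows (a minimal sketch; `Base.get_extension` is the standard way to check that an extension activated):

    using GraphNeuralNetworks    # no longer pulls in CUDA

    using CUDA                   # loading the weak dependency activates the extension

    # Optional sanity check that the extension module is now loaded:
    ext = Base.get_extension(GraphNeuralNetworks, :GraphNeuralNetworksCUDAExt)
    @assert ext isa Module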

Project.toml

Lines changed: 11 additions & 3 deletions
@@ -3,9 +3,14 @@ uuid = "cffab07f-9bc2-4db1-8861-388f63bf7694"
 authors = ["Carlo Lucibello and contributors"]
 version = "0.6.8"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+GraphNeuralNetworksCUDAExt = "CUDA"
+
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
@@ -22,7 +27,6 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 Adapt = "3"
@@ -46,12 +50,16 @@ julia = "1.9"
 [extras]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["Test", "Adapt", "DataFrames", "InlineStrings", "Zygote", "FiniteDifferences", "ChainRulesTestUtils", "MLDatasets"]
+test = ["Test", "Adapt", "DataFrames", "InlineStrings", "Zygote",
+        "FiniteDifferences", "ChainRulesTestUtils", "MLDatasets",
+        "CUDA", "cuDNN"]

ext/GraphNeuralNetworksCUDAExt/GNNGraphs/query.jl

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+
+GNNGraphs._rand_dense_vector(A::CUMAT_T) = CUDA.randn(size(A, 1))

ext/GraphNeuralNetworksCUDAExt/GNNGraphs/transform.jl

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+
+GNNGraphs.dense_zeros_like(a::CUMAT_T, T::Type, sz = size(a)) = CUDA.zeros(T, sz)

ext/GraphNeuralNetworksCUDAExt/GNNGraphs/utils.jl

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+
+GNNGraphs.iscuarray(x::AnyCuArray) = true
+
+
+function sort_edge_index(u::AnyCuArray, v::AnyCuArray)
+    #TODO proper cuda friendly implementation
+    sort_edge_index(u |> Flux.cpu, v |> Flux.cpu) |> Flux.gpu
+end
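
`iscuarray` is a function barrier: it lets graph code in src/ ask "is this a CUDA array?" without referencing any CUDA type. For `GNNGraphs.iscuarray` to be extendable like this, the parent package must own a generic fallback, presumably along these lines (an assumption for illustration; the actual definition lives in one of the changed files not shown in this excerpt):

    # Assumed generic fallback on the GNNGraphs side (illustration only):
    iscuarray(x::AbstractArray) = false

The `sort_edge_index` method is an explicit stop-gap: as its #TODO notes, it round-trips through the CPU via `Flux.cpu`/`Flux.gpu`, trading speed for correctness until a CUDA-friendly sort is implemented.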

ext/GraphNeuralNetworksCUDAExt/GraphNeuralNetworksCUDAExt.jl

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+module GraphNeuralNetworksCUDAExt
+
+using CUDA
+using Random, Statistics, LinearAlgebra
+using GraphNeuralNetworks
+using GraphNeuralNetworks.GNNGraphs
+using GraphNeuralNetworks.GNNGraphs: COO_T, ADJMAT_T, SPARSE_T
+import GraphNeuralNetworks: propagate
+
+const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}
+
+include("GNNGraphs/query.jl")
+include("GNNGraphs/transform.jl")
+include("GNNGraphs/utils.jl")
+include("msgpass.jl")
+
+end #module
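
Two details let the extension add methods to the parent package: `import GraphNeuralNetworks: propagate` allows `msgpass.jl` to define `propagate` methods unqualified, and definitions written with a qualified name (`GNNGraphs._rand_dense_vector`, `GNNGraphs.dense_zeros_like`, `GNNGraphs.iscuarray`) attach methods to functions GNNGraphs already owns. The mechanism in miniature (toy modules, not from this codebase):

    module Parent
    f(x::Int) = "generic"              # function owned by the parent package
    end

    module ParentExt                   # stands in for the package extension
    using ..Parent
    Parent.f(x::Float64) = "special"   # qualified name adds a method to Parent.f
    end

    Parent.f(1)      # "generic"
    Parent.f(1.0)    # "special"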

ext/GraphNeuralNetworksCUDAExt/msgpass.jl

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+
+###### PROPAGATE SPECIALIZATIONS ####################
+
+## COPY_XJ
+
+## avoid the fast path on gpu until we have better cuda support
+function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
+                   xi, xj::AnyCuMatrix, e)
+    propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
+end
+
+## E_MUL_XJ
+
+## avoid the fast path on gpu until we have better cuda support
+function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
+                   xi, xj::AnyCuMatrix, e::AbstractVector)
+    propagate((xi, xj, e) -> e_mul_xj(xi, xj, e), g, +, xi, xj, e)
+end
+
+## W_MUL_XJ
+
+## avoid the fast path on gpu until we have better cuda support
+function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
+                   xi, xj::AnyCuMatrix, e::Nothing)
+    propagate((xi, xj, e) -> w_mul_xj(xi, xj, e), g, +, xi, xj, e)
+end
+
+# function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
+#     A = adjacency_matrix(g, weighted=false)
+#     D = compute_degree(A)
+#     return xj * A * D
+# end
+
+# # Zygote bug. Error with sparse matrix without nograd
+# compute_degree(A) = Diagonal(1f0 ./ vec(sum(A; dims=2)))
+
+# Flux.Zygote.@nograd compute_degree
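
These three methods exist only to opt out of an optimization: for known message functions such as `copy_xj`, `propagate` has fast-path methods that dispatch on `::typeof(copy_xj)`, and the comments above say those paths are avoided on the GPU until CUDA support improves. Wrapping the built-in in an anonymous function produces an argument whose type is no longer `typeof(copy_xj)`, so dispatch falls through to the generic gather/scatter implementation. The trick in two lines:

    # The closure has its own type, so methods specialized on ::typeof(copy_xj)
    # no longer match, and the generic propagate method is selected instead:
    f = (xi, xj, e) -> copy_xj(xi, xj, e)
    typeof(f) === typeof(copy_xj)    # false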

src/GNNGraphs/GNNGraphs.jl

Lines changed: 1 addition & 2 deletions
@@ -2,7 +2,6 @@ module GNNGraphs
 
 using SparseArrays
 using Functors: @functor
-using CUDA
 import Graphs
 using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree,
               has_self_loops, is_directed
@@ -15,7 +14,7 @@ import KrylovKit
 using ChainRulesCore
 using LinearAlgebra, Random, Statistics
 import MLUtils
-using MLUtils: getobs, numobs
+using MLUtils: getobs, numobs, ones_like, zeros_like
 import Functors
 
 include("chainrules.jl") # hacks for differentiability
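
Importing `ones_like`/`zeros_like` from MLUtils is what lets device-specific allocation leave this module: the helpers allocate "an array like this one", so the array family (CPU or GPU) comes from the prototype instead of being hard-coded. A quick sketch of the standard MLUtils behavior:

    using MLUtils: zeros_like

    a = rand(Float32, 3, 4)
    zeros_like(a)                     # 3×4 Matrix{Float32} of zeros
    zeros_like(a, Float64, (2, 2))    # same array family, new eltype and size
    # With a CuArray prototype the result is a CuArray; no CUDA import is needed here.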

src/GNNGraphs/abstracttypes.jl

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ const COO_T = Tuple{T, T, V} where {T <: AbstractVector{<:Integer}, V}
 const ADJLIST_T = AbstractVector{T} where {T <: AbstractVector{<:Integer}}
 const ADJMAT_T = AbstractMatrix
 const SPARSE_T = AbstractSparseMatrix # subset of ADJMAT_T
-const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}
 
 const AVecI = AbstractVector{<:Integer}
 
src/GNNGraphs/gatherscatter.jl

Lines changed: 0 additions & 57 deletions
@@ -16,60 +16,3 @@ function _scatter(aggr,
     dstsize = (size(src)[1:(end - 1)]..., n)
     return NNlib.scatter(aggr, src, idx; dstsize)
 end
-
-## TO MOVE TO NNlib ######################################################
-
-### Considers the src a zero dimensional object.
-### Useful for implementing `StatsBase.counts`, `degree`, etc...
-### function NNlib.scatter!(op, dst::AbstractArray, src::Number, idx::AbstractArray)
-###     for k in CartesianIndices(idx)
-###         # dst_v = NNlib._view(dst, idx[k])
-###         # dst_v .= (op).(dst_v, src)
-###         dst[idx[k]] .= (op).(dst[idx[k]], src)
-###     end
-###     dst
-### end
-
-# 10 times faster than the generic version above.
-# All the speedup comes from not broadcasting `op`, i dunno why.
-# function NNlib.scatter!(op, dst::AbstractVector, src::Number, idx::AbstractVector{<:Integer})
-#     for i in idx
-#         dst[i] = op(dst[i], src)
-#     end
-# end
-
-## NNlib._view(X, k) = view(X, k...)
-## NNlib._view(X, k::Union{Integer, CartesianIndex}) = view(X, k)
-#
-## Considers src as a zero dimensional object to be scattered
-## function NNlib.scatter(op,
-##                        src::Tsrc,
-##                        idx::AbstractArray{Tidx,Nidx};
-##                        init = nothing, dstsize = nothing) where {Tsrc<:Number,Tidx,Nidx}
-##     dstsz = isnothing(dstsize) ? maximum_dims(idx) : dstsize
-##     dst = similar(src, Tsrc, dstsz)
-##     xinit = isnothing(init) ? scatter_empty(op, Tsrc) : init
-##     fill!(dst, xinit)
-##     scatter!(op, dst, src, idx)
-## end
-
-# function scatter_scalar_kernel!(op, dst, src, idx)
-#     index = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-
-#     @inbounds if index <= length(idx)
-#         CUDA.@atomic dst[idx[index]...] = op(dst[idx[index]...], src)
-#     end
-#     return nothing
-# end
-
-# function NNlib.scatter!(op, dst::AnyCuArray, src::Number, idx::AnyCuArray)
-#     max_idx = length(idx)
-#     args = op, dst, src, idx
-
-#     kernel = @cuda launch=false scatter_scalar_kernel!(args...)
-#     config = launch_configuration(kernel.fun; max_threads=256)
-#     threads = min(max_idx, config.threads)
-#     blocks = cld(max_idx, threads)
-#     kernel(args...; threads=threads, blocks=blocks)
-#     return dst
-# end

src/GNNGraphs/query.jl

Lines changed: 1 addition & 2 deletions
@@ -181,7 +181,7 @@ If `weighted=true`, the `A` will contain the edge weights if any, otherwise the
 """
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType = eltype(g); dir = :out,
                                  weighted = true)
-    if g.graph[1] isa CuVector
+    if iscuarray(g.graph[1])
         # Revisit after
         # https://github.com/JuliaGPU/CUDA.jl/issues/1113
         A, n, m = to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
@@ -448,7 +448,6 @@ function _eigmax(A)
 end
 
 _rand_dense_vector(A::AbstractMatrix{T}) where {T} = randn(float(T), size(A, 1))
-_rand_dense_vector(A::CUMAT_T) = CUDA.randn(size(A, 1))
 
 # Eigenvalues for cuarray don't seem to be well supported.
 # https://github.com/JuliaGPU/CUDA.jl/issues/154
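
The deleted GPU method is the same one added in the extension's GNNGraphs/query.jl above. A device-specific method is needed because the generic fallback allocates a CPU vector, which would not live on the same device as a GPU matrix in the subsequent products. Roughly (a sketch that assumes CUDA is loaded; not code from this diff):

    A = CUDA.rand(Float32, 4, 4)
    v_cpu = randn(Float32, size(A, 1))   # CPU Vector, mismatched with A
    v_gpu = CUDA.randn(size(A, 1))       # CuVector on the same device, so A * v_gpu works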
