Commit c7c18a9

Merge pull request #58 from maleadt/cuda4
Updates for CUDA.jl 4.
2 parents bf49b9d + 0a970a5 · commit c7c18a9

File tree

10 files changed: +42 −40 lines

10 files changed

+42
-40
lines changed
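
The substance of the change set: as of CUDA.jl 4 the CUDNN wrappers live in the standalone cuDNN.jl package rather than the CUDA.CUDNN submodule, so the extension gains a cuDNN dependency and every "using CUDA.CUDNN: ..." becomes "using cuDNN: ...". A minimal sketch of the resulting import pattern, assuming CUDA.jl 4.x and cuDNN.jl are installed (the identifiers are taken from the diffs below):

    # Sketch of the CUDA.jl 4 import pattern this PR adopts.
    using CUDA, cuDNN                            # was: using CUDA, with wrappers under CUDA.CUDNN
    using cuDNN: cudnnSoftmaxForward!, handle    # was: using CUDA.CUDNN: cudnnSoftmaxForward!, handle

    cudnnversion() = cuDNN.version()             # was: cudnnversion() = CUDA.CUDNN.version()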

ext/NNlibCUDA/Project.toml

Lines changed: 3 additions & 1 deletion
@@ -9,10 +9,12 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

 [compat]
 Adapt = "3.3"
-CUDA = "3.11"
+cuDNN = "1"
+CUDA = "4"
 NNlib = "0.8.15"
 julia = "1.6"

ext/NNlibCUDA/src/NNlibCUDA.jl

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 module NNlibCUDA

 using NNlib
-using CUDA
+using CUDA, cuDNN
 using Random, Statistics

 const IntOrIntTuple = Union{Integer, NTuple{N,<:Integer} where N}

ext/NNlibCUDA/src/cudnn/activations.jl

Lines changed: 5 additions & 5 deletions
@@ -2,10 +2,10 @@
 # Activation

 using Base.Broadcast
-using CUDA.CUDNN: cudnnActivationForward!, cudnnOpTensor!,
-    CUDNN_ACTIVATION_TANH,CUDNN_ACTIVATION_SIGMOID,CUDNN_ACTIVATION_ELU,
-    CUDNN_ACTIVATION_RELU,CUDNN_ACTIVATION_CLIPPED_RELU,CUDNN_OP_TENSOR_MAX,
-    CUDNN_ACTIVATION_IDENTITY
+using cuDNN: cudnnActivationForward!, cudnnOpTensor!,
+    CUDNN_ACTIVATION_TANH, CUDNN_ACTIVATION_SIGMOID, CUDNN_ACTIVATION_ELU,
+    CUDNN_ACTIVATION_RELU, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_OP_TENSOR_MAX,
+    CUDNN_ACTIVATION_IDENTITY

 for (f, op) in [
     CUDA.tanh => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_TANH),

@@ -15,7 +15,7 @@ for (f, op) in [
     # NNlib.relu6 => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_CLIPPED_RELU, coef=6.0),
     # NNlib.leakyrelu => (src,dst)->cudnnOpTensor!(dst, src, src; op=CUDNN_OP_TENSOR_MAX, alpha1=0.01),
 ]
-
+
 @eval begin
     # in-place
     function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat},

ext/NNlibCUDA/src/cudnn/batchnorm.jl

Lines changed: 4 additions & 4 deletions
@@ -1,6 +1,6 @@
-using CUDA.CUDNN: CUDNN_BN_MIN_EPSILON, cudnnBatchNormalizationBackward,
-    cudnnBatchNormalizationForwardInference, CUDNN_BATCHNORM_SPATIAL,
-    cudnnBatchNormalizationForwardTraining
+using cuDNN: CUDNN_BN_MIN_EPSILON, cudnnBatchNormalizationBackward,
+    cudnnBatchNormalizationForwardInference, CUDNN_BATCHNORM_SPATIAL,
+    cudnnBatchNormalizationForwardTraining


 # TODO: replace with new cudnn normalization interface

@@ -116,7 +116,7 @@ function ∇batchnorm(g::DenseCuArray{T}, b::DenseCuArray{T}, x::DenseCuArray{T}
     if affine
         (dg, db, dx)
     else
-        # CUDNN always calculates dg and db, therefore we just have to drop them
+        # cuDNN always calculates dg and db, therefore we just have to drop them
         (nothing, nothing, dx)
     end
 end

ext/NNlibCUDA/src/cudnn/conv.jl

Lines changed: 11 additions & 11 deletions
@@ -2,11 +2,11 @@
 using NNlib: DenseConvDims
 import NNlib: conv!, ∇conv_filter!, ∇conv_data!, conv_bias_act!

-using CUDA.CUDNN: scalingParameter, CUDNN_CONVOLUTION, convdims,
-    cudnnConvolutionDescriptor, cudnnConvolutionBwdDataAlgoPerf,
-    cudnnConvolutionForward!, cudnnConvolutionBwdFilterAlgoPerf,
-    cudnnConvolutionBackwardData, cudnnConvolutionBackwardFilter,
-    cudnnConvolutionBackwardBias
+using cuDNN: scalingParameter, CUDNN_CONVOLUTION, convdims,
+    cudnnConvolutionDescriptor, cudnnConvolutionBwdDataAlgoPerf,
+    cudnnConvolutionForward!, cudnnConvolutionBwdFilterAlgoPerf,
+    cudnnConvolutionBackwardData, cudnnConvolutionBackwardFilter,
+    cudnnConvolutionBackwardBias

 const CUDNNFloat = Union{Float16,Float32,Float64}

@@ -24,22 +24,22 @@ function cudnnConvolutionDescriptorAndPaddedInput(cdims::DenseConvDims, x::Dense
     # which side of x to pad. Oh, and we use a CartesianIndex as we will mainly use this to index in x
     pad_manual = CartesianIndex(ntuple(i -> i > sdims ? 0 : pad[2(i-1)+1] - pad[2(i-1)+2], ndims(x)))

-    # How much we can let cudnn pad: The smallest padding amount between pad_left and pad_right, pad_top
+    # How much we can let cudnn pad: The smallest padding amount between pad_left and pad_right, pad_top
     # and pad_bottom etc. respectively
-    pad_cudnn = ntuple(i -> min(pad[2(i-1)+1], pad[2(i-1)+2]), sdims)
+    pad_cudnn = ntuple(i -> min(pad[2(i-1)+1], pad[2(i-1)+2]), sdims)

     x_padded_size = ntuple(i -> i <= sdims ? size(x, i) + abs(pad_manual[i]) : size(x ,i), ndims(x))
     x_padded = similar(x, x_padded_size)
     fill!(x_padded, 0)
     # This is a bit yucky, but we are basically figuring out where in x_padded we shall insert x
-    # Haven't benchmarked if this has any advantages over a more readable solution, e.g. writing dim
+    # Haven't benchmarked if this has any advantages over a more readable solution, e.g. writing dim
     # by dim to an array in a loop
     xIs = CartesianIndices(x)
     xI_first = first(xIs)
     xI_last = last(xIs)
     xIs_pad = max(xI_first, xI_first + pad_manual) : max(xI_last, xI_last + pad_manual)
-    x_padded[xIs_pad] = x
-
+    x_padded[xIs_pad] = x
+
     return cudnnConvolutionDescriptor(cdims, x_padded, pad_cudnn), x_padded, _x -> _x[xIs_pad]
 end

@@ -101,7 +101,7 @@ function ∇conv_data!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, w::DenseCuArray
     with_workspace(p.memory) do workspace
         cudnnConvolutionBackwardData(handle(), alpha, wDesc, w, yDesc, dy, convDesc, p.algo, workspace, sizeof(workspace), beta, xDesc, dx)
     end
-    return depad(dx)
+    return depad(dx)
 end

 function ∇conv_filter!(dw::DenseCuArray{T}, x::DenseCuArray{T}, dy::DenseCuArray{T},
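
The padding hunk above splits asymmetric padding into a part applied manually and a symmetric part delegated to cuDNN. A tiny illustration of that arithmetic for a single spatial dimension (pure shape math with hypothetical variable names, no GPU required):

    # Hypothetical one-dimensional illustration of the split done in
    # cudnnConvolutionDescriptorAndPaddedInput: pad = (pad_left, pad_right).
    pad = (2, 1)
    pad_manual = pad[1] - pad[2]        # +1: the asymmetric excess, handled by copying x into a padded buffer
    pad_cudnn  = min(pad[1], pad[2])    # 1: the symmetric amount cuDNN itself can apply on both sides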

ext/NNlibCUDA/src/cudnn/cudnn.jl

Lines changed: 4 additions & 4 deletions
@@ -1,8 +1,8 @@
-using CUDA.CUDNN: handle, with_workspace, cudnnTensorDescriptor, cudnnFilterDescriptor,
-    cudnnDataType, math_mode, CUDNN_DEFAULT_REORDER, CUDNN_CROSS_CORRELATION,
-    CUDNN_NOT_PROPAGATE_NAN, CUDNN_TENSOR_NCHW, dim4
+using cuDNN: handle, with_workspace, cudnnTensorDescriptor, cudnnFilterDescriptor,
+    cudnnDataType, math_mode, CUDNN_DEFAULT_REORDER, CUDNN_CROSS_CORRELATION,
+    CUDNN_NOT_PROPAGATE_NAN, CUDNN_TENSOR_NCHW, dim4

-cudnnversion() = CUDA.CUDNN.version()
+cudnnversion() = cuDNN.version()

 function nnlibPadding(dims)
     pd = NNlib.padding(dims)

ext/NNlibCUDA/src/cudnn/pooling.jl

Lines changed: 5 additions & 5 deletions
@@ -1,9 +1,9 @@
-using CUDA.CUDNN: cudnnPoolingMode_t, CUDNN_POOLING_MAX,
-    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
-    cudnnPoolingForward!, pooldims, cudnnPoolingBackward
-
+using cuDNN: cudnnPoolingMode_t, CUDNN_POOLING_MAX,
+    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
+    cudnnPoolingForward!, pooldims, cudnnPoolingBackward
+
 import NNlib: maxpool!, ∇maxpool!, meanpool!, ∇meanpool!
-import CUDA.CUDNN: cudnnPoolingDescriptor
+import cuDNN: cudnnPoolingDescriptor

 function cudnnPoolingDescriptor(pdims::PoolDims, x::DenseCuArray{T}, mode::cudnnPoolingMode_t) where T
     window, padding, stride = NNlib.kernel_size(pdims), nnlibPadding(pdims), NNlib.stride(pdims)

ext/NNlibCUDA/src/cudnn/softmax.jl

Lines changed: 7 additions & 7 deletions
@@ -1,9 +1,9 @@
 import NNlib: softmax, softmax!, ∇softmax, ∇softmax!,
     logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax!

-using CUDA.CUDNN: CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL,
-    CUDNN_SOFTMAX_FAST, CUDNN_SOFTMAX_ACCURATE, cudnnSoftmaxForward!,
-    cudnnSoftmaxBackward
+using cuDNN: CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL,
+    CUDNN_SOFTMAX_FAST, CUDNN_SOFTMAX_ACCURATE, cudnnSoftmaxForward!,
+    cudnnSoftmaxBackward

 # Softmax

@@ -43,8 +43,8 @@ function _∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims) where {T<:DenseCuArray}
     dx .= dy .- sum(dy; dims) .* exp.(y)
 end

-# Trick by @norci to use cudnn for softmax dims args that are contiguous:
-# If dims=(dmin:dmax) then CUDNN_SOFTMAX_MODE_CHANNEL does the trick with reshape
+# Trick by @norci to use cudnn for softmax dims args that are contiguous:
+# If dims=(dmin:dmax) then CUDNN_SOFTMAX_MODE_CHANNEL does the trick with reshape
 # (1, prod(size(x)[1:dmin-1]), prod(size(x)[dmin:dmax]), :)
 # softmaxdims returns nothing when the backup implementation should be used.

@@ -79,7 +79,7 @@ function ∇softmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArray{
     s === nothing && return _∇softmax!(dx, dy, x, y; dims)
     xDesc = cudnnTensorDescriptor(reshape(x,s))
     alpha, beta = scalingParameter(R,1), scalingParameter(R,0)
-    cudnnSoftmaxBackward(handle(), softmaxalgo(), CUDNN_SOFTMAX_MODE_CHANNEL,
+    cudnnSoftmaxBackward(handle(), softmaxalgo(), CUDNN_SOFTMAX_MODE_CHANNEL,
        alpha, xDesc, y, xDesc, dy, beta, xDesc, dx)
     return dx
 end

@@ -96,7 +96,7 @@ function ∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArr
     s === nothing && return _∇logsoftmax!(dx, dy, x, y; dims)
     xDesc = cudnnTensorDescriptor(reshape(x,s))
     alpha, beta = scalingParameter(R,1), scalingParameter(R,0)
-    cudnnSoftmaxBackward(handle(), CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL,
+    cudnnSoftmaxBackward(handle(), CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL,
        alpha, xDesc, y, xDesc, dy, beta, xDesc, dx)
     return dx
 end
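
The @norci reshape trick referenced in the comments above maps any contiguous dims range onto CUDNN_SOFTMAX_MODE_CHANNEL by folding the array into a 4-D shape whose third axis is the softmax axis. A small stand-alone sketch of the shape arithmetic (the helper name contiguous_softmax_shape is hypothetical, not part of NNlibCUDA; the package computes this inside softmaxdims):

    # Hypothetical helper mirroring the reshape (1, prod(size(x)[1:dmin-1]), prod(size(x)[dmin:dmax]), :)
    # described in the comments of the diff above.
    function contiguous_softmax_shape(x::AbstractArray, dims::UnitRange{Int})
        dmin, dmax = first(dims), last(dims)
        return (1, prod(size(x)[1:dmin-1]), prod(size(x)[dmin:dmax]), :)
    end

    x = rand(Float32, 3, 4, 5, 6)
    size(reshape(x, contiguous_softmax_shape(x, 2:3)...))   # (1, 3, 20, 6): dims 2:3 become the "channel" axis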

ext/NNlibCUDA/test/activations.jl

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ end

 # Broadcasting over complex CuArray works without NNlibCUDA, this test checks that
 # NNlibCUDA does not cause such operations to take a fast path which does not support
-# complex numbers (e.g. CUDNN)
+# complex numbers (e.g. cuDNN)
 @testset "complex" begin
     f(x) = tanh.(x)
     cs = rand(ComplexF64, 5)

ext/NNlibCUDA/test/softmax.jl

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
     gputest(NNlib.∇logsoftmax_data, dy, y2; dims=dims)

     # From NNlib 0.8.3, ∇softmax! is not used in the gradient.
-    # But NNlibCUDA still knows how to call CUDNN routines, let's test they agree:
+    # But NNlibCUDA still knows how to call cuDNN routines, let's test they agree:
     @test NNlib.∇softmax_data(dy, y; dims=dims) ≈ collect(∇softmax!(similar(cu(x)), cu(dy), cu(x), cu(y); dims=dims)) atol=1e-4
     @test NNlib.∇logsoftmax_data(dy, y2; dims=dims) ≈ collect(∇logsoftmax!(similar(cu(x)), cu(dy), cu(x), cu(y2); dims=dims)) atol=1e-4
     # (Note that ∇softmax! does not depend on x, it's just there to disambiguate from an even older signature.)
