Updates for CUDA v4, KernelAbstractions v0.9 (#177)

vpuri3 · web-flow · commit bcf544ecb7e5 · 2023-10-16T13:26:23.000-04:00
* git ignore Manifest.toml

* add CUDA v4 test to CI

* rm CUDAKernels.jl

* add Pkg to test, rm CUDAKernels

* test Tullio with CUDA.jl v4

* CUDADevice -> CUDABackend, remove Events/waits

* kernelabstractions 0.9 compat

* rm debugging file

* properly scope CUDABackend

* print Pkg.status()

* Update runtests.jl

uncomment tests

* Update Project.toml

* rm cuda 3

* tensoroperations v4

* fix buildkite julia versions

* fix buildkite pipeline julia version

* clean up tests

* clean up ci
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -3,10 +3,10 @@ env:
   # SECRET_CODECOV_TOKEN: "..."
 
 steps:
-  - label: "Julia 1.6"
+  - label: "Julia 1.8"
     plugins:
       - JuliaCI/julia#v0.5:
-          version: 1.6
+          version: "1.8"
       - JuliaCI/julia-test#v0.3: ~
       # - JuliaCI/julia-coverage#v0.3:
       #     codecov: true
@@ -16,10 +16,10 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 60
 
-  - label: "Julia 1.8"
+  - label: "Julia 1.10"
     plugins:
       - JuliaCI/julia#v0.5:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-test#v0.3: ~
       # - JuliaCI/julia-coverage#v0.3:
       #     codecov: true
diff --git a/.github/workflows/ci-julia-nightly.yml b/.github/workflows/ci-julia-nightly.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -28,6 +28,7 @@ jobs:
         version:
           - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia
+          - 'nightly'
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+Manifest.toml
diff --git a/Project.toml b/Project.toml
@@ -22,34 +22,33 @@ TullioTrackerExt = "Tracker"
 TullioChainRulesCoreExt = "ChainRulesCore"
 
 [compat]
-CUDA = "3.6, 4"
-CUDAKernels = "0.3.3, 0.4"
+CUDA = "4, 5"
 ChainRulesCore = "1"
 DiffRules = "1"
 FillArrays = "0.11, 0.12, 0.13"
 ForwardDiff = "0.10"
-KernelAbstractions = "0.7.2, 0.8"
+KernelAbstractions = "0.7.2, 0.8, 0.9"
 LoopVectorization = "0.12.101"
 NamedDims = "0.2"
 OffsetArrays = "1"
 Requires = "1"
-TensorOperations = "3"
+TensorOperations = "4"
 Tracker = "0.2"
 VectorizationBase = "0.21.23"
 Zygote = "0.6.33"
-julia = "1.6"
+julia = "1.8"
 
 [extras]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
@@ -59,4 +58,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
+test = ["Test", "CUDA", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Pkg", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ But it also co-operates with various other packages, provided they are loaded be
 
 * It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.) On a good day this will match the speed of OpenBLAS for matrix multiplication.
 
-* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
 
 The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via  [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. <!-- or [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl). -->
 (Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:
@@ -237,7 +237,7 @@ using Tracker # or Zygote
 ΔA = Tracker.gradient((A,B) -> sum(mul(A, B)), A, B)[1]
 ΔA ≈ ones(3,500) * B' # true
 
-using CUDA, CUDAKernels, KernelAbstractions # Now defined with a GPU version:
+using CUDA, KernelAbstractions # Now defined with a GPU version:
 mul(A, B) = @tullio C[i,k] := A[i,j] * B[j,k]
 
 cu(A * B) ≈ mul(cu(A), cu(B)) # true
diff --git a/src/macro.jl b/src/macro.jl
@@ -1161,16 +1161,15 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
             end
             store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
             push!(store.outpre, macroexpand(store.mod, kex1))
-            if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
+            if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDA.CUDABackend()
                 info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
                 kex2 = quote
 
                     local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
                         $info2
-                        cu_kern! = $kernel(CUDADevice())
+                        cu_kern! = $kernel(CUDA.CUDABackend())
                         $(asserts...)
-                        $ACC = cu_kern!($(args...), $KEEP, $FINAL; ndrange=tuple($(sizes...)), workgroupsize=$workgroupsize, dependencies=Event(CUDADevice()))
-                        KernelAbstractions.wait(CUDADevice(), $ACC)
+                        $ACC = cu_kern!($(args...), $KEEP, $FINAL; ndrange=tuple($(sizes...)), workgroupsize=$workgroupsize)
                     end
 
                 end
@@ -1185,7 +1184,6 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
                     cpu_kern! = $kernel(CPU(), 4)
                     $(asserts...)
                     $ACC = cpu_kern!($(args...), $KEEP, $FINAL; ndrange=tuple($(sizes...)))
-                    KernelAbstractions.wait($ACC)
                 end
 
             end
diff --git a/test/cuda.jl b/test/cuda.jl
@@ -1,6 +1,6 @@
 
 using Tullio, Test
-using CUDA, CUDAKernels, KernelAbstractions
+using CUDA, KernelAbstractions
 CUDA.allowscalar(false)
 using Tracker, ForwardDiff
 @tullio grad=Base
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,4 +1,5 @@
 using Test, Printf
+import Pkg
 
 t1 = @elapsed using Tullio
 @info @sprintf("Loading Tullio took %.1f seconds", t1)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,5 @@` |
| `1` | `1` | `using Test, Printf` |
| | `2` | `+import Pkg` |
| `2` | `3` | |
| `3` | `4` | `t1 = @elapsed using Tullio` |
| `4` | `5` | `@info @sprintf("Loading Tullio took %.1f seconds", t1)` |