diff --git a/Project.toml b/Project.toml
index cd47a89..7fd9f0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,12 +11,14 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 CUDA = "2, 3"
-CUDAKernels = "0.1, 0.2"
+CUDAKernels = "0.1, 0.2, 0.3"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
+FLoops = "0.1.10"
+FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
-KernelAbstractions = "0.6"
+KernelAbstractions = "0.6, 0.7"
 LoopVectorization = "0.12.48"
 NamedDims = "0.2"
 OffsetArrays = "1"
@@ -31,6 +33,8 @@ julia = "1.5"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+FoldsCUDA = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -46,4 +50,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
+test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "FLoops", "FoldsCUDA", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
diff --git a/README.md b/README.md
index 2178548..6854d5b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,9 @@ But it also co-operates with various other packages, provided they are loaded be
 
 * It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.) On a good day this will match the speed of OpenBLAS for matrix multiplication.
 
-* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+* It can use [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+
+* It can also use [`FLoops.@floop`](https://github.com/JuliaFolds/FLoops.jl), in particular to execute using [FoldsCUDA.jl](https://github.com/JuliaFolds/FoldsCUDA.jl) on the GPU.
 
 The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. (Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:
 
diff --git a/src/macro.jl b/src/macro.jl
index 0f0aa0b..60765aa 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -102,6 +102,7 @@ OPTS = Dict(
     :grad => [false, :Base, :Dual],
     :avx => Integer,
     :cuda => Integer,
+    :floops => [true, false],
     :tensor => [true, false],
     )
 
@@ -111,6 +112,7 @@ _THREADS = Ref{Any}(true)
 _GRAD = Ref{Any}(:Base)
 _AVX = Ref{Any}(true)
 _CUDA = Ref{Any}(true)
+_FLOOPS = Ref{Any}(false)
 
 function parse_options(exs...)
     opts = Dict{Symbol,Any}(
@@ -123,6 +125,7 @@ function parse_options(exs...)
         :grad => _GRAD[],
         :avx => _AVX[],
         :cuda => _CUDA[],
+        :floops => _FLOOPS[],
         :tensor => false,
         )
     expr = nothing
@@ -178,6 +181,7 @@ function parse_options(exs...)
         _GRAD[] = opts[:grad]
         _AVX[] = opts[:avx]
         _CUDA[] = opts[:cuda]
+        _FLOOPS[] = opts[:floops]
     end
     opts[:tensor] == false || @warn "option tensor=true is deprecated, try Tullio.@tensor"
     (redfun=opts[:redfun],
@@ -189,6 +193,7 @@ function parse_options(exs...)
     grad=opts[:grad],
     avx=opts[:avx],
     cuda=opts[:cuda],
+    floops=opts[:floops],
     nograd=nograd,
     ), ranges, expr
 end
@@ -1059,6 +1064,42 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         store.verbose>0 && @warn "can't parallelise this gradient, no shared indices $note"
     end
 
+    #===== FLoops =====#
+
+    if store.floops != false && isdefined(store.mod, :FLoops)
+        try
+            info1 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex1 = quote
+
+                local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                    $info1
+                    FLoops.@floop begin $ex1; $ex2 end
+                end
+
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex1)
+            if store.threads==false
+                # same dodgy switch as for KernelAbstractions, threads=false routes CPU calculation here:
+                push!(store.outpre, macroexpand(store.mod, fex1))
+            end
+            if isdefined(store.mod, :FoldsCUDA) && isdefined(store.mod, :CUDA)
+                info2 = store.verbose>0 ? :(@info "running FLoops + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+                fex2 = quote
+
+                    local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                        $info2
+                        FLoops.@floop FoldsCUDA.CUDAEx() begin $ex1; $ex2 end
+                    end
+
+                end
+                push!(store.outpre, macroexpand(store.mod, fex2))
+            end
+            store.verbose==2 && @info "success expanding FLoops.@floop"
+        catch err
+            store.verbose>0 && @warn "FLoops failed $note" err
+        end
+    end
+
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)
@@ -1159,11 +1200,11 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         end
         store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
         push!(store.outpre, macroexpand(store.mod, kex1))
-        if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
+        if isdefined(store.mod, :CUDA)
             info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
             kex2 = quote
 
-                local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
                     $info2
                     cu_kern! = $kernel(CUDADevice())
                     $(asserts...)
diff --git a/test/cuda.jl b/test/cuda.jl
index 2c1624a..0413a3b 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,7 +1,11 @@
 using Tullio, Test
-using CUDA, CUDAKernels, KernelAbstractions
+using CUDA
 CUDA.allowscalar(false)
+
+# using CUDAKernels, KernelAbstractions
+# using FoldsCUDA, FLoops
+
 
 using Tracker, ForwardDiff
 
 @tullio grad=Base
diff --git a/test/group-2.jl b/test/group-2.jl
index 6381206..3ddaca3 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -4,7 +4,6 @@ t4 = time()
 
 using KernelAbstractions
 using Tracker
-
 GRAD = :Tracker
 _gradient(x...) = Tracker.gradient(x...)
 
@@ -23,20 +22,62 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-using CUDA
+using CUDA, CUDAKernels
 
 if is_buildkite
     # If we are on Buildkite, we should assert that we have a CUDA GPU available
     @test CUDA.has_cuda_gpu()
 end
 
-if CUDA.has_cuda_gpu()
+if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
-    @testset "===== CUDA tests on GPU =====" begin
+    @testset "===== KernelAbstractions CUDA tests on GPU =====" begin
         include("cuda.jl")
     end
+else
+    @info "===== skipping KernelAbstractions + CUDA tests ====="
 end
 
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
 
 @tullio cuda=false
+
+#===== FLoops =====#
+
+t5 = time()
+using FLoops
+@tullio floops=true
+
+using Tracker
+GRAD = :Tracker
+_gradient(x...) = Tracker.gradient(x...)
+
+@testset "FLoops + parsing + gradients" begin
+    A = (rand(3,4));
+    B = (rand(4,5));
+    @tullio C[i,k] := A[i,j] * B[j,k] threads=false verbose=1
+    @test C ≈ A * B
+
+    @tullio threads=false
+    include("parsing.jl")
+    include("gradients.jl")
+    @tullio threads=true
+
+    for sy in Tullio.SYMBOLS
+        @test !isdefined(@__MODULE__, sy)
+    end
+end
+
+using CUDA, FoldsCUDA
+
+if CUDA.has_cuda_gpu()
+    @info "===== found a GPU, starting CUDA tests ====="
+    @testset "===== FLoops + FoldsCUDA tests on GPU =====" begin
+        include("cuda.jl")
+    end
+end
+
+@info @sprintf("FLoops tests took %.1f seconds", time()-t5)
+
+@tullio floops=false
+
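#===== usage sketch (not part of the patch) =====#

# A rough illustration of what the new `floops=true` option is intended to enable, pieced
# together from the hunks above (the OPTS entry in src/macro.jl and the new test in
# test/group-2.jl). The option name and the isdefined(FLoops)/isdefined(FoldsCUDA) checks
# come from the patch; the calls below are an untested assumption, not verified behaviour.

using Tullio, FLoops               # FLoops must be loaded for the FLoops actor to be generated

A, B = rand(3,40), rand(40,5)
@tullio C[i,k] := A[i,j] * B[j,k] floops=true threads=false   # threads=false routes the CPU path through FLoops.@floop
@assert C ≈ A * B

using CUDA, FoldsCUDA              # both must be loaded for the CuArray actor to be generated
if CUDA.has_cuda_gpu()
    Ag, Bg = cu(A), cu(B)
    @tullio Cg[i,k] := Ag[i,j] * Bg[j,k] floops=true          # should dispatch to the FoldsCUDA.CUDAEx() actor
end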