10 changes: 7 additions & 3 deletions Project.toml
@@ -11,12 +11,14 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[compat]
CUDA = "2, 3"
CUDAKernels = "0.1, 0.2"
CUDAKernels = "0.1, 0.2, 0.3"
ChainRulesCore = "0.10"
DiffRules = "1"
FillArrays = "0.11"
FLoops = "0.1.10"
FoldsCUDA = "0.1.5"
ForwardDiff = "0.10"
KernelAbstractions = "0.6"
KernelAbstractions = "0.6, 0.7"
LoopVectorization = "0.12.48"
NamedDims = "0.2"
OffsetArrays = "1"
@@ -31,6 +33,8 @@ julia = "1.5"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
FoldsCUDA = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -46,4 +50,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "FLoops", "FoldsCUDA", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
4 changes: 3 additions & 1 deletion README.md
@@ -25,7 +25,9 @@ But it also co-operates with various other packages, provided they are loaded be

* It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.) On a good day this will match the speed of OpenBLAS for matrix multiplication.

* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
* It can use [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.

* It can also use [`FLoops.@floop`](https://github.com/JuliaFolds/FLoops.jl), in particular to execute using [FoldsCUDA.jl](https://github.com/JuliaFolds/FoldsCUDA.jl) on the GPU.

The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. <!-- or [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl). -->
(Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:
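Following the new README bullet about FLoops and FoldsCUDA, a minimal usage sketch. It assumes the `floops=true` option added in src/macro.jl below (the option defaults to `false`, and `threads=false` routes the CPU calculation through the FLoops actor); the GPU lines additionally assume CUDA.jl and FoldsCUDA.jl are loaded, as in test/group-2.jl.

```julia
using Tullio, FLoops        # FLoops must be loaded before the @tullio call

A, B = rand(3,4), rand(4,5)

# floops=true enables the FLoops actor added in this PR (it defaults to false);
# threads=false sends the CPU calculation through FLoops.@floop instead of Tullio's own threads.
@tullio C[i,k] := A[i,j] * B[j,k]  floops=true threads=false
C ≈ A * B   # true

# With CUDA.jl and FoldsCUDA.jl also loaded, the same expression on CuArrays should
# pick up the FLoops.CUDAEx() executor (untested sketch):
# using CUDA, FoldsCUDA
# Agpu, Bgpu = cu(A), cu(B)
# @tullio Cgpu[i,k] := Agpu[i,j] * Bgpu[j,k]  floops=true
```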
45 changes: 43 additions & 2 deletions src/macro.jl
@@ -102,6 +102,7 @@ OPTS = Dict(
:grad => [false, :Base, :Dual],
:avx => Integer,
:cuda => Integer,
:floops => [true, false],
:tensor => [true, false],
)

@@ -111,6 +112,7 @@ _THREADS = Ref{Any}(true)
_GRAD = Ref{Any}(:Base)
_AVX = Ref{Any}(true)
_CUDA = Ref{Any}(true)
_FLOOPS = Ref{Any}(false)

function parse_options(exs...)
opts = Dict{Symbol,Any}(
@@ -123,6 +125,7 @@ function parse_options(exs...)
:grad => _GRAD[],
:avx => _AVX[],
:cuda => _CUDA[],
:floops => _FLOOPS[],
:tensor => false,
)
expr = nothing
@@ -178,6 +181,7 @@ function parse_options(exs...)
_GRAD[] = opts[:grad]
_AVX[] = opts[:avx]
_CUDA[] = opts[:cuda]
_FLOOPS[] = opts[:floops]
end
opts[:tensor] == false || @warn "option tensor=true is deprecated, try Tullio.@tensor"
(redfun=opts[:redfun],
@@ -189,6 +193,7 @@ function parse_options(exs...)
grad=opts[:grad],
avx=opts[:avx],
cuda=opts[:cuda],
floops=opts[:floops],
nograd=nograd,
), ranges, expr
end
@@ -1059,6 +1064,42 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
store.verbose>0 && @warn "can't parallelise this gradient, no shared indices $note"
end

#===== FLoops =====#

if store.floops != false && isdefined(store.mod, :FLoops)
try
info1 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
fex1 = quote

local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
$info1
FLoops.@floop begin $ex1; $ex2 end
end

end
store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex1)
if store.threads==false
# same dodgy switch as for KernelAbstractions, threads=false routes CPU calculation here:
push!(store.outpre, macroexpand(store.mod, fex1))
end
if isdefined(store.mod, :FoldsCUDA) && isdefined(store.mod, :CUDA)
info2 = store.verbose>0 ? :(@info "running FLoops + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
fex2 = quote

local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
$info2
FLoops.@floop FLoops.CUDAEx() begin $ex1; $ex2 end
end

end
push!(store.outpre, macroexpand(store.mod, fex2))
end
store.verbose==2 && @info "success expanding FLoops.@floops"
catch err
store.verbose>0 && @warn "FLoops failed $note" err
end
end

#===== LoopVectorization =====#

expre, exloop0, expost = if isempty(outer)
@@ -1159,11 +1200,11 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
end
store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
push!(store.outpre, macroexpand(store.mod, kex1))
if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
if isdefined(store.mod, :CUDA)
info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
kex2 = quote

local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
$info2
cu_kern! = $kernel(CUDADevice())
$(asserts...)
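For orientation, a hand-written approximation of the loop that the new FLoops actor above runs on the CPU for `C[i,k] := A[i,j] * B[j,k]`. The real actor is assembled from the `$ex1`/`$ex2` expressions and also handles `$KEEP`, `$FINAL`, and the index ranges passed in, so this is only a sketch of the shape, not the literal macro output.

```julia
using FLoops

# Rough shape of the CPU FLoops actor for C[i,k] := A[i,j] * B[j,k]:
# the outer loop goes through FLoops.@floop (an executor such as ThreadedEx(),
# or CUDAEx() via FoldsCUDA, can be passed to it); the reduction over j
# stays as an ordinary inner loop writing into C.
function mul_floop!(C, A, B)
    @floop for i in axes(A, 1)
        for k in axes(B, 2)
            acc = zero(eltype(C))
            for j in axes(A, 2)
                acc += A[i, j] * B[j, k]
            end
            C[i, k] = acc
        end
    end
    return C
end

A, B = rand(3, 4), rand(4, 5)
C = zeros(3, 5)
mul_floop!(C, A, B)
C ≈ A * B   # true
```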
6 changes: 5 additions & 1 deletion test/cuda.jl
@@ -1,7 +1,11 @@

using Tullio, Test
using CUDA, CUDAKernels, KernelAbstractions
using CUDA
CUDA.allowscalar(false)

# using CUDAKernels, KernelAbstractions
# using FoldsCUDA, FLoops

using Tracker, ForwardDiff
@tullio grad=Base

49 changes: 45 additions & 4 deletions test/group-2.jl
@@ -4,7 +4,6 @@ t4 = time()
using KernelAbstractions

using Tracker

GRAD = :Tracker
_gradient(x...) = Tracker.gradient(x...)

@@ -23,20 +22,62 @@ _gradient(x...) = Tracker.gradient(x...)
end
end

using CUDA
using CUDA, CUDAKernels

if is_buildkite
# If we are on Buildkite, we should assert that we have a CUDA GPU available
@test CUDA.has_cuda_gpu()
end

if CUDA.has_cuda_gpu()
if false # CUDA.has_cuda_gpu()
@info "===== found a GPU, starting CUDA tests ====="
@testset "===== CUDA tests on GPU =====" begin
@testset "===== KernelAbstractions CUDA tests on GPU =====" begin
include("cuda.jl")
end
else
@info "===== skipping KernelAbstractions + CUDA tests ====="
end

@info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)

@tullio cuda=false

#===== FLoops =====#

t5 = time()
using FLoops
@tullio floops=true

using Tracker
GRAD = :Tracker
_gradient(x...) = Tracker.gradient(x...)

@testset "FLoops + parsing + gradients" begin
A = (rand(3,4));
B = (rand(4,5));
@tullio C[i,k] := A[i,j] * B[j,k] threads=false verbose=1
@test C ≈ A * B

@tullio threads=false
include("parsing.jl")
include("gradients.jl")
@tullio threads=true

for sy in Tullio.SYMBOLS
@test !isdefined(@__MODULE__, sy)
end
end

using CUDA, FoldsCUDA

if CUDA.has_cuda_gpu()
@info "===== found a GPU, starting CUDA tests ====="
@testset "===== FLoops + FoldsCUDA tests on GPU =====" begin
include("cuda.jl")
end
end

@info @sprintf("FLoops tests took %.1f seconds", time()-t5)

@tullio floops=false