diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 76abb0efb..f80d0651b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,7 +48,7 @@ jobs:
           show-versioninfo: true
       - uses: julia-actions/cache@v2
       - run: |
-          julia -e '@static if VERSION >= v"1.10"
+          julia -e '
             using Pkg
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
               Pkg.activate("test")
@@ -61,11 +61,9 @@ jobs:
               try
                 Pkg.develop([PackageSpec("Enzyme"), PackageSpec("EnzymeCore")])
               catch err
                 @error "Could not install Enzyme" exception=(err,catch_backtrace())
-                exit(3)
               end
             end
-          end
           '
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
diff --git a/Project.toml b/Project.toml
index dddeefb96..85e8989b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -21,7 +21,7 @@ UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 [compat]
 Adapt = "0.4, 1.0, 2.0, 3.0, 4"
 Atomix = "0.1"
-EnzymeCore = "0.8.1"
+EnzymeCore = "0.7, 0.8.1"
 InteractiveUtils = "1.6"
 LinearAlgebra = "1.6"
 MacroTools = "0.5"
diff --git a/ext/EnzymeCore07Ext.jl b/ext/EnzymeCore07Ext.jl
new file mode 100644
index 000000000..1998b020b
--- /dev/null
+++ b/ext/EnzymeCore07Ext.jl
@@ -0,0 +1,346 @@
+# Enzyme rules for KernelAbstractions kernels written against the EnzymeCore 0.7 API.
+# EnzymeExt.jl includes this file when `EnzymeCore.set_runtime_activity` is not defined;
+# otherwise it includes EnzymeCore08Ext.jl.
+
+# https://github.com/EnzymeAD/Enzyme.jl/issues/1516
+# On the CPU `autodiff_deferred` can deadlock.
+# Hence a specialized CPU version
+function cpu_fwd(ctx, f, args...)
+    EnzymeCore.autodiff(Forward, Const(f), Const{Nothing}, Const(ctx), args...)
+    return nothing
+end
+
+function gpu_fwd(ctx, f, args...)
+    EnzymeCore.autodiff_deferred(Forward, Const(f), Const{Nothing}, Const(ctx), args...)
+    return nothing
+end
+
+function EnzymeRules.forward(
+        func::Const{<:Kernel{CPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
+    kernel = func.val
+    f = kernel.f
+    fwd_kernel = similar(kernel, cpu_fwd)
+
+    fwd_kernel(f, args...; ndrange, workgroupsize)
+end
+
+function EnzymeRules.forward(
+        func::Const{<:Kernel{<:GPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
+    kernel = func.val
+    f = kernel.f
+    fwd_kernel = similar(kernel, gpu_fwd)
+
+    fwd_kernel(f, args...; ndrange, workgroupsize)
+end
+
+_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) =
+    mkcontext(kernel, first(blocks(iterspace)), ndrange, iterspace, dynamic)
+_enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) =
+    mkcontext(kernel, ndrange, iterspace)
+
+_augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) =
+    AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}(
+    nothing,
+    nothing,
+    (subtape, arg_refs, tape_type),
+)
+_augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) =
+    AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type))
+
+function _create_tape_kernel(
+        kernel::Kernel{CPU},
+        ModifiedBetween,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    TapeType = EnzymeCore.tape_type(
+        ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween),
+        FT,
+        Const{Nothing},
+        Const{ctxTy},
+        map(Core.Typeof, args2)...,
+    )
+    subtape = Array{TapeType}(undef, size(blocks(iterspace)))
+    aug_kernel = similar(kernel, cpu_aug_fwd)
+    return TapeType, subtape, aug_kernel
+end
+
+function _create_tape_kernel(
+        kernel::Kernel{<:GPU},
+        ModifiedBetween,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    # For peeking at the TapeType we need to first construct a correct compilation job
+    # and that requires the device-side representation of the arguments.
+    # So we convert the arguments here, this is a bit wasteful since the `aug_kernel` call
+    # will later do the same.
+    dev_args2 = ((argconvert(kernel, a) for a in args2)...,)
+    dev_TT = map(Core.Typeof, dev_args2)
+
+    job =
+        EnzymeCore.compiler_job_from_backend(backend(kernel), typeof(() -> return), Tuple{})
+    TapeType = EnzymeCore.tape_type(
+        job,
+        ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween),
+        FT,
+        Const{Nothing},
+        Const{ctxTy},
+        dev_TT...,
+    )
+
+    # Allocate per thread
+    subtape = allocate(backend(kernel), TapeType, prod(ndrange))
+
+    aug_kernel = similar(kernel, gpu_aug_fwd)
+    return TapeType, subtape, aug_kernel
+end
+
+_create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev)
+_create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev)
+
+function cpu_aug_fwd(
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
+    # A2 = Const{Nothing} -- since f->Nothing
+    forward, _ = EnzymeCore.autodiff_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+
+    # On the CPU: F is a per block function
+    # On the CPU: subtape is an Array{TapeType} with one slot per block
+    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
+    subtape[I] = forward(Const(f), Const(ctx), args...)[1]
+    return nothing
+end
+
+function cpu_rev(
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
+    _, reverse = EnzymeCore.autodiff_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
+    tp = subtape[I]
+    reverse(Const(f), Const(ctx), args..., tp)
+    return nothing
+end
+
+# GPU support
+function gpu_aug_fwd(
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
+    # A2 = Const{Nothing} -- since f->Nothing
+    forward, _ = EnzymeCore.autodiff_deferred_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+
+    # On the GPU: F is a per thread function
+    # On the GPU: subtape holds prod(ndrange) tapes, indexed by global linear index
+    if __validindex(ctx)
+        I = __index_Global_Linear(ctx)
+        subtape[I] = forward(Const(f), Const(ctx), args...)[1]
+    end
+    return nothing
+end
+
+function gpu_rev(
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
+    # XXX: TapeType and A2 as args to autodiff_deferred_thunk
+    _, reverse = EnzymeCore.autodiff_deferred_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+    if __validindex(ctx)
+        I = __index_Global_Linear(ctx)
+        tp = subtape[I]
+        reverse(Const(f), Const(ctx), args..., tp)
+    end
+    return nothing
+end
+
+function EnzymeRules.augmented_primal(
+        config::Config,
+        func::Const{<:Kernel},
+        ::Type{Const{Nothing}},
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
+    kernel = func.val
+    f = kernel.f
+
+    ndrange, workgroupsize, iterspace, dynamic =
+        launch_config(kernel, ndrange, workgroupsize)
+    ctx = _enzyme_mkcontext(kernel, ndrange, iterspace, dynamic)
+    ctxTy = Core.Typeof(ctx) # CompilerMetadata{ndrange(kernel), Core.Typeof(dynamic)}
+    # TODO autodiff_deferred on the func.val
+    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
+
+    FT = Const{Core.Typeof(f)}
+
+    arg_refs = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            if func.val isa Kernel{<:GPU}
+                error("Active kernel arguments not supported on GPU")
+            else
+                Ref(EnzymeCore.make_zero(args[i].val))
+            end
+        else
+            nothing
+        end
+    end
+    args2 = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            MixedDuplicated(args[i].val, arg_refs[i])
+        else
+            args[i]
+        end
+    end
+
+    TapeType, subtape, aug_kernel = _create_tape_kernel(
+        kernel,
+        ModifiedBetween,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    aug_kernel(f, ModifiedBetween, subtape, Val(TapeType), args2...; ndrange, workgroupsize)
+
+    # TODO the fact that ctxTy is type unstable means this is all type unstable.
+    # Since custom rules require a fixed return type, explicitly cast to Any, rather
+    # than returning an AugmentedReturn{Nothing, Nothing, T} where T.
+    return _augmented_return(kernel, subtape, arg_refs, TapeType)
+end
+
+function EnzymeRules.reverse(
+        config::Config,
+        func::Const{<:Kernel},
+        ::Type{<:EnzymeCore.Annotation},
+        tape,
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
+    subtape, arg_refs, tape_type = tape
+
+    args2 = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            MixedDuplicated(args[i].val, arg_refs[i])
+        else
+            args[i]
+        end
+    end
+
+    kernel = func.val
+    f = kernel.f
+
+    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
+
+    rev_kernel = _create_rev_kernel(kernel)
+    rev_kernel(
+        f,
+        ModifiedBetween,
+        subtape,
+        Val(tape_type),
+        args2...;
+        ndrange,
+        workgroupsize,
+    )
+    res = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            arg_refs[i][]
+        else
+            nothing
+        end
+    end
+    # Reverse synchronization right after the kernel launch
+    synchronize(backend(kernel))
+    return res
+end
+
+# Synchronize rules. NOTE(review): also defined in EnzymeExt.jl (untyped `config`); dedupe?
+# TODO: Right now we do the synchronization as part of the kernel launch in the augmented primal
+# and reverse rules. This is not ideal, as we would want to launch the kernel in the reverse
+# synchronize rule and then synchronize where the launch was. However, with the current
+# kernel semantics this ensures correctness for now.
+function EnzymeRules.augmented_primal(
+        config::Config,
+        func::Const{typeof(synchronize)},
+        ::Type{Const{Nothing}},
+        backend::T,
+    ) where {T <: EnzymeCore.Annotation}
+    synchronize(backend.val)
+    return AugmentedReturn(nothing, nothing, nothing)
+end
+
+function EnzymeRules.reverse(
+        config::Config,
+        func::Const{typeof(synchronize)},
+        ::Type{Const{Nothing}},
+        tape,
+        backend,
+    )
+    # noop for now
+    return (nothing,)
+end
diff --git a/ext/EnzymeCore08Ext.jl b/ext/EnzymeCore08Ext.jl
new file mode 100644
index 000000000..8bdd8ee42
--- /dev/null
+++ b/ext/EnzymeCore08Ext.jl
@@ -0,0 +1,322 @@
+# Enzyme rules for KernelAbstractions kernels written against the EnzymeCore 0.8 API.
+# EnzymeExt.jl includes this file when `EnzymeCore.set_runtime_activity` is defined;
+# every rule forwards the caller's runtime-activity setting via that function.
+
+# https://github.com/EnzymeAD/Enzyme.jl/issues/1516
+# On the CPU `autodiff_deferred` can deadlock.
+# Hence a specialized CPU version
+function cpu_fwd(ctx, config, f, args...)
+    EnzymeCore.autodiff(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...)
+    return nothing
+end
+
+function gpu_fwd(ctx, config, f, args...)
+    EnzymeCore.autodiff_deferred(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...)
+    return nothing
+end
+
+function EnzymeRules.forward(
+        config,
+        func::Const{<:Kernel{CPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
+    kernel = func.val
+    f = kernel.f
+    fwd_kernel = similar(kernel, cpu_fwd)
+
+    fwd_kernel(config, f, args...; ndrange, workgroupsize)
+end
+
+function EnzymeRules.forward(
+        config,
+        func::Const{<:Kernel{<:GPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
+    kernel = func.val
+    f = kernel.f
+    fwd_kernel = similar(kernel, gpu_fwd)
+
+    fwd_kernel(config, f, args...; ndrange, workgroupsize)
+end
+
+_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) =
+    mkcontext(kernel, first(blocks(iterspace)), ndrange, iterspace, dynamic)
+_enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) =
+    mkcontext(kernel, ndrange, iterspace)
+
+_augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) =
+    AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}(
+    nothing,
+    nothing,
+    (subtape, arg_refs, tape_type),
+)
+_augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) =
+    AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type))
+
+function _create_tape_kernel(
+        kernel::Kernel{CPU},
+        Mode,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    TapeType = EnzymeCore.tape_type(
+        Mode,
+        FT,
+        Const{Nothing},
+        Const{ctxTy},
+        map(Core.Typeof, args2)...,
+    )
+    subtape = Array{TapeType}(undef, size(blocks(iterspace)))
+    aug_kernel = similar(kernel, cpu_aug_fwd)
+    return TapeType, subtape, aug_kernel
+end
+
+function _create_tape_kernel(
+        kernel::Kernel{<:GPU},
+        Mode,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    # For peeking at the TapeType we need to first construct a correct compilation job
+    # and that requires the device-side representation of the arguments.
+    # So we convert the arguments here, this is a bit wasteful since the `aug_kernel` call
+    # will later do the same.
+    dev_args2 = ((argconvert(kernel, a) for a in args2)...,)
+    dev_TT = map(Core.Typeof, dev_args2)
+
+    job =
+        EnzymeCore.compiler_job_from_backend(backend(kernel), typeof(() -> return), Tuple{})
+    TapeType = EnzymeCore.tape_type(
+        job,
+        Mode,
+        FT,
+        Const{Nothing},
+        Const{ctxTy},
+        dev_TT...,
+    )
+
+    # Allocate per thread
+    subtape = allocate(backend(kernel), TapeType, prod(ndrange))
+
+    aug_kernel = similar(kernel, gpu_aug_fwd)
+    return TapeType, subtape, aug_kernel
+end
+
+_create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev)
+_create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev)
+
+function cpu_aug_fwd(
+        ctx,
+        f::FT,
+        mode::Mode,
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {Mode, FT, TapeType}
+    # A2 = Const{Nothing} -- since f->Nothing
+    forward, _ = EnzymeCore.autodiff_thunk(
+        mode,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+
+    # On the CPU: F is a per block function
+    # On the CPU: subtape is an Array{TapeType} with one slot per block
+    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
+    subtape[I] = forward(Const(f), Const(ctx), args...)[1]
+    return nothing
+end
+
+function cpu_rev(
+        ctx,
+        f::FT,
+        mode::Mode,
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {Mode, FT, TapeType}
+    _, reverse = EnzymeCore.autodiff_thunk(
+        mode,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
+    tp = subtape[I]
+    reverse(Const(f), Const(ctx), args..., tp)
+    return nothing
+end
+
+# GPU support
+function gpu_aug_fwd(
+        ctx,
+        f::FT,
+        mode::Mode,
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {Mode, FT, TapeType}
+    # A2 = Const{Nothing} -- since f->Nothing
+    forward, _ = EnzymeCore.autodiff_deferred_thunk(
+        mode,
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+
+    # On the GPU: F is a per thread function
+    # On the GPU: subtape holds prod(ndrange) tapes, indexed by global linear index
+    if __validindex(ctx)
+        I = __index_Global_Linear(ctx)
+        subtape[I] = forward(Const(f), Const(ctx), args...)[1]
+    end
+    return nothing
+end
+
+function gpu_rev(
+        ctx,
+        f::FT,
+        mode::Mode,
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {Mode, FT, TapeType}
+    # XXX: TapeType and A2 as args to autodiff_deferred_thunk
+    _, reverse = EnzymeCore.autodiff_deferred_thunk(
+        mode,
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        Const{Core.Typeof(ctx)},
+        map(Core.Typeof, args)...,
+    )
+    if __validindex(ctx)
+        I = __index_Global_Linear(ctx)
+        tp = subtape[I]
+        reverse(Const(f), Const(ctx), args..., tp)
+    end
+    return nothing
+end
+
+function EnzymeRules.augmented_primal(
+        config::RevConfig,
+        func::Const{<:Kernel},
+        ::Type{Const{Nothing}},
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
+    kernel = func.val
+    f = kernel.f
+
+    ndrange, workgroupsize, iterspace, dynamic =
+        launch_config(kernel, ndrange, workgroupsize)
+    ctx = _enzyme_mkcontext(kernel, ndrange, iterspace, dynamic)
+    ctxTy = Core.Typeof(ctx) # CompilerMetadata{ndrange(kernel), Core.Typeof(dynamic)}
+    # TODO autodiff_deferred on the func.val
+    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
+
+    FT = Const{Core.Typeof(f)}
+
+    arg_refs = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            if func.val isa Kernel{<:GPU}
+                error("Active kernel arguments not supported on GPU")
+            else
+                Ref(EnzymeCore.make_zero(args[i].val))
+            end
+        else
+            nothing
+        end
+    end
+    args2 = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            MixedDuplicated(args[i].val, arg_refs[i])
+        else
+            args[i]
+        end
+    end
+    Mode = EnzymeCore.set_runtime_activity(ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), config)
+    TapeType, subtape, aug_kernel = _create_tape_kernel(
+        kernel,
+        Mode,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
+    aug_kernel(f, Mode, subtape, Val(TapeType), args2...; ndrange, workgroupsize)
+
+    # TODO the fact that ctxTy is type unstable means this is all type unstable.
+    # Since custom rules require a fixed return type, explicitly cast to Any, rather
+    # than returning an AugmentedReturn{Nothing, Nothing, T} where T.
+    return _augmented_return(kernel, subtape, arg_refs, TapeType)
+end
+
+function EnzymeRules.reverse(
+        config::RevConfig,
+        func::Const{<:Kernel},
+        ::Type{<:EnzymeCore.Annotation},
+        tape,
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
+    subtape, arg_refs, tape_type = tape
+
+    args2 = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            MixedDuplicated(args[i].val, arg_refs[i])
+        else
+            args[i]
+        end
+    end
+
+    kernel = func.val
+    f = kernel.f
+
+    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
+    Mode = EnzymeCore.set_runtime_activity(ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), config)
+    rev_kernel = _create_rev_kernel(kernel)
+    rev_kernel(
+        f,
+        Mode,
+        subtape,
+        Val(tape_type),
+        args2...;
+        ndrange,
+        workgroupsize,
+    )
+    res = ntuple(Val(N)) do i
+        Base.@_inline_meta
+        if args[i] isa Active
+            arg_refs[i][]
+        else
+            nothing
+        end
+    end
+    # Reverse synchronization right after the kernel launch
+    synchronize(backend(kernel))
+    return res
+end
diff --git a/ext/EnzymeExt.jl b/ext/EnzymeExt.jl
index 40721d144..1fda4e051 100644
--- a/ext/EnzymeExt.jl
+++ b/ext/EnzymeExt.jl
@@ -41,323 +41,10 @@ end
 
 EnzymeRules.inactive(::Type{StaticSize}, x...) = nothing
 
-# https://github.com/EnzymeAD/Enzyme.jl/issues/1516
-# On the CPU `autodiff_deferred` can deadlock.
-# Hence a specialized CPU version
-function cpu_fwd(ctx, config, f, args...)
-    EnzymeCore.autodiff(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...)
-    return nothing
-end
-
-function gpu_fwd(ctx, config, f, args...)
-    EnzymeCore.autodiff_deferred(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...)
-    return nothing
-end
-
-function EnzymeRules.forward(
-        config,
-        func::Const{<:Kernel{CPU}},
-        ::Type{Const{Nothing}},
-        args...;
-        ndrange = nothing,
-        workgroupsize = nothing,
-    )
-    kernel = func.val
-    f = kernel.f
-    fwd_kernel = similar(kernel, cpu_fwd)
-
-    fwd_kernel(config, f, args...; ndrange, workgroupsize)
-end
-
-function EnzymeRules.forward(
-        config,
-        func::Const{<:Kernel{<:GPU}},
-        ::Type{Const{Nothing}},
-        args...;
-        ndrange = nothing,
-        workgroupsize = nothing,
-    )
-    kernel = func.val
-    f = kernel.f
-    fwd_kernel = similar(kernel, gpu_fwd)
-
-    fwd_kernel(config, f, args...; ndrange, workgroupsize)
-end
-
-_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) =
-    mkcontext(kernel, first(blocks(iterspace)), ndrange, iterspace, dynamic)
-_enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) =
-    mkcontext(kernel, ndrange, iterspace)
-
-_augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) =
-    AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}(
-    nothing,
-    nothing,
-    (subtape, arg_refs, tape_type),
-)
-_augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) =
-    AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type))
-
-function _create_tape_kernel(
-        kernel::Kernel{CPU},
-        Mode,
-        FT,
-        ctxTy,
-        ndrange,
-        iterspace,
-        args2...,
-    )
-    TapeType = EnzymeCore.tape_type(
-        Mode,
-        FT,
-        Const{Nothing},
-        Const{ctxTy},
-        map(Core.Typeof, args2)...,
-    )
-    subtape = Array{TapeType}(undef, size(blocks(iterspace)))
-    aug_kernel = similar(kernel, cpu_aug_fwd)
-    return TapeType, subtape, aug_kernel
-end
-
-function _create_tape_kernel(
-        kernel::Kernel{<:GPU},
-        Mode,
-        FT,
-        ctxTy,
-        ndrange,
-        iterspace,
-        args2...,
-    )
-    # For peeking at the TapeType we need to first construct a correct compilation job
-    # this requires the use of the device side representation of arguments.
-    # So we convert the arguments here, this is a bit wasteful since the `aug_kernel` call
-    # will later do the same.
-    dev_args2 = ((argconvert(kernel, a) for a in args2)...,)
-    dev_TT = map(Core.Typeof, dev_args2)
-
-    job =
-        EnzymeCore.compiler_job_from_backend(backend(kernel), typeof(() -> return), Tuple{})
-    TapeType = EnzymeCore.tape_type(
-        job,
-        Mode,
-        FT,
-        Const{Nothing},
-        Const{ctxTy},
-        dev_TT...,
-    )
-
-    # Allocate per thread
-    subtape = allocate(backend(kernel), TapeType, prod(ndrange))
-
-    aug_kernel = similar(kernel, gpu_aug_fwd)
-    return TapeType, subtape, aug_kernel
-end
-
-_create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev)
-_create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev)
-
-function cpu_aug_fwd(
-        ctx,
-        f::FT,
-        mode::Mode,
-        subtape,
-        ::Val{TapeType},
-        args...,
-    ) where {Mode, FT, TapeType}
-    # A2 = Const{Nothing} -- since f->Nothing
-    forward, _ = EnzymeCore.autodiff_thunk(
-        mode,
-        Const{Core.Typeof(f)},
-        Const{Nothing},
-        Const{Core.Typeof(ctx)},
-        map(Core.Typeof, args)...,
-    )
-
-    # On the CPU: F is a per block function
-    # On the CPU: subtape::Vector{Vector}
-    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
-    subtape[I] = forward(Const(f), Const(ctx), args...)[1]
-    return nothing
-end
-
-function cpu_rev(
-        ctx,
-        f::FT,
-        mode::Mode,
-        subtape,
-        ::Val{TapeType},
-        args...,
-    ) where {Mode, FT, TapeType}
-    _, reverse = EnzymeCore.autodiff_thunk(
-        mode,
-        Const{Core.Typeof(f)},
-        Const{Nothing},
-        Const{Core.Typeof(ctx)},
-        map(Core.Typeof, args)...,
-    )
-    I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=#
-    tp = subtape[I]
-    reverse(Const(f), Const(ctx), args..., tp)
-    return nothing
-end
-
-# GPU support
-function gpu_aug_fwd(
-        ctx,
-        f::FT,
-        mode::Mode,
-        subtape,
-        ::Val{TapeType},
-        args...,
-    ) where {Mode, FT, TapeType}
-    # A2 = Const{Nothing} -- since f->Nothing
-    forward, _ = EnzymeCore.autodiff_deferred_thunk(
-        mode,
-        TapeType,
-        Const{Core.Typeof(f)},
-        Const{Nothing},
-        Const{Core.Typeof(ctx)},
-        map(Core.Typeof, args)...,
-    )
-
-    # On the GPU: F is a per thread function
-    # On the GPU: subtape::Vector
-    if __validindex(ctx)
-        I = __index_Global_Linear(ctx)
-        subtape[I] = forward(Const(f), Const(ctx), args...)[1]
-    end
-    return nothing
-end
-
-function gpu_rev(
-        ctx,
-        f::FT,
-        mode::Mode,
-        subtape,
-        ::Val{TapeType},
-        args...,
-    ) where {Mode, FT, TapeType}
-    # XXX: TapeType and A2 as args to autodiff_deferred_thunk
-    _, reverse = EnzymeCore.autodiff_deferred_thunk(
-        mode,
-        TapeType,
-        Const{Core.Typeof(f)},
-        Const{Nothing},
-        Const{Core.Typeof(ctx)},
-        map(Core.Typeof, args)...,
-    )
-    if __validindex(ctx)
-        I = __index_Global_Linear(ctx)
-        tp = subtape[I]
-        reverse(Const(f), Const(ctx), args..., tp)
-    end
-    return nothing
-end
-
-function EnzymeRules.augmented_primal(
-        config::RevConfig,
-        func::Const{<:Kernel},
-        ::Type{Const{Nothing}},
-        args::Vararg{Any, N};
-        ndrange = nothing,
-        workgroupsize = nothing,
-    ) where {N}
-    kernel = func.val
-    f = kernel.f
-
-    ndrange, workgroupsize, iterspace, dynamic =
-        launch_config(kernel, ndrange, workgroupsize)
-    ctx = _enzyme_mkcontext(kernel, ndrange, iterspace, dynamic)
-    ctxTy = Core.Typeof(ctx) # CompilerMetadata{ndrange(kernel), Core.Typeof(dynamic)}
-    # TODO autodiff_deferred on the func.val
-    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
-
-    FT = Const{Core.Typeof(f)}
-
-    arg_refs = ntuple(Val(N)) do i
-        Base.@_inline_meta
-        if args[i] isa Active
-            if func.val isa Kernel{<:GPU}
-                error("Active kernel arguments not supported on GPU")
-            else
-                Ref(EnzymeCore.make_zero(args[i].val))
-            end
-        else
-            nothing
-        end
-    end
-    args2 = ntuple(Val(N)) do i
-        Base.@_inline_meta
-        if args[i] isa Active
-            MixedDuplicated(args[i].val, arg_refs[i])
-        else
-            args[i]
-        end
-    end
-    Mode = EnzymeCore.set_runtime_activity(ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), config)
-    TapeType, subtape, aug_kernel = _create_tape_kernel(
-        kernel,
-        Mode,
-        FT,
-        ctxTy,
-        ndrange,
-        iterspace,
-        args2...,
-    )
-    aug_kernel(f, Mode, subtape, Val(TapeType), args2...; ndrange, workgroupsize)
-
-    # TODO the fact that ctxTy is type unstable means this is all type unstable.
-    # Since custom rules require a fixed return type, explicitly cast to Any, rather
-    # than returning a AugmentedReturn{Nothing, Nothing, T} where T.
-    return _augmented_return(kernel, subtape, arg_refs, TapeType)
-end
-
-function EnzymeRules.reverse(
-        config::RevConfig,
-        func::Const{<:Kernel},
-        ::Type{<:EnzymeCore.Annotation},
-        tape,
-        args::Vararg{Any, N};
-        ndrange = nothing,
-        workgroupsize = nothing,
-    ) where {N}
-    subtape, arg_refs, tape_type = tape
-
-    args2 = ntuple(Val(N)) do i
-        Base.@_inline_meta
-        if args[i] isa Active
-            MixedDuplicated(args[i].val, arg_refs[i])
-        else
-            args[i]
-        end
-    end
-
-    kernel = func.val
-    f = kernel.f
-
-    ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...))
-    Mode = EnzymeCore.set_runtime_activity(ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), config)
-    rev_kernel = _create_rev_kernel(kernel)
-    rev_kernel(
-        f,
-        Mode,
-        subtape,
-        Val(tape_type),
-        args2...;
-        ndrange,
-        workgroupsize,
-    )
-    res = ntuple(Val(N)) do i
-        Base.@_inline_meta
-        if args[i] isa Active
-            arg_refs[i][]
-        else
-            nothing
-        end
-    end
-    # Reverse synchronization right after the kernel launch
-    synchronize(backend(kernel))
-    return res
+@static if isdefined(EnzymeCore, :set_runtime_activity)
+    include("EnzymeCore08Ext.jl")
+else
+    include("EnzymeCore07Ext.jl")
 end
 
 # Synchronize rules
@@ -366,7 +53,7 @@ end
 # synchronize rule and then synchronize where the launch was. However, with the current
 # kernel semantics this ensures correctness for now.
 function EnzymeRules.augmented_primal(
-        config::RevConfig,
+        config,
         func::Const{typeof(synchronize)},
         ::Type{Const{Nothing}},
         backend::T,
@@ -376,7 +63,7 @@ function EnzymeRules.augmented_primal(
 end
 
 function EnzymeRules.reverse(
-        config::RevConfig,
+        config,
         func::Const{typeof(synchronize)},
         ::Type{Const{Nothing}},
         tape,
diff --git a/test/runtests.jl b/test/runtests.jl
index 0599143f2..f992afad2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -72,8 +72,9 @@ struct NewBackend <: KernelAbstractions.GPU end
     @test_throws MethodError kernel()
 end
 
-@static if VERSION >= v"1.10.0"
-    include("extensions/enzyme.jl")
+
+include("extensions/enzyme.jl")
+@static if VERSION >= v"1.7.0"
     @testset "Enzyme" begin
         enzyme_testsuite(CPU, Array)
     end