Skip to content

Commit ee76c60

Browse files
committed
Run Runic after explicit return rule addition
1 parent f0ea5b2 commit ee76c60

27 files changed

+88
-63
lines changed

docs/make.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ function main()
4444
push_preview = true,
4545
)
4646
end
47+
return
4748
end
4849

4950
isinteractive() || main()

examples/histogram.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ function histogram!(histogram_output, input)
6262
backend = get_backend(histogram_output)
6363
# Need static block size
6464
kernel! = histogram_kernel!(backend, (256,))
65-
kernel!(histogram_output, input, ndrange = size(input))
65+
return kernel!(histogram_output, input, ndrange = size(input))
6666
end
6767

6868
function move(backend, input)
6969
# TODO replace with adapt(backend, input)
7070
out = KernelAbstractions.allocate(backend, eltype(input), size(input))
71-
KernelAbstractions.copyto!(backend, out, input)
71+
return KernelAbstractions.copyto!(backend, out, input)
7272
end
7373

7474
@testset "histogram tests" begin

examples/matmul.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ function matmul!(output, a, b)
2222
end
2323
backend = KernelAbstractions.get_backend(a)
2424
kernel! = matmul_kernel!(backend)
25-
kernel!(output, a, b, ndrange = size(output))
25+
return kernel!(output, a, b, ndrange = size(output))
2626
end
2727

2828
a = rand!(allocate(backend, Float32, 256, 123))

examples/memcopy.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ function mycopy!(A, B)
1212
@assert get_backend(B) == backend
1313

1414
kernel = copy_kernel!(backend)
15-
kernel(A, B, ndrange = length(A))
15+
return kernel(A, B, ndrange = length(A))
1616
end
1717

1818
A = KernelAbstractions.zeros(backend, Float64, 128, 128)

examples/memcopy_static.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ function mycopy_static!(A, B)
1212
@assert get_backend(B) == backend
1313

1414
kernel = copy_kernel!(backend, 32, size(A)) # if size(A) varies this will cause recompilation
15-
kernel(A, B, ndrange = size(A))
15+
return kernel(A, B, ndrange = size(A))
1616
end
1717

1818
A = KernelAbstractions.zeros(backend, Float64, 128, 128)

examples/mpi.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@ function cooperative_test!(req)
99
done, _ = MPI.Test(req, MPI.Status)
1010
yield()
1111
end
12+
return nothing
1213
end
1314

1415
function cooperative_wait(task::Task)
1516
while !Base.istaskdone(task)
1617
MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
1718
yield()
1819
end
19-
wait(task)
20+
return wait(task)
2021
end
2122

2223
function exchange!(h_send_buf, d_recv_buf, h_recv_buf, src_rank, dst_rank, comm)
@@ -68,6 +69,7 @@ function main(backend)
6869
cooperative_wait(send_task)
6970

7071
@test all(d_recv_buf .== src_rank)
72+
return
7173
end
7274

7375
main(backend)

examples/naive_transpose.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
1717
@assert get_backend(b) == backend
1818
groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
1919
kernel! = naive_transpose_kernel!(backend, groupsize)
20-
kernel!(a, b, ndrange = size(a))
20+
return kernel!(a, b, ndrange = size(a))
2121
end
2222

2323
# resolution of grid will be res*res

ext/EnzymeExt.jl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ function EnzymeRules.forward(
6565
f = kernel.f
6666
fwd_kernel = similar(kernel, cpu_fwd)
6767

68-
fwd_kernel(f, args...; ndrange, workgroupsize)
68+
return fwd_kernel(f, args...; ndrange, workgroupsize)
6969
end
7070

7171
function EnzymeRules.forward(
@@ -79,7 +79,7 @@ function EnzymeRules.forward(
7979
f = kernel.f
8080
fwd_kernel = similar(kernel, gpu_fwd)
8181

82-
fwd_kernel(f, args...; ndrange, workgroupsize)
82+
return fwd_kernel(f, args...; ndrange, workgroupsize)
8383
end
8484

8585
_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) =
@@ -278,18 +278,18 @@ function EnzymeRules.augmented_primal(
278278
if func.val isa Kernel{<:GPU}
279279
error("Active kernel arguments not supported on GPU")
280280
else
281-
Ref(EnzymeCore.make_zero(args[i].val))
281+
return Ref(EnzymeCore.make_zero(args[i].val))
282282
end
283283
else
284-
nothing
284+
return nothing
285285
end
286286
end
287287
args2 = ntuple(Val(N)) do i
288288
Base.@_inline_meta
289289
if args[i] isa Active
290-
MixedDuplicated(args[i].val, arg_refs[i])
290+
return MixedDuplicated(args[i].val, arg_refs[i])
291291
else
292-
args[i]
292+
return args[i]
293293
end
294294
end
295295

@@ -324,9 +324,9 @@ function EnzymeRules.reverse(
324324
args2 = ntuple(Val(N)) do i
325325
Base.@_inline_meta
326326
if args[i] isa Active
327-
MixedDuplicated(args[i].val, arg_refs[i])
327+
return MixedDuplicated(args[i].val, arg_refs[i])
328328
else
329-
args[i]
329+
return args[i]
330330
end
331331
end
332332

@@ -348,9 +348,9 @@ function EnzymeRules.reverse(
348348
res = ntuple(Val(N)) do i
349349
Base.@_inline_meta
350350
if args[i] isa Active
351-
arg_refs[i][]
351+
return arg_refs[i][]
352352
else
353-
nothing
353+
return nothing
354354
end
355355
end
356356
# Reverse synchronization right after the kernel launch

src/KernelAbstractions.jl

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ synchronize(backend)
5151
```
5252
"""
5353
macro kernel(expr)
54-
__kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
54+
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
5555
end
5656

5757
"""
@@ -69,7 +69,7 @@ This allows for two different configurations:
6969
"""
7070
macro kernel(ex...)
7171
if length(ex) == 1
72-
__kernel(ex[1], true, false)
72+
return __kernel(ex[1], true, false)
7373
else
7474
generate_cpu = true
7575
force_inbounds = false
@@ -89,7 +89,7 @@ macro kernel(ex...)
8989
)
9090
end
9191
end
92-
__kernel(ex[end], generate_cpu, force_inbounds)
92+
return __kernel(ex[end], generate_cpu, force_inbounds)
9393
end
9494
end
9595

@@ -167,7 +167,7 @@ a tuple corresponding to kernel configuration. In order to get
167167
the total size you can use `prod(@groupsize())`.
168168
"""
169169
macro groupsize()
170-
quote
170+
return quote
171171
$groupsize($(esc(:__ctx__)))
172172
end
173173
end
@@ -179,7 +179,7 @@ Query the ndrange on the backend. This function returns
179179
a tuple corresponding to kernel configuration.
180180
"""
181181
macro ndrange()
182-
quote
182+
return quote
183183
$size($ndrange($(esc(:__ctx__))))
184184
end
185185
end
@@ -193,7 +193,7 @@ macro localmem(T, dims)
193193
# Stay in sync with CUDAnative
194194
id = gensym("static_shmem")
195195

196-
quote
196+
return quote
197197
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
198198
end
199199
end
@@ -214,7 +214,7 @@ macro private(T, dims)
214214
if dims isa Integer
215215
dims = (dims,)
216216
end
217-
quote
217+
return quote
218218
$Scratchpad($(esc(:__ctx__)), $(esc(T)), Val($(esc(dims))))
219219
end
220220
end
@@ -226,7 +226,7 @@ Creates a private local of `mem` per item in the workgroup. This can be safely u
226226
across [`@synchronize`](@ref) statements.
227227
"""
228228
macro private(expr)
229-
esc(expr)
229+
return esc(expr)
230230
end
231231

232232
"""
@@ -236,7 +236,7 @@ end
236236
that span workitems, or are reused across `@synchronize` statements.
237237
"""
238238
macro uniform(value)
239-
esc(value)
239+
return esc(value)
240240
end
241241

242242
"""
@@ -247,7 +247,7 @@ from each thread in the workgroup are visible in from all other threads in the
247247
workgroup.
248248
"""
249249
macro synchronize()
250-
quote
250+
return quote
251251
$__synchronize()
252252
end
253253
end
@@ -264,7 +264,7 @@ workgroup. `cond` is not allowed to have any visible sideffects.
264264
- `CPU`: This synchronization will always occur.
265265
"""
266266
macro synchronize(cond)
267-
quote
267+
return quote
268268
$(esc(cond)) && $__synchronize()
269269
end
270270
end
@@ -289,7 +289,7 @@ end
289289
```
290290
"""
291291
macro context()
292-
esc(:(__ctx__))
292+
return esc(:(__ctx__))
293293
end
294294

295295
"""
@@ -329,7 +329,7 @@ macro print(items...)
329329
end
330330
end
331331

332-
quote
332+
return quote
333333
$__print($(map(esc, args)...))
334334
end
335335
end
@@ -385,7 +385,7 @@ macro index(locale, args...)
385385
end
386386

387387
index_function = Symbol(:__index_, locale, :_, indexkind)
388-
Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
388+
return Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
389389
end
390390

391391
###
@@ -591,7 +591,7 @@ struct Kernel{Backend, WorkgroupSize <: _Size, NDRange <: _Size, Fun}
591591
end
592592

593593
function Base.similar(kernel::Kernel{D, WS, ND}, f::F) where {D, WS, ND, F}
594-
Kernel{D, WS, ND, F}(kernel.backend, f)
594+
return Kernel{D, WS, ND, F}(kernel.backend, f)
595595
end
596596

597597
workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
@@ -701,7 +701,7 @@ end
701701
push!(args, item)
702702
end
703703

704-
quote
704+
return quote
705705
print($(args...))
706706
end
707707
end

src/cpu.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
4343
return nothing
4444
end
4545

46-
__run(obj, ndrange, iterspace, args, dynamic, obj.backend.static)
46+
return __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static)
4747
end
4848

4949
const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size
@@ -162,15 +162,15 @@ end
162162

163163
@inline function __index_Global_Linear(ctx, idx::CartesianIndex)
164164
I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
165-
@inbounds LinearIndices(__ndrange(ctx))[I]
165+
return @inbounds LinearIndices(__ndrange(ctx))[I]
166166
end
167167

168168
@inline function __index_Local_Cartesian(_, idx::CartesianIndex)
169169
return idx
170170
end
171171

172172
@inline function __index_Group_Cartesian(ctx, ::CartesianIndex)
173-
__groupindex(ctx)
173+
return __groupindex(ctx)
174174
end
175175

176176
@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex)
@@ -191,7 +191,7 @@ end
191191
# CPU implementation of shared memory
192192
###
193193
@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims}
194-
MArray{__size(Dims), T}(undef)
194+
return MArray{__size(Dims), T}(undef)
195195
end
196196

197197
###
@@ -212,7 +212,7 @@ end
212212
# https://github.com/JuliaLang/julia/issues/39308
213213
@inline function aview(A, I::Vararg{Any, N}) where {N}
214214
J = Base.to_indices(A, I)
215-
Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
215+
return Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
216216
end
217217

218218
@inline function Base.getindex(A::ScratchArray{N}, idx) where {N}

0 commit comments

Comments (0)