
Commit 83d0fd8

HandleCache fixes & GPUArrays caching allocator interface implementation (#710)
- Fix rocFFT HandleCache leak: sometimes a handle would not be freed because the wrong key was used.
- Rework HandleCache.
- Remove non-unicode conv aliases (NNlib uses unicode).
- Do not use `dlpath` during ROCm discovery.
1 parent 2e5261a commit 83d0fd8

File tree

17 files changed (+188, -415 lines)


Project.toml

Lines changed: 7 additions & 0 deletions
@@ -33,12 +33,19 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
+[weakdeps]
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+
+[extensions]
+AMDGPUChainRulesCoreExt = "ChainRulesCore"
+
 [compat]
 AbstractFFTs = "1.0"
 AcceleratedKernels = "0.2"
 Adapt = "4"
 Atomix = "0.1, 1"
 CEnum = "0.4, 0.5"
+ChainRulesCore = "1"
 ExprTools = "0.1"
 GPUArrays = "11.1"
 GPUCompiler = "0.27, 1.0"
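
Note: the new `[weakdeps]`/`[extensions]` entries make the ChainRulesCore glue a package extension, so it is compiled and loaded only when ChainRulesCore is also present in the environment. A minimal REPL sketch (assuming Julia 1.9+ extension support; the results shown as comments are illustrative):

    using AMDGPU

    # Without ChainRulesCore, the extension module does not exist yet.
    Base.get_extension(AMDGPU, :AMDGPUChainRulesCoreExt) === nothing  # true

    using ChainRulesCore

    # Loading the weak dependency triggers the extension.
    Base.get_extension(AMDGPU, :AMDGPUChainRulesCoreExt) isa Module   # true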

docs/make.jl

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ function main()
 "Exceptions" => "exceptions.md",
 "Profiling" => "profiling.md",
 "Memory" => "memory.md",
-"Caching Memory Allocator" => "caching_allocator.md",
 "Host-Call" => "hostcall.md",
 "Printing" => "printing.md",
 "Logging" => "logging.md",

docs/src/caching_allocator.md

Lines changed: 0 additions & 76 deletions
This file was deleted.

ext/AMDGPUChainRulesCoreExt.jl

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+module AMDGPUChainRulesCoreExt
+
+using AMDGPU: ROCArray
+
+import ChainRulesCore
+
+ChainRulesCore.is_inplaceable_destination(::ROCArray) = true
+
+end
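
Note: `ChainRulesCore.is_inplaceable_destination` is the hook ChainRules uses to decide whether an `InplaceableThunk` may accumulate a gradient into an existing array instead of allocating a fresh one; declaring it `true` opts `ROCArray` into that in-place path. A hedged sketch of the effect (assumes a working ROCm device):

    using AMDGPU, ChainRulesCore

    x = AMDGPU.rand(Float32, 16)

    # With the extension loaded, ChainRules-based AD may accumulate into
    # buffers like `x` in place.
    ChainRulesCore.is_inplaceable_destination(x)  # true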

src/AMDGPU.jl

Lines changed: 8 additions & 15 deletions
@@ -36,15 +36,8 @@ struct LockedObject{T}
     lock::ReentrantLock
     payload::T
 end
-
 LockedObject(payload) = LockedObject(ReentrantLock(), payload)
 
-function Base.lock(f, x::LockedObject)
-    Base.@lock x.lock begin
-        return f(x.payload)
-    end
-end
-
 # TODO simplify
 struct KernelState
     # Exception reporting buffers.
@@ -114,7 +107,6 @@ include("tls.jl")
 include("highlevel.jl")
 include("reflection.jl")
 include("array.jl")
-include("caching_allocator.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("exception_handler.jl")
@@ -139,21 +131,21 @@ include("random.jl")
 # Enable hardware FP atomics for +/- ops.
 const ROCIndexableRef{Indexable <: ROCDeviceArray} = Atomix.IndexableRef{Indexable}
 
-function Atomix.modify!(
-    ref::ROCIndexableRef, op::OP, x, ord,
-) where OP <: Union{typeof(+), typeof(-)}
+function Atomix.modify!(ref::ROCIndexableRef, op::OP, x, ord) where {
+    OP <: Union{typeof(+), typeof(-)}
+}
     x = Atomix.asstorable(ref, x)
     ptr = Atomix.pointer(ref)
     root = Atomix.gcroot(ref)
-    GC.@preserve root begin
-        UnsafeAtomics.modify!(ptr, op, x, ord, Val(:agent))
-    end
+    GC.@preserve root UnsafeAtomics.modify!(ptr, op, x, ord, Val(:agent))
 end
 
 include("ROCKernels.jl")
 import .ROCKernels: ROCBackend
 export ROCBackend
 
+# include("cache_allocator.jl")
+
 function __init__()
     # Used to shutdown hostcalls if any is running.
     atexit(() -> begin Runtime.RT_EXITING[] = true end)
@@ -174,7 +166,8 @@ function __init__()
     end
 
     if !isempty(libhsaruntime)
-        HSA.init() == HSA.STATUS_SUCCESS ?
+        status = HSA.init()
+        status == HSA.STATUS_SUCCESS ?
             atexit(() -> HSA.shut_down()) :
             @warn "HSA initialization failed with code $status"
     else
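
Note: the reworked `Atomix.modify!` method above routes floating-point `+`/`-` atomics on device arrays through `UnsafeAtomics.modify!` with `Val(:agent)` (device-scope) ordering, i.e. hardware FP atomics. A hedged sketch of code that exercises this path; the kernel name and sizes are illustrative, not part of this commit:

    using AMDGPU, KernelAbstractions, Atomix

    @kernel function atomic_sum!(acc, xs)
        i = @index(Global)
        # Float32 `+=` on a device array dispatches to the modify! method above.
        @inbounds Atomix.@atomic acc[1] += xs[i]
    end

    xs = AMDGPU.rand(Float32, 1024)
    acc = AMDGPU.zeros(Float32, 1)
    atomic_sum!(ROCBackend())(acc, xs; ndrange = length(xs))
    KernelAbstractions.synchronize(ROCBackend())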

src/array.jl

Lines changed: 20 additions & 20 deletions
@@ -7,24 +7,22 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N}
         ::UndefInitializer, dims::Dims{N},
     ) where {T, N, B <: Mem.AbstractAMDBuffer}
         @assert isbitstype(T) "ROCArray only supports bits types"
-
-        alloc_name = cache_alloc_name()
-        # Do not use caching allocator if it is not set or
-        # the buffer is not a device memory.
-        x = if !(B <: Mem.HIPBuffer) || alloc_name == :none
-            data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-            x = new{T, N, B}(data, dims, 0)
-        else
-            alloc = cache_allocator!(alloc_name)
-            tmp = alloc!(alloc, B, T, dims)
-            if tmp ≡ nothing
-                data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-                tmp = new{T, N, B}(data, dims, 0)
-                add_busy!(alloc, tmp)
-            end
-            tmp::ROCArray{T, N, B}
+        function _alloc_f()
+            sz::Int64 = prod(dims) * sizeof(T)
+            @debug "Allocate `T=$T`, `dims=$dims`: $(Base.format_bytes(sz))"
+            data = DataRef(pool_free, pool_alloc(B, sz))
+            finalizer(unsafe_free!, new{T, N, B}(data, dims, 0))
         end
-        return finalizer(unsafe_free!, x)
+        return _alloc_f()
+
+        # name = GPUArrays.CacheAllocatorName[]
+        # # Do not use caching allocator if it is not set or
+        # # the buffer is not a device memory.
+        # return if !(B <: Mem.HIPBuffer) || name == :none
+        #     _alloc_f()
+        # else
+        #     GPUArrays.alloc!(_alloc_f, ROCBackend(), name, T, dims)::ROCArray{T, N, B}
+        # end
     end
 
     function ROCArray{T, N}(
@@ -38,9 +36,7 @@ end
 
 GPUArrays.storage(a::ROCArray) = a.buf
 
-function GPUArrays.derive(
-    ::Type{T}, x::ROCArray, dims::Dims{N}, offset::Int,
-) where {N, T}
+function GPUArrays.derive(::Type{T}, x::ROCArray, dims::Dims{N}, offset::Int) where {N, T}
     ref = copy(x.buf)
     offset += (x.offset * Base.elsize(x)) ÷ sizeof(T)
     ROCArray{T, N}(ref, dims; offset)
@@ -154,6 +150,8 @@ function Base.copyto!(
     amount == 0 && return dest
     @boundscheck checkbounds(dest, d_offset + amount - 1)
     @boundscheck checkbounds(source, s_offset + amount - 1)
+
+    @debug "[gpu -> cpu] T=$T, shape=$(size(dest))"
     stm = stream()
     Mem.download!(
         pointer(dest, d_offset),
@@ -171,6 +169,8 @@
     amount == 0 && return dest
     @boundscheck checkbounds(dest, d_offset + amount - 1)
     @boundscheck checkbounds(source, s_offset + amount - 1)
+
+    @debug "[cpu -> gpu] T=$T, shape=$(size(dest))"
     Mem.upload!(
         Mem.view(convert(Mem.AbstractAMDBuffer, dest.buf[]),
             (dest.offset + d_offset - 1) * sizeof(T)),
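
Note: the new `@debug` statements are silent by default; they appear only when debug logging is enabled for the AMDGPU module via the standard `JULIA_DEBUG` mechanism. A small sketch (the printed lines are illustrative):

    ENV["JULIA_DEBUG"] = "AMDGPU"   # or start Julia with JULIA_DEBUG=AMDGPU

    using AMDGPU

    x = AMDGPU.zeros(Float32, 256, 256)
    # ┌ Debug: Allocate `T=Float32`, `dims=(256, 256)`: 256.000 KiB

    Array(x)
    # ┌ Debug: [gpu -> cpu] T=Float32, shape=(256, 256)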

src/cache.jl

Lines changed: 52 additions & 40 deletions
@@ -2,87 +2,99 @@
 # Copied from CUDA.jl/lib/utils/cache.jl
 
 # TODO:
-# - keep track of the (estimated?) size of cache contents
-# - clean the caches when memory is needed. this will require registering the destructor
-#   upfront, so that it can set the environment (e.g. switch to the appropriate context).
-#   alternatively, register the `unsafe_free!`` methods with the pool instead of the cache.
+# - store ctor/dtor in cache
+# - clean cache when under memory pressure
 
 export HandleCache
 
-struct HandleCache{K,V}
-    active_handles::Set{Pair{K,V}} # for debugging, and to prevent handle finalization
-    idle_handles::Dict{K,Vector{V}}
-    lock::ReentrantLock
+struct HandleCache{K, V}
+    active_handles::Set{Pair{K, V}}
+    idle_handles::Dict{K, Vector{V}}
+    lock::Base.ThreadSynchronizer
+    # TODO when finalizers are run on their own tasks use reentrant lock
 
     max_entries::Int
 
-    function HandleCache{K,V}(max_entries::Int=32) where {K,V}
-        return new{K,V}(Set{Pair{K,V}}(), Dict{K,Vector{V}}(), ReentrantLock(), max_entries)
+    function HandleCache{K, V}(max_entries::Int = 32) where {K, V}
+        new{K,V}(
+            Set{Pair{K, V}}(),
+            Dict{K, Vector{V}}(),
+            Base.ThreadSynchronizer(),
+            max_entries)
     end
 end
 
 # remove a handle from the cache, or create a new one
-function Base.pop!(f::Function, cache::HandleCache{K,V}, key) where {K,V}
-    function check_cache(f::Function=()->nothing)
-        lock(cache.lock) do
-            handle = if !haskey(cache.idle_handles, key) || isempty(cache.idle_handles[key])
-                f()
-            else
-                pop!(cache.idle_handles[key])
-            end
-
-            if handle !== nothing
-                push!(cache.active_handles, key=>handle)
-            end
-
-            return handle
+function Base.pop!(f::Function, cache::HandleCache{K, V}, key) where {K, V}
+    # Check cache.
+    handle, n_active_handles = Base.@lock cache.lock begin
+        if haskey(cache.idle_handles, key) && !isempty(cache.idle_handles[key])
+            pop!(cache.idle_handles[key]), length(cache.active_handles)
+        else
+            nothing, length(cache.active_handles)
         end
     end
 
-    handle = check_cache()
-
-    if handle === nothing
-        # if we didn't find anything, perform a quick GC collection to free up old handles.
+    # If didn't find anything, but lots of active handles - try to free some.
+    if handle ≡ nothing && n_active_handles > cache.max_entries
         GC.gc(false)
-
-        handle = check_cache(f)
+        Base.@lock cache.lock begin
+            if haskey(cache.idle_handles, key) && !isempty(cache.idle_handles[key])
+                handle = pop!(cache.idle_handles[key])
+            end
+        end
    end
 
+    # If still nothing, create a new handle.
+    handle ≡ nothing && (handle = f();)
+
+    Base.@lock cache.lock push!(cache.active_handles, key => handle)
    return handle::V
 end
 
 # put a handle in the cache, or destroy it if it doesn't fit
-function Base.push!(f::Function, cache::HandleCache{K,V}, key::K, handle::V) where {K,V}
-    lock(cache.lock) do
-        delete!(cache.active_handles, key=>handle)
+function Base.push!(f::Function, cache::HandleCache{K, V}, key::K, handle::V) where {K, V}
+    saved = Base.@lock cache.lock begin
+        (key => handle) ∉ cache.active_handles && error(
+            """Trying to free active handle that is not managed by cache.
+            - Key: $key
+            - Handle: $handle
+            """)
+        delete!(cache.active_handles, key => handle)
 
         if haskey(cache.idle_handles, key)
             if length(cache.idle_handles[key]) > cache.max_entries
-                f()
+                false
             else
                 push!(cache.idle_handles[key], handle)
+                true
             end
         else
             cache.idle_handles[key] = [handle]
+            true
        end
    end
+
+    saved || f()
+    return
 end
 
 # shorthand version to put a handle back without having to remember the key
-function Base.push!(f::Function, cache::HandleCache{K,V}, handle::V) where {K,V}
-    lock(cache.lock) do
+function Base.push!(f::Function, cache::HandleCache{K, V}, handle::V) where {K, V}
+    key = Base.@lock cache.lock begin
         key = nothing
         for entry in cache.active_handles
             if entry[2] == handle
                 key = entry[1]
                 break
            end
        end
-        if key === nothing
-            error("Attempt to cache handle $handle that was not created by the handle cache")
-        end
-        push!(f, cache, key, handle)
+
+        key ≡ nothing && error(
+            "Attempt to cache handle $handle that was not created by the handle cache")
+        key
    end
+    push!(f, cache, key, handle)
 end
 
 # Copied from CUDA.jl/lib/cublas/CUBLAS.jl
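
Note: the reworked `HandleCache` keeps the do-block API: `pop!` checks out an idle handle for a key (or creates one via the do-block), and `push!` must return it under the same key, either parking it as idle or destroying it via the do-block when the idle list is full. A hedged usage sketch; `create_handle`/`destroy_handle` and the key/handle types are hypothetical placeholders, not the real rocBLAS/rocFFT calls:

    const LIB_HANDLES = HandleCache{AMDGPU.HIP.HIPContext, Ptr{Cvoid}}()

    function with_library_handle(f, ctx::AMDGPU.HIP.HIPContext)
        # Reuse an idle handle for `ctx`, or create one via the do-block.
        handle = pop!(LIB_HANDLES, ctx) do
            create_handle(ctx)
        end
        try
            return f(handle)
        finally
            # Return the handle under the same key it was checked out with;
            # the do-block destroys it only if the idle list is already full.
            push!(LIB_HANDLES, ctx, handle) do
                destroy_handle(handle)
            end
        end
    end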

src/cache_allocator.jl

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+const ROCCacheAllocator = GPUArrays.PerDeviceCacheAllocator(ROCArray; free_immediately=false)
+
+GPUArrays.cache_allocator(::ROCBackend) = ROCCacheAllocator
+
+GPUArrays.device(::ROCBackend) = AMDGPU.device()
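
Note: this wires the experimental GPUArrays caching-allocator interface to the ROCm backend; the corresponding `include("cache_allocator.jl")` in src/AMDGPU.jl is still commented out. A hedged sketch using only names visible in this commit:

    # The per-device cache and the active device, as exposed to GPUArrays.
    alloc = GPUArrays.cache_allocator(ROCBackend())
    dev = GPUArrays.device(ROCBackend())

    # In the (currently disabled) ROCArray constructor path, allocations would
    # be routed as:
    #   GPUArrays.alloc!(_alloc_f, ROCBackend(), name, T, dims)
    # with `name` taken from `GPUArrays.CacheAllocatorName[]`.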
