Skip to content

Commit 647b71c

Browse files
committed
1 parent 15824e6 commit 647b71c

File tree

7 files changed

+493
-11
lines changed

7 files changed

+493
-11
lines changed

docs/src/api/memory.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,24 @@ julia> xd * xd # Can be used with HIP libraries.
182182
!!! note
183183
Passing `own=true` keyword will make the wrapped array take the ownership of the memory.
184184
For host memory it will unpin it on destruction and for device memory it will free it.
185+
186+
## Unified Memory
187+
188+
AMDGPU.jl supports HIP unified memory (also known as managed memory), which allows the same memory to be accessed from both CPU and GPU without explicit transfers. Arrays can be allocated with unified memory using the `unified=true` keyword:
189+
190+
```julia
191+
x = ROCArray{Float32}(undef, 4, 4; unified=true)
192+
x .= 1f0 # Can be accessed and modified directly from CPU
193+
y = x .+ 2f0 # Can be used in GPU kernels
194+
Array(x) # No copy needed - wraps the same memory
195+
```
196+
197+
You can also wrap unified memory arrays as an `Array` for direct CPU access:
198+
199+
```julia
200+
x = ROCArray{Float32}(undef, 4; unified=true)
201+
x_cpu = unsafe_wrap(Array, x) # Zero-copy access from CPU
202+
x_cpu[1] = 42f0 # Modifies the GPU array directly
203+
```
204+
205+
Unified memory is particularly useful for irregular workloads and when you need frequent CPU-GPU data exchanges.

src/ROCKernels.jl

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,15 @@ KA.get_backend(::AMDGPU.ROCArray) = ROCBackend()
3434
KA.synchronize(::ROCBackend) = AMDGPU.synchronize()
3535

3636
KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x)
37-
KA.allocate(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ROCArray{T}(undef, dims)
38-
KA.zeros(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.zeros(T, dims)
39-
KA.ones(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ones(T, dims)
37+
# The ROCm backend can serve unified (managed) memory allocations.
function KA.supports_unified(::ROCBackend)
    return true
end
38+
39+
"""
    KA.allocate(backend::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool=false)

Allocate an uninitialized `ROCArray{T}` with shape `dims` for the
KernelAbstractions backend. With `unified=true` the array is backed by HIP
unified (managed) memory (`AMDGPU.Mem.HIPUnifiedBuffer`); otherwise the
default device buffer is used.
"""
function KA.allocate(backend::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool=false) where T
    # Guard clause: the common (device-memory) case first.
    unified || return AMDGPU.ROCArray{T}(undef, dims)
    return AMDGPU.ROCArray{T, length(dims), AMDGPU.Mem.HIPUnifiedBuffer}(undef, dims)
end
4046

4147
function KA.priority!(::ROCBackend, priority::Symbol)
4248
priority ∉ (:high, :normal, :low) && error(

src/array.jl

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,39 @@ end
212212
Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T =
213213
unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...)
214214

215+
"""
    Base.unsafe_wrap(::Type{Array}, A::ROCArray)

Wrap a ROCArray backed by unified or host memory as a CPU Array, allowing direct CPU
access without explicit data transfers. This only works for arrays backed by
`HIPUnifiedBuffer` or `HostBuffer`; any other buffer type raises an `ArgumentError`.

!!! warning
    For unified memory arrays, ensure proper synchronization before accessing the wrapped
    array from the host to avoid race conditions.

# Example
```julia
# Create a unified memory array
x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 100)
x .= 1.0f0

# Wrap as CPU array
cpu_view = unsafe_wrap(Array, x)
cpu_view[1] = 42.0f0 # Direct CPU access
```
"""
function Base.unsafe_wrap(::Type{Array}, A::ROCArray{T,N,B}) where {T,N,B}
    if !(B <: Mem.HIPUnifiedBuffer || B <: Mem.HostBuffer)
        throw(ArgumentError(
            "unsafe_wrap(Array, ::ROCArray) only supports arrays backed by " *
            "HIPUnifiedBuffer or HostBuffer, got $B"))
    end
    buf = convert(Mem.AbstractAMDBuffer, A.buf[])
    # Take the buffer's *host* pointer directly. Going through
    # `Base.unsafe_convert(Ptr{T}, A)` resolves via the `Managed` conversion,
    # which for `HostBuffer` returns the device-side pointer (`dev_ptr`) —
    # wrong for CPU access — and also marks the array dirty.
    # The element offset is applied in bytes, matching the device-array
    # conversion elsewhere in this file.
    ptr = convert(Ptr{T}, buf.ptr) + A.offset * sizeof(T)
    return unsafe_wrap(Array, ptr, size(A))
end
247+
215248
## interop with CPU arrays
216249

217250
# We don't convert isbits types in `adapt`, since they are already
@@ -279,12 +312,18 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T
279312

280313
maxsize = n * sizeof(T)
281314
bufsize = Base.isbitsunion(T) ? (maxsize + n) : maxsize
282-
new_buf = Mem.HIPBuffer(bufsize; stream=stream())
315+
316+
# Preserve the buffer type (HIPBuffer, HIPUnifiedBuffer, or HostBuffer)
317+
old_buf = convert(Mem.AbstractAMDBuffer, A.buf[])
318+
new_buf = if old_buf isa Mem.HIPUnifiedBuffer
319+
Mem.HIPUnifiedBuffer(bufsize; stream=stream())
320+
else
321+
Mem.HIPBuffer(bufsize; stream=stream())
322+
end
283323

284324
copy_size = min(length(A), n) * sizeof(T)
285325
if copy_size > 0
286-
Mem.transfer!(new_buf, convert(Mem.AbstractAMDBuffer, A.buf[]),
287-
copy_size; stream=stream())
326+
Mem.transfer!(new_buf, old_buf, copy_size; stream=stream())
288327
end
289328

290329
# Free old buffer.
@@ -301,10 +340,10 @@ end
301340
"""
Convert a `ROCArray` into its device-side representation for kernel use.

`HostBuffer` exposes a separate device-visible pointer (`dev_ptr`); all other
buffer types (device and unified memory) convert to a device pointer directly.
"""
function Base.convert(
    ::Type{ROCDeviceArray{T, N, AS.Global}}, a::ROCArray{T, N},
) where {T, N}
    buf = convert(Mem.AbstractAMDBuffer, a.buf[])
    raw = buf isa Mem.HostBuffer ?
        convert(Ptr{T}, buf.dev_ptr) : convert(Ptr{T}, buf)
    # Offset is stored in elements; pointer arithmetic needs bytes.
    llvm_ptr = AMDGPU.LLVMPtr{T, AS.Global}(raw + a.offset * sizeof(T))
    return ROCDeviceArray{T, N, AS.Global}(a.dims, llvm_ptr)
end

src/memory.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,11 +246,17 @@ function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M}
246246
end
247247

248248
managed.dirty = true
249-
# TODO introduce HIPPtr to differentiate
249+
# Return appropriate pointer based on buffer type
250250
if M <: Mem.HIPBuffer
251251
convert(Ptr{T}, managed.mem)
252-
else
252+
elseif M <: Mem.HIPUnifiedBuffer
253+
# Unified memory: accessible from device
254+
convert(Ptr{T}, managed.mem)
255+
elseif M <: Mem.HostBuffer
256+
# Host buffer: use device pointer
253257
convert(Ptr{T}, managed.mem.dev_ptr)
258+
else
259+
error("Unsupported buffer type for device conversion: $M")
254260
end
255261
end
256262

src/runtime/memory/hip.jl

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,190 @@ download!(dst::HIPBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
165165
transfer!(dst::HostBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
166166
HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
167167

168+
"""
    HIPUnifiedBuffer

Unified (managed) memory buffer, allocated with `hipMallocManaged`.
The same allocation is accessible from both host and device, with pages
migrating automatically between them.

Performance can be tuned with memory-advise hints and explicit prefetching.
See: https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/memory_management/unified_memory.html
"""
struct HIPUnifiedBuffer <: AbstractAMDBuffer
    device::HIPDevice  # device the allocation is accounted against
    ctx::HIPContext
    ptr::Ptr{Cvoid}    # managed pointer, valid on host and device
    bytesize::Int
    own::Bool          # whether `free` should release the allocation
end
184+
185+
"""
    HIPUnifiedBuffer(bytesize::Integer, flags = HIP.hipMemAttachGlobal; stream)

Allocate `bytesize` bytes of unified memory via `hipMallocManaged`.
The resulting buffer owns its allocation and is freed by [`free`](@ref).
"""
function HIPUnifiedBuffer(
    bytesize::Integer, flags = HIP.hipMemAttachGlobal;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    device, context = stream.device, stream.ctx
    # Empty allocations never touch HIP: return an owned NULL buffer.
    bytesize == 0 && return HIPUnifiedBuffer(device, context, C_NULL, 0, true)

    # Give the GC a chance to release memory before allocating more.
    AMDGPU.maybe_collect()

    ptr_ref = Ref{Ptr{Cvoid}}()
    HIP.hipMallocManaged(ptr_ref, bytesize, flags)
    managed_ptr = ptr_ref[]
    managed_ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))

    # Track the allocation in per-device memory statistics.
    AMDGPU.account!(AMDGPU.memory_stats(device), bytesize)
    return HIPUnifiedBuffer(device, context, managed_ptr, bytesize, true)
end
202+
203+
# Wrap an existing managed pointer. By default the buffer does NOT take
# ownership, so `free` is a no-op unless `own=true` is passed.
function HIPUnifiedBuffer(
    ptr::Ptr{Cvoid}, sz::Integer;
    stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
)
    return HIPUnifiedBuffer(stream.device, stream.ctx, ptr, sz, own)
end
209+
210+
# Total allocation size in bytes.
Base.sizeof(b::HIPUnifiedBuffer) = UInt64(b.bytesize)

# Managed pointers are directly usable from both host and device, so the
# conversion is a plain pointer cast.
Base.convert(::Type{Ptr{T}}, b::HIPUnifiedBuffer) where T = convert(Ptr{T}, b.ptr)
213+
214+
"""
    view(buf::HIPUnifiedBuffer, bytesize::Int)

Return a buffer aliasing `buf` that skips the first `bytesize` bytes.
"""
function view(buf::HIPUnifiedBuffer, bytesize::Int)
    if bytesize > buf.bytesize
        throw(BoundsError(buf, bytesize))
    end
    # NOTE(review): the view inherits `own` from its parent, so freeing both
    # the parent and a view would double-free — confirm callers never do this.
    return HIPUnifiedBuffer(
        buf.device, buf.ctx,
        buf.ptr + bytesize,
        buf.bytesize - bytesize, buf.own)
end
221+
222+
# Release the managed allocation and update per-device memory accounting.
# No-op for non-owning buffers and for empty (NULL) buffers.
function free(buf::HIPUnifiedBuffer; kwargs...)
    (buf.own && buf.ptr != C_NULL) || return
    HIP.hipFree(buf)
    AMDGPU.account!(AMDGPU.memory_stats(buf.device), -buf.bytesize)
    return
end
229+
230+
# Unified memory is directly host-accessible, so copies between unified and
# host-side memory use HostToHost; copies involving device buffers use
# DeviceToDevice.

upload!(dst::HIPUnifiedBuffer, src::Ptr, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
upload!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
upload!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

download!(dst::Ptr, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
download!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
download!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Unified-to-unified copies let the runtime infer the direction.
transfer!(dst::HIPUnifiedBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
transfer!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
transfer!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
transfer!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
transfer!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
263+
264+
"""
    prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream)
    prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream)

Asynchronously migrate the unified memory range to `device` (defaults to the
buffer's own device), reducing page faults on first access there.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv419hipMemPrefetchAsyncPvmi13hipStream_t
"""
function prefetch!(
    buf::HIPUnifiedBuffer, device::HIPDevice;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    buf.ptr == C_NULL && return  # empty buffer: nothing to migrate
    HIP.hipMemPrefetchAsync(buf.ptr, buf.bytesize, HIP.device_id(device), stream)
    return
end

prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream = AMDGPU.stream()) =
    prefetch!(buf, buf.device; stream)
282+
283+
"""
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)

Give the unified memory system a hint about how the range covered by `buf`
will be used. `device` defaults to the buffer's own device.

Available advice flags:
- `hipMemAdviseSetReadMostly`: Data will be mostly read and only occasionally written to
- `hipMemAdviseUnsetReadMostly`: Undo read-mostly advice
- `hipMemAdviseSetPreferredLocation`: Set preferred location for the data
- `hipMemAdviseUnsetPreferredLocation`: Clear preferred location
- `hipMemAdviseSetAccessedBy`: Data will be accessed by specified device
- `hipMemAdviseUnsetAccessedBy`: Clear accessed-by hint
- `hipMemAdviseSetCoarseGrain`: Use coarse-grain coherency (AMD-specific)
- `hipMemAdviseUnsetCoarseGrain`: Use fine-grain coherency (AMD-specific)

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv412hipMemAdvisePvmj8hipMemoryAdvise_t
"""
function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    buf.ptr == C_NULL && return  # empty buffer: nothing to advise on
    HIP.hipMemAdvise(buf.ptr, buf.bytesize, advice, HIP.device_id(device))
    return
end

advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise) =
    advise!(buf, advice, buf.device)
310+
311+
"""
    query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)

Query an attribute of the unified memory range covered by `buf` and return its
value.

Available attributes:
- `hipMemRangeAttributeReadMostly`: Query if the range is read-mostly (returns `Bool`)
- `hipMemRangeAttributePreferredLocation`: Query preferred location
- `hipMemRangeAttributeAccessedBy`: Query which devices can access this range
- `hipMemRangeAttributeLastPrefetchLocation`: Query last prefetch location
- `hipMemRangeAttributeCoherencyMode`: Query coherency mode (AMD-specific)

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv423hipMemRangeGetAttributePvm20hipMemRangeAttribute_tPvm
"""
function query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)
    buf.ptr == C_NULL && error("Cannot query attributes of NULL pointer")

    # The output type of hipMemRangeGetAttribute depends on the attribute.
    if attribute == HIP.hipMemRangeAttributeReadMostly
        flag = Ref{Cint}()
        HIP.hipMemRangeGetAttribute(flag, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
        return Bool(flag[])
    elseif attribute == HIP.hipMemRangeAttributePreferredLocation ||
            attribute == HIP.hipMemRangeAttributeLastPrefetchLocation
        loc = Ref{Cint}()
        HIP.hipMemRangeGetAttribute(loc, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
        return loc[]
    elseif attribute == HIP.hipMemRangeAttributeCoherencyMode
        mode = Ref{Cuint}()
        HIP.hipMemRangeGetAttribute(mode, sizeof(Cuint), attribute, buf.ptr, buf.bytesize)
        return mode[]
    else
        # AccessedBy and any other attribute: return the raw pointer value.
        raw = Ref{Ptr{Cvoid}}()
        HIP.hipMemRangeGetAttribute(
            raw, sizeof(Ptr{Cvoid}), attribute, buf.ptr, buf.bytesize)
        return raw[]
    end
end
351+
168352
# download!(::Ptr, ::HIPBuffer)
169353
transfer!(dst::HostBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
170354
HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToHost, stream)

test/rocarray/base.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,6 @@ else
229229
@test_skip "Multi-GPU"
230230
end
231231

232+
include("unified.jl")
233+
232234
end

0 commit comments

Comments
 (0)