Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions docs/src/api/memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,24 @@ julia> xd * xd # Can be used with HIP libraries.
!!! note
Passing `own=true` keyword will make the wrapped array take the ownership of the memory.
For host memory it will unpin it on destruction and for device memory it will free it.

## Unified Memory

AMDGPU.jl supports HIP unified memory (also known as managed memory), which allows the same memory to be accessed from both CPU and GPU without explicit transfers. Arrays can be allocated with unified memory using the `unified=true` keyword:

```julia
x = ROCArray{Float32}(undef, 4, 4; unified=true)
x .= 1f0 # Can be accessed and modified directly from CPU
y = x .+ 2f0 # Can be used in GPU kernels
Array(x)     # Converts to a host Array (use unsafe_wrap below for zero-copy access)
```

You can also wrap a unified-memory `ROCArray` as an `Array` for direct, zero-copy CPU access:

```julia
x = ROCArray{Float32}(undef, 4; unified=true)
x_cpu = unsafe_wrap(Array, x) # Zero-copy access from CPU
x_cpu[1] = 42f0 # Modifies the GPU array directly
```

Unified memory is particularly useful for irregular workloads and when you need frequent CPU-GPU data exchanges.
12 changes: 9 additions & 3 deletions src/ROCKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,15 @@ KA.get_backend(::AMDGPU.ROCArray) = ROCBackend()
KA.synchronize(::ROCBackend) = AMDGPU.synchronize()

KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x)
KA.allocate(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ROCArray{T}(undef, dims)
KA.zeros(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.zeros(T, dims)
KA.ones(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ones(T, dims)
KA.supports_unified(::ROCBackend) = true

# Allocate an uninitialized ROCArray for the ROCm backend.
# `unified=true` requests HIP managed (unified) memory, accessible from both
# host and device; the default is an ordinary device-side allocation.
function KA.allocate(backend::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool=false) where T
    unified || return AMDGPU.ROCArray{T}(undef, dims)
    return AMDGPU.ROCArray{T, length(dims), AMDGPU.Mem.HIPUnifiedBuffer}(undef, dims)
end

function KA.priority!(::ROCBackend, priority::Symbol)
priority ∉ (:high, :normal, :low) && error(
Expand Down
64 changes: 58 additions & 6 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,39 @@ end
Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T =
unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...)

"""
Base.unsafe_wrap(::Type{Array}, A::ROCArray, dims)

Wrap a ROCArray backed by unified or host memory as a CPU Array, allowing direct CPU
access without explicit data transfers. This only works for arrays backed by
`HIPUnifiedBuffer` or `HostBuffer`.

!!! warning
For unified memory arrays, ensure proper synchronization before accessing the wrapped
array from the host to avoid race conditions.

# Example
```julia
# Create a unified memory array
x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 100)
x .= 1.0f0

# Wrap as CPU array
cpu_view = unsafe_wrap(Array, x)
cpu_view[1] = 42.0f0 # Direct CPU access
```
"""
function Base.unsafe_wrap(::Type{Array}, A::ROCArray{T,N,B}) where {T,N,B}
if B === Mem.HIPUnifiedBuffer || B === Mem.HostBuffer
ptr = Base.unsafe_convert(Ptr{T}, A)
return unsafe_wrap(Array, ptr, size(A))
else
throw(ArgumentError(
"unsafe_wrap(Array, ::ROCArray) only supports arrays backed by " *
"HIPUnifiedBuffer or HostBuffer, got $B"))
end
end

## interop with CPU arrays

# We don't convert isbits types in `adapt`, since they are already
Expand Down Expand Up @@ -279,12 +312,18 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T

maxsize = n * sizeof(T)
bufsize = Base.isbitsunion(T) ? (maxsize + n) : maxsize
new_buf = Mem.HIPBuffer(bufsize; stream=stream())

# Preserve the buffer type (HIPBuffer, HIPUnifiedBuffer, or HostBuffer)
old_buf = convert(Mem.AbstractAMDBuffer, A.buf[])
new_buf = if old_buf isa Mem.HIPUnifiedBuffer
Mem.HIPUnifiedBuffer(bufsize; stream=stream())
else
Mem.HIPBuffer(bufsize; stream=stream())
end

copy_size = min(length(A), n) * sizeof(T)
if copy_size > 0
Mem.transfer!(new_buf, convert(Mem.AbstractAMDBuffer, A.buf[]),
copy_size; stream=stream())
Mem.transfer!(new_buf, old_buf, copy_size; stream=stream())
end

# Free old buffer.
Expand All @@ -301,13 +340,26 @@ end
function Base.convert(
    ::Type{ROCDeviceArray{T, N, AS.Global}}, a::ROCArray{T, N},
) where {T, N}
    # Pick the device-visible pointer for the underlying buffer:
    # HostBuffer keeps a separate device pointer (`dev_ptr`); HIPBuffer and
    # HIPUnifiedBuffer pointers are directly usable from device code.
    # NOTE: the stale pre-existing assignment that accessed `buf.dev_ptr` for
    # every non-HIPBuffer (which errors for HIPUnifiedBuffer) is removed.
    buf = convert(Mem.AbstractAMDBuffer, a.buf[])
    ptr = convert(Ptr{T}, typeof(buf) <: Mem.HostBuffer ?
        buf.dev_ptr : buf)
    llvm_ptr = AMDGPU.LLVMPtr{T,AS.Global}(ptr + a.offset * sizeof(T))
    ROCDeviceArray{T, N, AS.Global}(a.dims, llvm_ptr)
end

# When a ROCArray is passed through the kernel-argument adaptor, convert it to
# its device-side representation (ROCDeviceArray in global address space).
Adapt.adapt_storage(::Runtime.Adaptor, x::ROCArray{T,N}) where {T,N} =
    convert(ROCDeviceArray{T,N,AS.Global}, x)


## indexing

# Scalar indexing for host-accessible (pinned host or unified) buffers:
# the element is read/written directly through the array's pointer, with no
# device transfer. NOTE(review): assumes `pointer(x, i)` yields a
# host-dereferenceable pointer for these buffer types — confirm against the
# pointer-conversion path in memory.jl.
function Base.getindex(
    x::ROCArray{T, <:Any, <:Union{Mem.HostBuffer, Mem.HIPUnifiedBuffer}}, i::Int,
) where T
    @boundscheck checkbounds(x, i)
    return unsafe_load(pointer(x, i))
end

function Base.setindex!(
    x::ROCArray{T, <:Any, <:Union{Mem.HostBuffer, Mem.HIPUnifiedBuffer}}, v, i::Int,
) where T
    @boundscheck checkbounds(x, i)
    return unsafe_store!(pointer(x, i), v)
end
10 changes: 8 additions & 2 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,17 @@ function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M}
end

managed.dirty = true
# TODO introduce HIPPtr to differentiate
# Return appropriate pointer based on buffer type
if M <: Mem.HIPBuffer
convert(Ptr{T}, managed.mem)
else
elseif M <: Mem.HIPUnifiedBuffer
# Unified memory: accessible from device
convert(Ptr{T}, managed.mem)
elseif M <: Mem.HostBuffer
# Host buffer: use device pointer
convert(Ptr{T}, managed.mem.dev_ptr)
else
error("Unsupported buffer type for device conversion: $M")
end
end

Expand Down
184 changes: 184 additions & 0 deletions src/runtime/memory/hip.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,190 @@ download!(dst::HIPBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
transfer!(dst::HostBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

"""
HIPUnifiedBuffer

Unified memory buffer that can be accessed from both host and device.
Allocated using `hipMallocManaged` with automatic migration between host and device.

Supports memory advise hints and explicit prefetching for performance optimization.
See: https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/memory_management/unified_memory.html
"""
struct HIPUnifiedBuffer <: AbstractAMDBuffer
device::HIPDevice
ctx::HIPContext
ptr::Ptr{Cvoid}
bytesize::Int
own::Bool
end

# Allocate `bytesize` bytes of HIP managed (unified) memory on the device/context
# of `stream`. `flags` is forwarded to `hipMallocManaged` (global attachment by
# default). Throws `hipErrorOutOfMemory` when the runtime hands back NULL.
function HIPUnifiedBuffer(
    bytesize::Integer, flags = HIP.hipMemAttachGlobal;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    device, context = stream.device, stream.ctx
    # Empty allocations carry no backing memory at all.
    bytesize == 0 && return HIPUnifiedBuffer(device, context, C_NULL, 0, true)

    # Give the GC a chance to release dead GPU memory before allocating.
    AMDGPU.maybe_collect()

    ptr_ref = Ref{Ptr{Cvoid}}()
    HIP.hipMallocManaged(ptr_ref, bytesize, flags)
    managed_ptr = ptr_ref[]
    managed_ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))

    # Track the allocation in the per-device memory statistics.
    AMDGPU.account!(AMDGPU.memory_stats(device), bytesize)
    return HIPUnifiedBuffer(device, context, managed_ptr, bytesize, true)
end

# Wrap an existing managed pointer of `sz` bytes as a HIPUnifiedBuffer.
# By default the wrapper does not own the memory (`own=false`), so `free`
# leaves it untouched.
HIPUnifiedBuffer(
    ptr::Ptr{Cvoid}, sz::Integer;
    stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
) = HIPUnifiedBuffer(stream.device, stream.ctx, ptr, sz, own)

# Size of the allocation in bytes.
Base.sizeof(b::HIPUnifiedBuffer) = UInt64(b.bytesize)

# The managed pointer is directly usable as a typed pointer on host and device.
function Base.convert(::Type{Ptr{T}}, buf::HIPUnifiedBuffer) where T
    return convert(Ptr{T}, buf.ptr)
end

# Create a non-owning view of `buf` starting `bytesize` bytes in.
#
# The view aliases the parent's allocation at an interior pointer, so it must
# never own the memory: an owning view would `hipFree` a mid-allocation pointer
# on destruction and double-free with the parent. This matches `view` for
# `HostBuffer`, which also passes `own = false`.
function view(buf::HIPUnifiedBuffer, bytesize::Int)
    bytesize > buf.bytesize && throw(BoundsError(buf, bytesize))
    HIPUnifiedBuffer(
        buf.device, buf.ctx,
        buf.ptr + bytesize,
        buf.bytesize - bytesize, false)
end

# Release the buffer's memory via `hipFree` and update the device's memory
# statistics. No-op for non-owning wrappers and empty (NULL) buffers.
function free(buf::HIPUnifiedBuffer; kwargs...)
    (buf.own && buf.ptr != C_NULL) || return
    HIP.hipFree(buf)
    AMDGPU.account!(AMDGPU.memory_stats(buf.device), -buf.bytesize)
    return
end

# Unified memory can be accessed from both host and device
#
# NOTE(review): copies touching unified memory below use explicit
# host-to-host / device-to-device kinds depending on the peer buffer;
# `hipMemcpyDefault` (as used for unified<->unified) lets the runtime infer
# direction from the pointers — confirm the explicit kinds are intentional.

# Host pointer -> unified: treated as a host-side copy.
upload!(dst::HIPUnifiedBuffer, src::Ptr, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Device buffer -> unified: device-side copy.
upload!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)

# Pinned host buffer -> unified: host-side copy.
upload!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Unified -> host pointer: host-side copy.
download!(dst::Ptr, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Unified -> device buffer: device-side copy.
download!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)

# Unified -> pinned host buffer: host-side copy.
download!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Unified -> unified: let the runtime infer the direction.
transfer!(dst::HIPUnifiedBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

transfer!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)

transfer!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)

transfer!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

transfer!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

"""
prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream)
prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream)

Prefetch unified memory to the specified device (or the buffer's device).
Explicitly migrates the data to improve performance by reducing page faults.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv419hipMemPrefetchAsyncPvmi13hipStream_t
"""
function prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream = AMDGPU.stream())
buf.ptr == C_NULL && return
HIP.hipMemPrefetchAsync(buf.ptr, buf.bytesize, HIP.device_id(device), stream)
return
end

function prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream = AMDGPU.stream())
prefetch!(buf, buf.device; stream)
end

"""
advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)

Provide hints to the unified memory system about how the memory will be used.

Available advice flags:
- `hipMemAdviseSetReadMostly`: Data will be mostly read and only occasionally written to
- `hipMemAdviseUnsetReadMostly`: Undo read-mostly advice
- `hipMemAdviseSetPreferredLocation`: Set preferred location for the data
- `hipMemAdviseUnsetPreferredLocation`: Clear preferred location
- `hipMemAdviseSetAccessedBy`: Data will be accessed by specified device
- `hipMemAdviseUnsetAccessedBy`: Clear accessed-by hint
- `hipMemAdviseSetCoarseGrain`: Use coarse-grain coherency (AMD-specific)
- `hipMemAdviseUnsetCoarseGrain`: Use fine-grain coherency (AMD-specific)

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv412hipMemAdvisePvmj8hipMemoryAdvise_t
"""
function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
buf.ptr == C_NULL && return
HIP.hipMemAdvise(buf.ptr, buf.bytesize, advice, HIP.device_id(device))
return
end

function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)
advise!(buf, advice, buf.device)
end

"""
query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)

Query attributes of unified memory range.

Available attributes:
- `hipMemRangeAttributeReadMostly`: Query if the range is read-mostly
- `hipMemRangeAttributePreferredLocation`: Query preferred location
- `hipMemRangeAttributeAccessedBy`: Query which devices can access this range
- `hipMemRangeAttributeLastPrefetchLocation`: Query last prefetch location
- `hipMemRangeAttributeCoherencyMode`: Query coherency mode (AMD-specific)

Returns the attribute value.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv423hipMemRangeGetAttributePvm20hipMemRangeAttribute_tPvm
"""
function query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)
buf.ptr == C_NULL && error("Cannot query attributes of NULL pointer")

# Different attributes return different types
if attribute == HIP.hipMemRangeAttributeReadMostly
data = Ref{Cint}()
HIP.hipMemRangeGetAttribute(data, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
return Bool(data[])
elseif attribute in (HIP.hipMemRangeAttributePreferredLocation,
HIP.hipMemRangeAttributeLastPrefetchLocation)
data = Ref{Cint}()
HIP.hipMemRangeGetAttribute(data, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
return data[]
elseif attribute == HIP.hipMemRangeAttributeCoherencyMode
data = Ref{Cuint}()
HIP.hipMemRangeGetAttribute(data, sizeof(Cuint), attribute, buf.ptr, buf.bytesize)
return data[]
else
# For AccessedBy and other attributes, return raw pointer
data = Ref{Ptr{Cvoid}}()
HIP.hipMemRangeGetAttribute(data, sizeof(Ptr{Cvoid}), attribute, buf.ptr, buf.bytesize)
return data[]
end
end

# download!(::Ptr, ::HIPBuffer)
transfer!(dst::HostBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToHost, stream)
Expand Down
2 changes: 2 additions & 0 deletions test/rocarray/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -229,4 +229,6 @@ else
@test_skip "Multi-GPU"
end

include("unified.jl")

end
Loading