Skip to content

Commit 647b71c

Browse files
committed
1 parent 15824e6 commit 647b71c

File tree

7 files changed

+493
-11
lines changed

7 files changed

+493
-11
lines changed

docs/src/api/memory.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,24 @@ julia> xd * xd # Can be used with HIP libraries.
182182
!!! note
183183
Passing `own=true` keyword will make the wrapped array take the ownership of the memory.
184184
For host memory it will unpin it on destruction and for device memory it will free it.
185+
186+
## Unified Memory
187+
188+
AMDGPU.jl supports HIP unified memory (also known as managed memory), which allows the same memory to be accessed from both CPU and GPU without explicit transfers. Arrays can be allocated with unified memory using the `unified=true` keyword:
189+
190+
```julia
191+
x = ROCArray{Float32}(undef, 4, 4; unified=true)
192+
x .= 1f0 # Can be accessed and modified directly from CPU
193+
y = x .+ 2f0 # Can be used in GPU kernels
194+
Array(x) # No copy needed - wraps the same memory
195+
```
196+
197+
You can also wrap unified memory arrays as an `Array` for direct CPU access:
198+
199+
```julia
200+
x = ROCArray{Float32}(undef, 4; unified=true)
201+
x_cpu = unsafe_wrap(Array, x) # Zero-copy access from CPU
202+
x_cpu[1] = 42f0 # Modifies the GPU array directly
203+
```
204+
205+
Unified memory is particularly useful for irregular workloads and when you need frequent CPU-GPU data exchanges.

src/ROCKernels.jl

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,15 @@ KA.get_backend(::AMDGPU.ROCArray) = ROCBackend()
3434
KA.synchronize(::ROCBackend) = AMDGPU.synchronize()
3535

3636
KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x)
37-
KA.allocate(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ROCArray{T}(undef, dims)
38-
KA.zeros(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.zeros(T, dims)
39-
KA.ones(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ones(T, dims)
37+
# The ROCm backend can serve unified (managed) memory allocations.
function KA.supports_unified(::ROCBackend)
    return true
end
38+
39+
"""
    KA.allocate(backend::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool=false)

Allocate an uninitialized `ROCArray{T}` with shape `dims` for the
KernelAbstractions backend. With `unified=true` the array is backed by HIP
unified (managed) memory (`AMDGPU.Mem.HIPUnifiedBuffer`); otherwise the
default device buffer is used.
"""
function KA.allocate(backend::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool=false) where T
    # Guard clause: the common (device-memory) case first.
    unified || return AMDGPU.ROCArray{T}(undef, dims)
    return AMDGPU.ROCArray{T, length(dims), AMDGPU.Mem.HIPUnifiedBuffer}(undef, dims)
end
4046

4147
function KA.priority!(::ROCBackend, priority::Symbol)
4248
priority ∉ (:high, :normal, :low) && error(

src/array.jl

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,39 @@ end
212212
Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T =
213213
unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...)
214214

215+
"""
    Base.unsafe_wrap(::Type{Array}, A::ROCArray)

Wrap a ROCArray backed by unified or host memory as a CPU Array, allowing direct CPU
access without explicit data transfers. This only works for arrays backed by
`HIPUnifiedBuffer` or `HostBuffer`; any other buffer type raises an `ArgumentError`.

!!! warning
    For unified memory arrays, ensure proper synchronization before accessing the wrapped
    array from the host to avoid race conditions.

# Example
```julia
# Create a unified memory array
x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 100)
x .= 1.0f0

# Wrap as CPU array
cpu_view = unsafe_wrap(Array, x)
cpu_view[1] = 42.0f0 # Direct CPU access
```
"""
function Base.unsafe_wrap(::Type{Array}, A::ROCArray{T,N,B}) where {T,N,B}
    if !(B <: Mem.HIPUnifiedBuffer || B <: Mem.HostBuffer)
        throw(ArgumentError(
            "unsafe_wrap(Array, ::ROCArray) only supports arrays backed by " *
            "HIPUnifiedBuffer or HostBuffer, got $B"))
    end
    buf = convert(Mem.AbstractAMDBuffer, A.buf[])
    # Take the buffer's *host* pointer directly. Going through
    # `Base.unsafe_convert(Ptr{T}, A)` resolves via the `Managed` conversion,
    # which for `HostBuffer` returns the device-side pointer (`dev_ptr`) —
    # wrong for CPU access — and also marks the array dirty.
    # The element offset is applied in bytes, matching the device-array
    # conversion elsewhere in this file.
    ptr = convert(Ptr{T}, buf.ptr) + A.offset * sizeof(T)
    return unsafe_wrap(Array, ptr, size(A))
end
247+
215248
## interop with CPU arrays
216249

217250
# We don't convert isbits types in `adapt`, since they are already
@@ -279,12 +312,18 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T
279312

280313
maxsize = n * sizeof(T)
281314
bufsize = Base.isbitsunion(T) ? (maxsize + n) : maxsize
282-
new_buf = Mem.HIPBuffer(bufsize; stream=stream())
315+
316+
# Preserve the buffer type (HIPBuffer, HIPUnifiedBuffer, or HostBuffer)
317+
old_buf = convert(Mem.AbstractAMDBuffer, A.buf[])
318+
new_buf = if old_buf isa Mem.HIPUnifiedBuffer
319+
Mem.HIPUnifiedBuffer(bufsize; stream=stream())
320+
else
321+
Mem.HIPBuffer(bufsize; stream=stream())
322+
end
283323

284324
copy_size = min(length(A), n) * sizeof(T)
285325
if copy_size > 0
286-
Mem.transfer!(new_buf, convert(Mem.AbstractAMDBuffer, A.buf[]),
287-
copy_size; stream=stream())
326+
Mem.transfer!(new_buf, old_buf, copy_size; stream=stream())
288327
end
289328

290329
# Free old buffer.
@@ -301,10 +340,10 @@ end
301340
"""
Convert a `ROCArray` into its device-side representation for kernel use.

`HostBuffer` exposes a separate device-visible pointer (`dev_ptr`); all other
buffer types (device and unified memory) convert to a device pointer directly.
"""
function Base.convert(
    ::Type{ROCDeviceArray{T, N, AS.Global}}, a::ROCArray{T, N},
) where {T, N}
    buf = convert(Mem.AbstractAMDBuffer, a.buf[])
    raw = buf isa Mem.HostBuffer ?
        convert(Ptr{T}, buf.dev_ptr) : convert(Ptr{T}, buf)
    # Offset is stored in elements; pointer arithmetic needs bytes.
    llvm_ptr = AMDGPU.LLVMPtr{T, AS.Global}(raw + a.offset * sizeof(T))
    return ROCDeviceArray{T, N, AS.Global}(a.dims, llvm_ptr)
end

src/memory.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,11 +246,17 @@ function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M}
246246
end
247247

248248
managed.dirty = true
249-
# TODO introduce HIPPtr to differentiate
249+
# Return appropriate pointer based on buffer type
250250
if M <: Mem.HIPBuffer
251251
convert(Ptr{T}, managed.mem)
252-
else
252+
elseif M <: Mem.HIPUnifiedBuffer
253+
# Unified memory: accessible from device
254+
convert(Ptr{T}, managed.mem)
255+
elseif M <: Mem.HostBuffer
256+
# Host buffer: use device pointer
253257
convert(Ptr{T}, managed.mem.dev_ptr)
258+
else
259+
error("Unsupported buffer type for device conversion: $M")
254260
end
255261
end
256262

src/runtime/memory/hip.jl

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,190 @@ download!(dst::HIPBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
165165
transfer!(dst::HostBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
166166
HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
167167

168+
"""
    HIPUnifiedBuffer

Unified (managed) memory buffer, allocated with `hipMallocManaged`.
The same allocation is accessible from both host and device, with pages
migrating automatically between them.

Performance can be tuned with memory-advise hints and explicit prefetching.
See: https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/memory_management/unified_memory.html
"""
struct HIPUnifiedBuffer <: AbstractAMDBuffer
    device::HIPDevice  # device the allocation is accounted against
    ctx::HIPContext
    ptr::Ptr{Cvoid}    # managed pointer, valid on host and device
    bytesize::Int
    own::Bool          # whether `free` should release the allocation
end
184+
185+
"""
    HIPUnifiedBuffer(bytesize::Integer, flags = HIP.hipMemAttachGlobal; stream)

Allocate `bytesize` bytes of unified memory via `hipMallocManaged`.
The resulting buffer owns its allocation and is freed by [`free`](@ref).
"""
function HIPUnifiedBuffer(
    bytesize::Integer, flags = HIP.hipMemAttachGlobal;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    device, context = stream.device, stream.ctx
    # Empty allocations never touch HIP: return an owned NULL buffer.
    bytesize == 0 && return HIPUnifiedBuffer(device, context, C_NULL, 0, true)

    # Give the GC a chance to release memory before allocating more.
    AMDGPU.maybe_collect()

    ptr_ref = Ref{Ptr{Cvoid}}()
    HIP.hipMallocManaged(ptr_ref, bytesize, flags)
    managed_ptr = ptr_ref[]
    managed_ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))

    # Track the allocation in per-device memory statistics.
    AMDGPU.account!(AMDGPU.memory_stats(device), bytesize)
    return HIPUnifiedBuffer(device, context, managed_ptr, bytesize, true)
end
202+
203+
# Wrap an existing managed pointer. By default the buffer does NOT take
# ownership, so `free` is a no-op unless `own=true` is passed.
function HIPUnifiedBuffer(
    ptr::Ptr{Cvoid}, sz::Integer;
    stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
)
    return HIPUnifiedBuffer(stream.device, stream.ctx, ptr, sz, own)
end
209+
210+
# Total allocation size in bytes.
Base.sizeof(b::HIPUnifiedBuffer) = UInt64(b.bytesize)

# Managed pointers are directly usable from both host and device, so the
# conversion is a plain pointer cast.
Base.convert(::Type{Ptr{T}}, b::HIPUnifiedBuffer) where T = convert(Ptr{T}, b.ptr)
213+
214+
"""
    view(buf::HIPUnifiedBuffer, bytesize::Int)

Return a buffer aliasing `buf` that skips the first `bytesize` bytes.
"""
function view(buf::HIPUnifiedBuffer, bytesize::Int)
    if bytesize > buf.bytesize
        throw(BoundsError(buf, bytesize))
    end
    # NOTE(review): the view inherits `own` from its parent, so freeing both
    # the parent and a view would double-free — confirm callers never do this.
    return HIPUnifiedBuffer(
        buf.device, buf.ctx,
        buf.ptr + bytesize,
        buf.bytesize - bytesize, buf.own)
end
221+
222+
# Release the managed allocation and update per-device memory accounting.
# No-op for non-owning buffers and for empty (NULL) buffers.
function free(buf::HIPUnifiedBuffer; kwargs...)
    (buf.own && buf.ptr != C_NULL) || return
    HIP.hipFree(buf)
    AMDGPU.account!(AMDGPU.memory_stats(buf.device), -buf.bytesize)
    return
end
229+
230+
# Unified memory is directly host-accessible, so copies between unified and
# host-side memory use HostToHost; copies involving device buffers use
# DeviceToDevice.

upload!(dst::HIPUnifiedBuffer, src::Ptr, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
upload!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
upload!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

download!(dst::Ptr, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
download!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
download!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)

# Unified-to-unified copies let the runtime infer the direction.
transfer!(dst::HIPUnifiedBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
transfer!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
transfer!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
transfer!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
transfer!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
263+
264+
"""
    prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream)
    prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream)

Asynchronously migrate the unified memory range to `device` (defaults to the
buffer's own device), reducing page faults on first access there.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv419hipMemPrefetchAsyncPvmi13hipStream_t
"""
function prefetch!(
    buf::HIPUnifiedBuffer, device::HIPDevice;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    buf.ptr == C_NULL && return  # empty buffer: nothing to migrate
    HIP.hipMemPrefetchAsync(buf.ptr, buf.bytesize, HIP.device_id(device), stream)
    return
end

prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream = AMDGPU.stream()) =
    prefetch!(buf, buf.device; stream)
282+
283+
"""
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)

Give the unified memory system a hint about how the range covered by `buf`
will be used. `device` defaults to the buffer's own device.

Available advice flags:
- `hipMemAdviseSetReadMostly`: Data will be mostly read and only occasionally written to
- `hipMemAdviseUnsetReadMostly`: Undo read-mostly advice
- `hipMemAdviseSetPreferredLocation`: Set preferred location for the data
- `hipMemAdviseUnsetPreferredLocation`: Clear preferred location
- `hipMemAdviseSetAccessedBy`: Data will be accessed by specified device
- `hipMemAdviseUnsetAccessedBy`: Clear accessed-by hint
- `hipMemAdviseSetCoarseGrain`: Use coarse-grain coherency (AMD-specific)
- `hipMemAdviseUnsetCoarseGrain`: Use fine-grain coherency (AMD-specific)

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv412hipMemAdvisePvmj8hipMemoryAdvise_t
"""
function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    buf.ptr == C_NULL && return  # empty buffer: nothing to advise on
    HIP.hipMemAdvise(buf.ptr, buf.bytesize, advice, HIP.device_id(device))
    return
end

advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise) =
    advise!(buf, advice, buf.device)
310+
311+
"""
    query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)

Query an attribute of the unified memory range covered by `buf` and return its
value.

Available attributes:
- `hipMemRangeAttributeReadMostly`: Query if the range is read-mostly (returns `Bool`)
- `hipMemRangeAttributePreferredLocation`: Query preferred location
- `hipMemRangeAttributeAccessedBy`: Query which devices can access this range
- `hipMemRangeAttributeLastPrefetchLocation`: Query last prefetch location
- `hipMemRangeAttributeCoherencyMode`: Query coherency mode (AMD-specific)

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv423hipMemRangeGetAttributePvm20hipMemRangeAttribute_tPvm
"""
function query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)
    buf.ptr == C_NULL && error("Cannot query attributes of NULL pointer")

    # The output type of hipMemRangeGetAttribute depends on the attribute.
    if attribute == HIP.hipMemRangeAttributeReadMostly
        flag = Ref{Cint}()
        HIP.hipMemRangeGetAttribute(flag, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
        return Bool(flag[])
    elseif attribute == HIP.hipMemRangeAttributePreferredLocation ||
            attribute == HIP.hipMemRangeAttributeLastPrefetchLocation
        loc = Ref{Cint}()
        HIP.hipMemRangeGetAttribute(loc, sizeof(Cint), attribute, buf.ptr, buf.bytesize)
        return loc[]
    elseif attribute == HIP.hipMemRangeAttributeCoherencyMode
        mode = Ref{Cuint}()
        HIP.hipMemRangeGetAttribute(mode, sizeof(Cuint), attribute, buf.ptr, buf.bytesize)
        return mode[]
    else
        # AccessedBy and any other attribute: return the raw pointer value.
        raw = Ref{Ptr{Cvoid}}()
        HIP.hipMemRangeGetAttribute(
            raw, sizeof(Ptr{Cvoid}), attribute, buf.ptr, buf.bytesize)
        return raw[]
    end
end
351+
168352
# download!(::Ptr, ::HIPBuffer)
169353
transfer!(dst::HostBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
170354
HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToHost, stream)

test/rocarray/base.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,6 @@ else
229229
@test_skip "Multi-GPU"
230230
end
231231

232+
include("unified.jl")
233+
232234
end

0 commit comments

Comments
 (0)