@@ -165,6 +165,190 @@ download!(dst::HIPBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
# Host-to-host copy: forwards to `HIP.memcpy` with the `hipMemcpyHostToHost` kind,
# passing `stream` through.
function transfer!(dst::HostBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream)
    kind = HIP.hipMemcpyHostToHost
    return HIP.memcpy(dst, src, sz, kind, stream)
end
167167
"""
    HIPUnifiedBuffer

Buffer backed by unified (managed) memory, reachable from both the host and the device.
The allocating constructor obtains the memory through `hipMallocManaged`; the memory is
migrated between host and device automatically.

Memory-advise hints and explicit prefetching are available for performance tuning.
See: https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/memory_management/unified_memory.html
"""
struct HIPUnifiedBuffer <: AbstractAMDBuffer
    # Device and context taken from the stream the buffer was created with.
    device::HIPDevice
    ctx::HIPContext
    # Base address of the memory; `C_NULL` for an empty (zero-byte) buffer.
    ptr::Ptr{Cvoid}
    # Size of the buffer in bytes.
    bytesize::Int
    # When `false`, `free` leaves the memory untouched.
    own::Bool
end
184+
# Allocating constructor: requests `bytesize` bytes through `hipMallocManaged` (with
# `flags`), records the allocation in the memory statistics of the stream's device and
# returns an owning buffer. A zero-byte request skips the allocation and yields a
# NULL-backed buffer. Throws `HIP.HIPError(HIP.hipErrorOutOfMemory)` if the returned
# pointer is NULL.
function HIPUnifiedBuffer(
    bytesize::Integer, flags = HIP.hipMemAttachGlobal;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    device = stream.device
    context = stream.ctx
    if bytesize == 0
        # Nothing to allocate: hand back an empty buffer.
        return HIPUnifiedBuffer(device, context, C_NULL, 0, true)
    end

    AMDGPU.maybe_collect()

    out = Ref{Ptr{Cvoid}}()
    HIP.hipMallocManaged(out, bytesize, flags)
    address = out[]
    if address == C_NULL
        throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
    end

    AMDGPU.account!(AMDGPU.memory_stats(device), bytesize)
    return HIPUnifiedBuffer(device, context, address, bytesize, true)
end
202+
# Wrapping constructor: builds a buffer around an existing pointer `ptr` of `sz` bytes.
# Nothing is allocated or accounted for here; by default (`own = false`) the resulting
# buffer does not release the memory in `free`.
function HIPUnifiedBuffer(
    ptr::Ptr{Cvoid}, sz::Integer;
    stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
)
    device, context = stream.device, stream.ctx
    return HIPUnifiedBuffer(device, context, ptr, sz, own)
end
209+
# Size of the buffer in bytes, reported as a `UInt64`.
function Base.sizeof(b::HIPUnifiedBuffer)
    return UInt64(b.bytesize)
end
211+
# Expose the buffer's base address as a typed pointer.
function Base.convert(::Type{Ptr{T}}, buf::HIPUnifiedBuffer) where {T}
    return convert(Ptr{T}, buf.ptr)
end
213+
# Return a buffer describing the tail of `buf` that starts `bytesize` bytes past its
# base address. The result shares `buf`'s device and context and covers the remaining
# `buf.bytesize - bytesize` bytes.
#
# Throws `BoundsError` when the offset lies outside `0:buf.bytesize`. A negative offset
# must be rejected too: it would produce an address before the allocation and a size
# larger than the parent's.
function view(buf::HIPUnifiedBuffer, bytesize::Int)
    0 <= bytesize <= buf.bytesize || throw(BoundsError(buf, bytesize))
    # NOTE(review): the view inherits `buf.own`, so calling `free` on a view of an owning
    # buffer would pass an interior pointer to `hipFree` — confirm views are never freed.
    return HIPUnifiedBuffer(
        buf.device, buf.ctx,
        buf.ptr + bytesize,
        buf.bytesize - bytesize, buf.own)
end
221+
# Release the managed memory behind `buf` with `hipFree` and subtract its size from the
# memory statistics of the buffer's device. Does nothing for non-owning or NULL-backed
# buffers. Extra keyword arguments are accepted and ignored. Always returns `nothing`.
function free(buf::HIPUnifiedBuffer; kwargs...)
    if buf.own && buf.ptr != C_NULL
        HIP.hipFree(buf)
        AMDGPU.account!(AMDGPU.memory_stats(buf.device), -buf.bytesize)
    end
    return
end
229+
# Unified memory can be accessed from both host and device, and it migrates between them
# automatically, so its residency at the time of the copy is not fixed. Rather than
# declaring a specific direction, pass `hipMemcpyDefault` and let the runtime derive the
# direction from the addresses (the unified-to-unified `transfer!` method does the same).

# Copy `sz` bytes from a raw pointer into a unified buffer.
upload!(dst::HIPUnifiedBuffer, src::Ptr, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Copy `sz` bytes from a device buffer into a unified buffer.
upload!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Copy `sz` bytes from a host buffer into a unified buffer.
upload!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
239+
# Copies out of a unified buffer. Managed memory migrates between host and device, so
# its residency at the time of the copy is not fixed; `hipMemcpyDefault` lets the
# runtime derive the copy direction from the addresses instead of a hard-coded kind.

# Copy `sz` bytes from a unified buffer to a raw pointer.
download!(dst::Ptr, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Copy `sz` bytes from a unified buffer into a device buffer.
download!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Copy `sz` bytes from a unified buffer into a host buffer.
download!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
248+
# Buffer-to-buffer copies where at least one side is unified memory. Managed memory
# migrates between host and device, so its residency at the time of the copy is not
# fixed; every method therefore passes `hipMemcpyDefault` and lets the runtime derive
# the copy direction from the addresses.

# Unified -> unified.
transfer!(dst::HIPUnifiedBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Device -> unified.
transfer!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Unified -> device.
transfer!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Host -> unified.
transfer!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)

# Unified -> host.
transfer!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
263+
"""
    prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream)
    prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream)

Ask the runtime to prefetch the unified memory of `buf` to `device`; when no device is
given, the buffer's own device is used. Migrating the data explicitly improves
performance by reducing page faults.

NULL-backed buffers are skipped. Returns `nothing`.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv419hipMemPrefetchAsyncPvmi13hipStream_t
"""
function prefetch!(
    buf::HIPUnifiedBuffer, device::HIPDevice;
    stream::HIP.HIPStream = AMDGPU.stream(),
)
    if buf.ptr != C_NULL
        target = HIP.device_id(device)
        HIP.hipMemPrefetchAsync(buf.ptr, buf.bytesize, target, stream)
    end
    return
end

function prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream = AMDGPU.stream())
    return prefetch!(buf, buf.device; stream)
end
282+
"""
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)

Give the unified memory system a hint about how the memory of `buf` is going to be used.
When `device` is omitted, the hint is issued for the buffer's own device.

Available advice flags:
- `hipMemAdviseSetReadMostly`: the data is mostly read and only occasionally written
- `hipMemAdviseUnsetReadMostly`: undo the read-mostly advice
- `hipMemAdviseSetPreferredLocation`: set the preferred location of the data
- `hipMemAdviseUnsetPreferredLocation`: clear the preferred location
- `hipMemAdviseSetAccessedBy`: the data will be accessed by the specified device
- `hipMemAdviseUnsetAccessedBy`: clear the accessed-by hint
- `hipMemAdviseSetCoarseGrain`: use coarse-grain coherency (AMD-specific)
- `hipMemAdviseUnsetCoarseGrain`: use fine-grain coherency (AMD-specific)

NULL-backed buffers are skipped. Returns `nothing`.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv412hipMemAdvisePvmj8hipMemoryAdvise_t
"""
function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
    if buf.ptr != C_NULL
        target = HIP.device_id(device)
        HIP.hipMemAdvise(buf.ptr, buf.bytesize, advice, target)
    end
    return
end

function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)
    return advise!(buf, advice, buf.device)
end
310+
"""
    query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)

Query an attribute of the unified memory range backing `buf`.

Available attributes and the value returned for each:
- `hipMemRangeAttributeReadMostly`: `Bool`, whether the range is read-mostly
- `hipMemRangeAttributePreferredLocation`: `Cint`, the preferred location
- `hipMemRangeAttributeAccessedBy`: `Vector{Cint}`, ids of the devices for which the
  accessed-by advice is set on the range
- `hipMemRangeAttributeLastPrefetchLocation`: `Cint`, the last prefetch location
- `hipMemRangeAttributeCoherencyMode`: `Cuint`, the coherency mode (AMD-specific)

Any other attribute is read as a pointer-sized value and returned as `Ptr{Cvoid}`.

Throws an `ErrorException` when `buf` is backed by a NULL pointer.

See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv423hipMemRangeGetAttributePvm20hipMemRangeAttribute_tPvm
"""
function query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)
    buf.ptr == C_NULL && error("Cannot query attributes of NULL pointer")

    # Different attributes return different types.
    if attribute == HIP.hipMemRangeAttributeReadMostly
        return Bool(_query_range_scalar(Cint, buf, attribute))
    elseif attribute in (HIP.hipMemRangeAttributePreferredLocation,
                         HIP.hipMemRangeAttributeLastPrefetchLocation)
        return _query_range_scalar(Cint, buf, attribute)
    elseif attribute == HIP.hipMemRangeAttributeCoherencyMode
        return _query_range_scalar(Cuint, buf, attribute)
    elseif attribute == HIP.hipMemRangeAttributeAccessedBy
        return _query_range_accessed_by(buf)
    else
        # Unknown attribute: keep returning the raw pointer-sized value.
        return _query_range_scalar(Ptr{Cvoid}, buf, attribute)
    end
end

# Read a single value of type `T` for `attribute` over the whole range of `buf`.
function _query_range_scalar(
    ::Type{T}, buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute,
) where {T}
    data = Ref{T}()
    HIP.hipMemRangeGetAttribute(data, sizeof(T), attribute, buf.ptr, buf.bytesize)
    return data[]
end

# Read the accessed-by device list of `buf`. This attribute is an array of 32-bit device
# ids rather than a scalar, so it needs a buffer sized for every possible entry.
function _query_range_accessed_by(buf::HIPUnifiedBuffer)
    ndev = Ref{Cint}()
    HIP.hipGetDeviceCount(ndev)

    # -2 is HIP's `hipInvalidDeviceId`; the runtime reports it in every unused slot.
    invalid = Cint(-2)
    # One slot per device, plus one spare in case the host is listed as well.
    ids = fill(invalid, Int(ndev[]) + 1)
    HIP.hipMemRangeGetAttribute(
        ids, sizeof(ids), HIP.hipMemRangeAttributeAccessedBy, buf.ptr, buf.bytesize)
    return filter(!=(invalid), ids)
end
351+
# download!(::Ptr, ::HIPBuffer)
# Device-buffer to host-buffer copy: forwards to `HIP.memcpy` with the
# `hipMemcpyDeviceToHost` kind, passing `stream` through.
function transfer!(dst::HostBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream)
    kind = HIP.hipMemcpyDeviceToHost
    return HIP.memcpy(dst, src, sz, kind, stream)
end
0 commit comments