diff --git a/docs/src/api/memory.md b/docs/src/api/memory.md
index 2b34f6ab0..95b4fd7c9 100644
--- a/docs/src/api/memory.md
+++ b/docs/src/api/memory.md
@@ -182,3 +182,35 @@ julia> xd * xd # Can be used with HIP libraries.
 
 !!! note
     Passing `own=true` keyword will make the wrapped array take the ownership of the memory. For host memory it will unpin it on destruction and for device memory it will free it.
+
+## Unified Memory
+
+AMDGPU.jl supports HIP unified memory (also known as managed memory), which can be accessed from both the CPU and the GPU without explicit transfers. Arrays are allocated in unified memory by passing `Mem.HIPUnifiedBuffer` as the buffer type parameter:
+
+```julia
+x = ROCArray{Float32, 2, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 4, 4)
+x .= 1f0       # Regular GPU broadcast.
+y = x .+ 2f0   # Can be used in GPU kernels as usual.
+x[1, 1] == 1f0 # Scalar indexing works directly from the CPU, without a copy.
+```
+
+You can also wrap a unified memory array in an `Array` for zero-copy CPU access:
+
+```julia
+x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 4)
+x_cpu = unsafe_wrap(Array, x) # Zero-copy access from the CPU.
+x_cpu[1] = 42f0               # Modifies the GPU array directly.
+```
+
+Unified memory is particularly useful for irregular workloads and for workloads that require frequent CPU-GPU data exchange.
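+
+The internal `AMDGPU.Runtime.Mem.advise!` and `AMDGPU.Runtime.Mem.prefetch!` helpers can be used to tune unified memory behavior. A minimal sketch, reaching the underlying buffer through the internal `x.buf[].mem` field, as the test suite does:
+
+```julia
+x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 1024)
+buf = x.buf[].mem
+# Hint that the data will mostly be read:
+AMDGPU.Runtime.Mem.advise!(buf, AMDGPU.HIP.hipMemAdviseSetReadMostly)
+# Migrate the data to the buffer's device ahead of time:
+AMDGPU.Runtime.Mem.prefetch!(buf)
+```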
diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl
index 946e2cdff..0b6586ddf 100644
--- a/src/ROCKernels.jl
+++ b/src/ROCKernels.jl
@@ -34,9 +34,15 @@ KA.get_backend(::AMDGPU.ROCArray) = ROCBackend()
 KA.synchronize(::ROCBackend) = AMDGPU.synchronize()
 KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x)
 
-KA.allocate(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ROCArray{T}(undef, dims)
-KA.zeros(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.zeros(T, dims)
-KA.ones(::ROCBackend, ::Type{T}, dims::Tuple) where T = AMDGPU.ones(T, dims)
+KA.supports_unified(::ROCBackend) = true
+
+function KA.allocate(::ROCBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T
+    if unified
+        return AMDGPU.ROCArray{T, length(dims), AMDGPU.Mem.HIPUnifiedBuffer}(undef, dims)
+    else
+        return AMDGPU.ROCArray{T}(undef, dims)
+    end
+end
 
 function KA.priority!(::ROCBackend, priority::Symbol)
     priority ∉ (:high, :normal, :low) && error(
diff --git a/src/array.jl b/src/array.jl
index dc17a3cce..b81e397a3 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -212,6 +212,40 @@ end
 Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T =
     unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...)
 
+"""
+    Base.unsafe_wrap(::Type{Array}, A::ROCArray)
+
+Wrap a `ROCArray` backed by unified or host memory as a CPU `Array`, allowing direct
+CPU access without explicit data transfers. This only works for arrays backed by
+`HIPUnifiedBuffer` or `HostBuffer`.
+
+!!! warning
+    For unified memory arrays, synchronize (e.g. with `AMDGPU.synchronize()`) before
+    accessing the wrapped array from the host to avoid race conditions.
+
+# Example
+```julia
+# Create a unified memory array.
+x = ROCArray{Float32, 1, AMDGPU.Mem.HIPUnifiedBuffer}(undef, 100)
+x .= 1.0f0
+AMDGPU.synchronize()
+
+# Wrap as a CPU array.
+cpu_view = unsafe_wrap(Array, x)
+cpu_view[1] = 42.0f0 # Direct CPU access.
+```
+"""
+function Base.unsafe_wrap(::Type{Array}, A::ROCArray{T,N,B}) where {T,N,B}
+    if B === Mem.HIPUnifiedBuffer || B === Mem.HostBuffer
+        ptr = Base.unsafe_convert(Ptr{T}, A)
+        return unsafe_wrap(Array, ptr, size(A))
+    else
+        throw(ArgumentError(
+            "unsafe_wrap(Array, ::ROCArray) only supports arrays backed by " *
+            "HIPUnifiedBuffer or HostBuffer, got $B"))
+    end
+end
+
 ## interop with CPU arrays
 
 # We don't convert isbits types in `adapt`, since they are already
@@ -279,12 +313,18 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T
     maxsize = n * sizeof(T)
     bufsize = Base.isbitsunion(T) ? (maxsize + n) : maxsize
-    new_buf = Mem.HIPBuffer(bufsize; stream=stream())
+
+    # Preserve unified buffers; other buffer types are resized into a new `HIPBuffer`.
+    old_buf = convert(Mem.AbstractAMDBuffer, A.buf[])
+    new_buf = if old_buf isa Mem.HIPUnifiedBuffer
+        Mem.HIPUnifiedBuffer(bufsize; stream=stream())
+    else
+        Mem.HIPBuffer(bufsize; stream=stream())
+    end
 
     copy_size = min(length(A), n) * sizeof(T)
     if copy_size > 0
-        Mem.transfer!(new_buf, convert(Mem.AbstractAMDBuffer, A.buf[]),
-            copy_size; stream=stream())
+        Mem.transfer!(new_buf, old_buf, copy_size; stream=stream())
     end
 
     # Free old buffer.
@@ -301,13 +341,28 @@ end
 function Base.convert(
     ::Type{ROCDeviceArray{T, N, AS.Global}}, a::ROCArray{T, N},
 ) where {T, N}
-    # If HostBuffer, use device pointer.
+    # Get the appropriate device pointer based on the buffer type.
     buf = convert(Mem.AbstractAMDBuffer, a.buf[])
-    ptr = convert(Ptr{T}, typeof(buf) <: Mem.HIPBuffer ?
-        buf : buf.dev_ptr)
+    ptr = convert(Ptr{T}, typeof(buf) <: Mem.HostBuffer ?
+        buf.dev_ptr : buf)
     llvm_ptr = AMDGPU.LLVMPtr{T,AS.Global}(ptr + a.offset * sizeof(T))
     ROCDeviceArray{T, N, AS.Global}(a.dims, llvm_ptr)
 end
 
 Adapt.adapt_storage(::Runtime.Adaptor, x::ROCArray{T,N}) where {T,N} =
     convert(ROCDeviceArray{T,N,AS.Global}, x)
+
+
+## indexing
+
+# Host-side scalar indexing for host-accessible buffers. Callers are responsible
+# for synchronizing the stream before touching the memory from the host.
+function Base.getindex(x::ROCArray{T, <:Any, <:Union{Mem.HostBuffer, Mem.HIPUnifiedBuffer}}, I::Int) where T
+    @boundscheck checkbounds(x, I)
+    unsafe_load(pointer(x, I))
+end
+
+function Base.setindex!(x::ROCArray{T, <:Any, <:Union{Mem.HostBuffer, Mem.HIPUnifiedBuffer}}, v, I::Int) where T
+    @boundscheck checkbounds(x, I)
+    unsafe_store!(pointer(x, I), v)
+end
diff --git a/src/memory.jl b/src/memory.jl
index 6da3b2f0f..e547b22fa 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -246,11 +246,17 @@ function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M}
     end
     managed.dirty = true
 
-    # TODO introduce HIPPtr to differentiate
+    # Return the appropriate pointer based on the buffer type.
     if M <: Mem.HIPBuffer
        convert(Ptr{T}, managed.mem)
+    elseif M <: Mem.HIPUnifiedBuffer
+        # Unified memory: the same pointer is valid on host and device.
+        convert(Ptr{T}, managed.mem)
+    elseif M <: Mem.HostBuffer
+        # Host buffer: use its device pointer.
-    else
        convert(Ptr{T}, managed.mem.dev_ptr)
+    else
+        error("Unsupported buffer type for device conversion: $M")
     end
 end
diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
index c444d2e34..4049e0571 100644
--- a/src/runtime/memory/hip.jl
+++ b/src/runtime/memory/hip.jl
@@ -165,6 +165,210 @@ download!(dst::HIPBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
 transfer!(dst::HostBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
     HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
 
+"""
+    HIPUnifiedBuffer
+
+Unified memory buffer that can be accessed from both the host and the device.
+Allocated with `hipMallocManaged`; pages migrate automatically between host and device.
+
+Supports memory advise hints and explicit prefetching for performance optimization.
+See: https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/memory_management/unified_memory.html
+"""
+struct HIPUnifiedBuffer <: AbstractAMDBuffer
+    device::HIPDevice
+    ctx::HIPContext
+    ptr::Ptr{Cvoid}
+    bytesize::Int
+    own::Bool
+end
+
+function HIPUnifiedBuffer(
+    bytesize::Integer, flags = HIP.hipMemAttachGlobal;
+    stream::HIP.HIPStream = AMDGPU.stream(),
+)
+    dev, ctx = stream.device, stream.ctx
+    bytesize == 0 && return HIPUnifiedBuffer(dev, ctx, C_NULL, 0, true)
+
+    AMDGPU.maybe_collect()
+
+    ptr_ref = Ref{Ptr{Cvoid}}()
+    HIP.hipMallocManaged(ptr_ref, bytesize, flags)
+    ptr = ptr_ref[]
+    ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
+
+    AMDGPU.account!(AMDGPU.memory_stats(dev), bytesize)
+    HIPUnifiedBuffer(dev, ctx, ptr, bytesize, true)
+end
+
+function HIPUnifiedBuffer(
+    ptr::Ptr{Cvoid}, sz::Integer;
+    stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
+)
+    HIPUnifiedBuffer(stream.device, stream.ctx, ptr, sz, own)
+end
+
+Base.sizeof(b::HIPUnifiedBuffer) = UInt64(b.bytesize)
+
+Base.convert(::Type{Ptr{T}}, buf::HIPUnifiedBuffer) where T = convert(Ptr{T}, buf.ptr)
+
+function view(buf::HIPUnifiedBuffer, bytesize::Int)
+    bytesize > buf.bytesize && throw(BoundsError(buf, bytesize))
+    HIPUnifiedBuffer(
+        buf.device, buf.ctx,
+        buf.ptr + bytesize,
+        buf.bytesize - bytesize, buf.own)
+end
+
+function free(buf::HIPUnifiedBuffer; kwargs...)
+    buf.own || return
+    buf.ptr == C_NULL && return
+    HIP.hipFree(buf)
+    AMDGPU.account!(AMDGPU.memory_stats(buf.device), -buf.bytesize)
+    return
+end
+
+# Unified memory is host-accessible, so host <-> unified copies are host-to-host.
+upload!(dst::HIPUnifiedBuffer, src::Ptr, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+upload!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
+
+upload!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+download!(dst::Ptr, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+download!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
+
+download!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+transfer!(dst::HIPUnifiedBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDefault, stream)
+
+transfer!(dst::HIPUnifiedBuffer, src::HIPBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
+
+transfer!(dst::HIPBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyDeviceToDevice, stream)
+
+transfer!(dst::HIPUnifiedBuffer, src::HostBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+transfer!(dst::HostBuffer, src::HIPUnifiedBuffer, sz::Int; stream::HIP.HIPStream) =
+    HIP.memcpy(dst, src, sz, HIP.hipMemcpyHostToHost, stream)
+
+"""
+    prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream)
+    prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream)
+
+Prefetch unified memory to the specified device (by default, the buffer's device).
+Explicitly migrating the data can improve performance by reducing page faults.
+
+See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv419hipMemPrefetchAsyncPvmi13hipStream_t
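+
+# Example
+A minimal sketch (assuming the device supports managed memory):
+```julia
+buf = AMDGPU.Runtime.Mem.HIPUnifiedBuffer(1024) # 1 KiB of unified memory.
+AMDGPU.Runtime.Mem.prefetch!(buf)               # Migrate to `buf.device`.
+```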
+"""
+function prefetch!(buf::HIPUnifiedBuffer, device::HIPDevice; stream::HIP.HIPStream = AMDGPU.stream())
+    buf.ptr == C_NULL && return
+    HIP.hipMemPrefetchAsync(buf.ptr, buf.bytesize, HIP.device_id(device), stream)
+    return
+end
+
+function prefetch!(buf::HIPUnifiedBuffer; stream::HIP.HIPStream = AMDGPU.stream())
+    prefetch!(buf, buf.device; stream)
+end
+
+"""
+    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
+    advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)
+
+Provide hints to the unified memory system about how the memory will be used.
+The two-argument form applies the advice for the buffer's own device.
+
+Available advice flags:
+- `hipMemAdviseSetReadMostly`: Data will be mostly read and only occasionally written to
+- `hipMemAdviseUnsetReadMostly`: Undo read-mostly advice
+- `hipMemAdviseSetPreferredLocation`: Set preferred location for the data
+- `hipMemAdviseUnsetPreferredLocation`: Clear preferred location
+- `hipMemAdviseSetAccessedBy`: Data will be accessed by specified device
+- `hipMemAdviseUnsetAccessedBy`: Clear accessed-by hint
+- `hipMemAdviseSetCoarseGrain`: Use coarse-grain coherency (AMD-specific)
+- `hipMemAdviseUnsetCoarseGrain`: Use fine-grain coherency (AMD-specific)
+
+See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv412hipMemAdvisePvmj8hipMemoryAdvise_t
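+
+# Example
+Mark a previously allocated buffer as read-mostly, as the test suite does:
+```julia
+AMDGPU.Runtime.Mem.advise!(buf, AMDGPU.HIP.hipMemAdviseSetReadMostly)
+```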
+"""
+function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise, device::HIPDevice)
+    buf.ptr == C_NULL && return
+    HIP.hipMemAdvise(buf.ptr, buf.bytesize, advice, HIP.device_id(device))
+    return
+end
+
+function advise!(buf::HIPUnifiedBuffer, advice::HIP.hipMemoryAdvise)
+    advise!(buf, advice, buf.device)
+end
+
+"""
+    query_attribute(buf::HIPUnifiedBuffer, attribute::HIP.hipMemRangeAttribute)
+
+Query an attribute of a unified memory range.
+
+Available attributes:
+- `hipMemRangeAttributeReadMostly`: Query if the range is read-mostly
+- `hipMemRangeAttributePreferredLocation`: Query preferred location
+- `hipMemRangeAttributeAccessedBy`: Query which devices can access this range
+- `hipMemRangeAttributeLastPrefetchLocation`: Query last prefetch location
+- `hipMemRangeAttributeCoherencyMode`: Query coherency mode (AMD-specific)
+
+Returns the attribute value; the returned type depends on the attribute.
+
+See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.html#_CPPv423hipMemRangeGetAttributePvm20hipMemRangeAttribute_tPvm
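+
+# Example
+Check whether the read-mostly hint is currently set on a buffer:
+```julia
+AMDGPU.Runtime.Mem.query_attribute(buf, AMDGPU.HIP.hipMemRangeAttributeReadMostly)
+```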
"Allocation and deallocation" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 2, B}(undef, 16, 12) + @test size(x) == (16, 12) + @test x.buf[].mem isa B + AMDGPU.unsafe_free!(x) +end + +@testset "Host and device access" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 16) + + x .= 1.0f0 + @test Array(x) == ones(Float32, 16) + + x .+= 2.0f0 + @test Array(x) == fill(3.0f0, 16) +end + +@testset "Memory advise" begin + dev = AMDGPU.device() + if HIP.attribute(dev, HIP.hipDeviceAttributeManagedMemory) == 1 + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 1024) + buf = x.buf[].mem + + AMDGPU.Runtime.Mem.advise!(buf, HIP.hipMemAdviseSetReadMostly) + @test AMDGPU.Runtime.Mem.query_attribute(buf, HIP.hipMemRangeAttributeReadMostly) + + AMDGPU.Runtime.Mem.advise!(buf, HIP.hipMemAdviseUnsetReadMostly) + @test !AMDGPU.Runtime.Mem.query_attribute(buf, HIP.hipMemRangeAttributeReadMostly) + else + @test_skip "Device does not support managed memory" + end +end + +@testset "Prefetching" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 1024) + buf = x.buf[].mem + + AMDGPU.Runtime.Mem.prefetch!(buf) + x .= 1.0f0 + @test Array(x) == ones(Float32, 1024) +end + +@testset "Preferred location" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 512) + buf = x.buf[].mem + + AMDGPU.Runtime.Mem.advise!(buf, HIP.hipMemAdviseSetPreferredLocation, AMDGPU.device()) + x .= 2.0f0 + @test Array(x) == fill(2.0f0, 512) +end + +@testset "Coarse-grain coherency" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 256) + buf = x.buf[].mem + + AMDGPU.Runtime.Mem.advise!(buf, HIP.hipMemAdviseSetCoarseGrain) + x .= 3.0f0 + @test Array(x) == fill(3.0f0, 256) + + AMDGPU.Runtime.Mem.advise!(buf, HIP.hipMemAdviseUnsetCoarseGrain) +end + +@testset "Interop with HIPBuffer" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 32) + y = AMDGPU.ones(Float32, 32) + + copyto!(x, y) + @test Array(x) == ones(Float32, 32) + + x .+= 1.0f0 + copyto!(y, x) + @test Array(y) == fill(2.0f0, 32) +end + +@testset "Broadcasting" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 64) + y = ROCArray{Float32, 1, B}(undef, 64) + + x .= 1.0f0 + y .= 2.0f0 + z = x .+ y + @test Array(z) == fill(3.0f0, 64) +end + +@testset "Reduction operations" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 128) + + x .= 1.0f0 + @test sum(x) == 128.0f0 + @test maximum(x) == 1.0f0 + @test minimum(x) == 1.0f0 +end + +@testset "View operations" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 1, B}(undef, 16) + + x .= 1.0f0:16.0f0 + v = view(x, 5:12) + @test Array(v) == collect(5.0f0:12.0f0) + + v .= 0.0f0 + @test Array(x[5:12]) == zeros(Float32, 8) +end + +@testset "Multidimensional arrays" begin + B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer + x = ROCArray{Float32, 3, B}(undef, 4, 8, 16) + + x .= 1.0f0 + @test size(x) == (4, 8, 16) + @test Array(x) == ones(Float32, 4, 8, 16) +end + +@testset "resize!" 
+@testset "resize!" begin
+    B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer
+    x = ROCArray{Int, 1, B}(undef, 10)
+
+    x .= 1:10
+    resize!(x, 15)
+    x[11:15] .= 11:15
+    @test Array(x) == 1:15
+
+    resize!(x, 5)
+    @test Array(x) == 1:5
+end
+
+@testset "unsafe_wrap(Array, ::ROCArray)" begin
+    B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer
+
+    # Test with unified memory
+    x = ROCArray{Float32, 1, B}(undef, 100)
+    x .= 1.0f0
+    AMDGPU.synchronize()
+
+    # Wrap as CPU array
+    cpu_view = unsafe_wrap(Array, x)
+    @test cpu_view isa Array{Float32, 1}
+    @test size(cpu_view) == size(x)
+    @test cpu_view ≈ ones(Float32, 100)
+
+    # Modify through CPU view
+    cpu_view[1] = 42.0f0
+    cpu_view[50] = 99.0f0
+    @test Array(x)[1] == 42.0f0
+    @test Array(x)[50] == 99.0f0
+
+    # Test with multidimensional array
+    y = ROCArray{Int32, 2, B}(undef, 10, 20)
+    y .= 1
+    AMDGPU.synchronize()
+
+    cpu_view2 = unsafe_wrap(Array, y)
+    @test cpu_view2 isa Array{Int32, 2}
+    @test size(cpu_view2) == (10, 20)
+    @test all(cpu_view2 .== 1)
+
+    # Test that it fails for device-only memory
+    z = ROCArray{Float32, 1}(undef, 10) # Default HIPBuffer
+    @test_throws ArgumentError unsafe_wrap(Array, z)
+
+    # Test with HostBuffer
+    HB = AMDGPU.Runtime.Mem.HostBuffer
+    h = ROCArray{Float64, 1, HB}(undef, 50)
+    h .= 2.0
+    AMDGPU.synchronize()
+
+    cpu_view3 = unsafe_wrap(Array, h)
+    @test cpu_view3 isa Array{Float64, 1}
+    @test size(cpu_view3) == (50,)
+    @test cpu_view3 == fill(2.0, 50)
+
+    cpu_view3[25] = 3.14
+    @test Array(h)[25] == 3.14
+end
+
+@testset "Scalar indexing" begin
+    B = AMDGPU.Runtime.Mem.HIPUnifiedBuffer
+
+    # Test scalar getindex with unified memory
+    x = ROCArray{Float32, 1, B}(undef, 100)
+    x .= 1.0f0:100.0f0
+    AMDGPU.synchronize()
+
+    @test x[1] == 1.0f0
+    @test x[50] == 50.0f0
+    @test x[100] == 100.0f0
+
+    # Test scalar setindex! with unified memory
+    x[1] = 42.0f0
+    x[50] = 99.0f0
+    AMDGPU.synchronize()
+
+    @test x[1] == 42.0f0
+    @test x[50] == 99.0f0
+    @test Array(x)[1] == 42.0f0
+    @test Array(x)[50] == 99.0f0
+
+    # Test with multidimensional arrays
+    y = ROCArray{Int32, 2, B}(undef, 10, 20)
+    y .= reshape(1:200, 10, 20)
+    AMDGPU.synchronize()
+
+    @test y[1, 1] == 1
+    @test y[10, 20] == 200
+    @test y[5, 10] == 95
+
+    y[1, 1] = 999
+    y[5, 10] = 888
+    AMDGPU.synchronize()
+
+    @test y[1, 1] == 999
+    @test y[5, 10] == 888
+
+    # Test with HostBuffer
+    HB = AMDGPU.Runtime.Mem.HostBuffer
+    h = ROCArray{Float64, 1, HB}(undef, 50)
+    h .= 2.0:51.0
+    AMDGPU.synchronize()
+
+    @test h[1] == 2.0
+    @test h[25] == 26.0
+    @test h[50] == 51.0
+
+    h[25] = 3.14
+    AMDGPU.synchronize()
+    @test h[25] == 3.14
+
+    # Test bounds checking
+    @test_throws BoundsError x[0]
+    @test_throws BoundsError x[101]
+    @test_throws BoundsError y[0, 1]
+    @test_throws BoundsError y[1, 21]
+end
+
+@testset "KernelAbstractions integration" begin
+    import KernelAbstractions as KA
+
+    # Test supports_unified
+    backend = ROCBackend()
+    @test KA.supports_unified(backend)
+
+    if KA.supports_unified(backend)
+        # Test unified allocate with kwarg
+        x = KA.allocate(backend, Float32, (16, 16); unified=true)
+        @test x isa ROCArray{Float32, 2, AMDGPU.Runtime.Mem.HIPUnifiedBuffer}
+        @test size(x) == (16, 16)
+
+        # Test basic operations with unified memory
+        fill!(x, 1.0f0)
+        @test Array(x) == ones(Float32, 16, 16)
+
+        # Test kernel execution with unified memory
+        @kernel function addone_kernel!(x)
+            i = @index(Global)
+            x[i] += 1.0f0
+        end
+        addone_kernel!(backend)(x, ndrange=length(x))
+        KA.synchronize(backend)
+        @test Array(x) == fill(2.0f0, 16, 16)
+
+        AMDGPU.unsafe_free!(x)
+
+        # Test non-unified allocation (default)
+        y = KA.allocate(backend, Float32, (16, 16); unified=false)
+        @test y isa ROCArray{Float32, 2, AMDGPU.Runtime.Mem.HIPBuffer}
+        AMDGPU.unsafe_free!(y)
+    else
+        # `KA.supports_unified` is `true` for `ROCBackend`, so this branch is not taken.
+        @test_skip "Device does not support unified memory"
+    end
+end
+
+end