Skip to content

Commit 2807156

Browse files
authored
Prefer aligned_sizeof (#2757)
1 parent 9ac471b commit 2807156

File tree

10 files changed

+53
-39
lines changed

10 files changed

+53
-39
lines changed

lib/cudadrv/CUDAdrv.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ using Printf
66

77
using LazyArtifacts
88

9+
# Julia has several notions of `sizeof`
10+
# - Base.sizeof is the size of an object in memory
11+
# - Base.aligned_sizeof is the size of an object when stored inline, e.g. as an array element
12+
# The two are equivalent for immutable objects, but differ for mutable singletons and Symbol
13+
# We use `aligned_sizeof` since we care about the size of a type in an array
14+
import Base: aligned_sizeof
915

1016
# low-level wrappers
1117
include("libcuda.jl")

lib/cudadrv/memory.jl

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
411411
@eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
412412
stream::CuStream=stream(),
413413
async::Bool=false) where T
414-
$(getproperty(CUDA, Symbol(fn)))(dst, src, N*sizeof(T), stream)
414+
$(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
415415
async || synchronize(stream)
416416
return dst
417417
end
@@ -423,11 +423,11 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
423423
dst_dev = device(dst)
424424
src_dev = device(src)
425425
if dst_dev == src_dev
426-
cuMemcpyDtoDAsync_v2(dst, src, N*sizeof(T), stream)
426+
cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
427427
else
428428
cuMemcpyPeerAsync(dst, context(dst_dev),
429429
src, context(src_dev),
430-
N*sizeof(T), stream)
430+
N*aligned_sizeof(T), stream)
431431
end
432432
async || synchronize(stream)
433433
return dst
@@ -436,24 +436,24 @@ end
436436
function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
437437
stream::CuStream=stream(),
438438
async::Bool=false) where T
439-
cuMemcpyHtoAAsync_v2(dst, doffs, src, N*sizeof(T), stream)
439+
cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
440440
async || synchronize(stream)
441441
return dst
442442
end
443443

444444
function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
445445
stream::CuStream=stream(),
446446
async::Bool=false) where T
447-
cuMemcpyAtoHAsync_v2(dst, src, soffs, N*sizeof(T), stream)
447+
cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
448448
async || synchronize(stream)
449449
return dst
450450
end
451451

452452
Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
453-
cuMemcpyDtoA_v2(dst, doffs, src, N*sizeof(T))
453+
cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
454454

455455
Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
456-
cuMemcpyAtoD_v2(dst, src, soffs, N*sizeof(T))
456+
cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
457457

458458
Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
459459
Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +529,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
529529

530530
params_ref = Ref(CUDA_MEMCPY2D(
531531
# source
532-
(srcPos.x-1)*sizeof(T), srcPos.y-1,
532+
(srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
533533
srcMemoryType, srcHost, srcDevice, srcArray,
534534
srcPitch,
535535
# destination
536-
(dstPos.x-1)*sizeof(T), dstPos.y-1,
536+
(dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
537537
dstMemoryType, dstHost, dstDevice, dstArray,
538538
dstPitch,
539539
# extent
540-
width*sizeof(T), height
540+
width*aligned_sizeof(T), height
541541
))
542542
cuMemcpy2DAsync_v2(params_ref, stream)
543543
async || synchronize(stream)
@@ -569,8 +569,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
569569
# when using the stream-ordered memory allocator
570570
# NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
571571
if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
572-
srcOffset = (srcPos.x-1)*sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
573-
dstOffset = (dstPos.x-1)*sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
572+
srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
573+
dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
574574
else
575575
srcOffset = 0
576576
dstOffset = 0
@@ -622,23 +622,23 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
622622

623623
params_ref = Ref(CUDA_MEMCPY3D(
624624
# source
625-
srcOffset==0 ? (srcPos.x-1)*sizeof(T) : 0,
625+
srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
626626
srcOffset==0 ? srcPos.y-1 : 0,
627627
srcOffset==0 ? srcPos.z-1 : 0,
628628
0, # LOD
629629
srcMemoryType, srcHost, srcDevice, srcArray,
630630
C_NULL, # reserved
631631
srcPitch, srcHeight,
632632
# destination
633-
dstOffset==0 ? (dstPos.x-1)*sizeof(T) : 0,
633+
dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
634634
dstOffset==0 ? dstPos.y-1 : 0,
635635
dstOffset==0 ? dstPos.z-1 : 0,
636636
0, # LOD
637637
dstMemoryType, dstHost, dstDevice, dstArray,
638638
C_NULL, # reserved
639639
dstPitch, dstHeight,
640640
# extent
641-
width*sizeof(T), height, depth
641+
width*aligned_sizeof(T), height, depth
642642
))
643643
cuMemcpy3DAsync_v2(params_ref, stream)
644644
async || synchronize(stream)
@@ -698,7 +698,7 @@ function pin(ref::Base.RefValue{T}) where T
698698
ctx = context()
699699
ptr = Base.unsafe_convert(Ptr{T}, ref)
700700

701-
__pin(ptr, sizeof(T))
701+
__pin(ptr, aligned_sizeof(T))
702702
finalizer(ref) do _
703703
__unpin(ptr, ctx)
704704
end

lib/cudadrv/module/global.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ struct CuGlobal{T}
1919
ptr_ref = Ref{CuPtr{Cvoid}}()
2020
nbytes_ref = Ref{Csize_t}()
2121
cuModuleGetGlobal_v2(ptr_ref, nbytes_ref, mod, name)
22-
if nbytes_ref[] != sizeof(T)
22+
if nbytes_ref[] != aligned_sizeof(T)
2323
throw(ArgumentError("size of global '$name' does not match type parameter type $T"))
2424
end
2525
buf = DeviceMemory(device(), context(), ptr_ref[], nbytes_ref[], false)

src/CUDA.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,18 @@ import NVTX
4747

4848
using Printf
4949

50+
# Julia has several notions of `sizeof`
51+
# - Base.sizeof is the size of an object in memory
52+
# - Base.aligned_sizeof is the size of an object when stored inline, e.g. as an array element
53+
# The two are equivalent for immutable objects, but differ for mutable singletons and Symbol
54+
# We use `aligned_sizeof` since we care about the size of a type in an array
55+
@static if VERSION < v"1.11.0"
56+
@generated function aligned_sizeof(::Type{T}) where T
57+
return :($(Base.aligned_sizeof(T)))
58+
end
59+
else
60+
import Base: aligned_sizeof
61+
end
5062

5163
## source code includes
5264

src/array.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
6767

6868
function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
6969
check_eltype("CuArray", T)
70-
maxsize = prod(dims) * sizeof(T)
70+
maxsize = prod(dims) * aligned_sizeof(T)
7171
bufsize = if Base.isbitsunion(T)
7272
# type tag array past the data
7373
maxsize + prod(dims)
@@ -84,7 +84,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
8484
end
8585

8686
function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
87-
maxsize::Int=prod(dims) * sizeof(T), offset::Int=0) where {T,N,M}
87+
maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
8888
check_eltype("CuArray", T)
8989
obj = new{T,N,M}(data, maxsize, offset, dims)
9090
finalizer(unsafe_free!, obj)
@@ -235,7 +235,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
235235
ptr::CuPtr{T}, dims::NTuple{N,Int};
236236
own::Bool=false, ctx::CuContext=context()) where {T,N,M}
237237
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
238-
sz = prod(dims) * sizeof(T)
238+
sz = prod(dims) * aligned_sizeof(T)
239239

240240
# create a memory object
241241
mem = if M == UnifiedMemory
@@ -290,7 +290,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
290290
function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
291291
ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
292292
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
293-
sz = prod(dims) * sizeof(T)
293+
sz = prod(dims) * aligned_sizeof(T)
294294

295295
data = if M == UnifiedMemory
296296
# HMM extends unified memory to include system memory
@@ -338,7 +338,7 @@ Base.unsafe_wrap(::Type{CuArray{T,N,M}}, a::Array{T,N}) where {T,N,M} =
338338

339339
## array interface
340340

341-
Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T)
341+
Base.elsize(::Type{<:CuArray{T}}) where {T} = aligned_sizeof(T)
342342

343343
Base.size(x::CuArray) = x.dims
344344
Base.sizeof(x::CuArray) = Base.elsize(x) * length(x)
@@ -837,7 +837,7 @@ end
837837
## derived arrays
838838

839839
function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
840-
offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset
840+
offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
841841
CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
842842
end
843843

@@ -851,7 +851,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
851851
end
852852
function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
853853
return Base.unsafe_convert(CuPtr{T}, parent(V)) +
854-
(Base.first_index(V)-1)*sizeof(T)
854+
(Base.first_index(V)-1)*aligned_sizeof(T)
855855
end
856856

857857

@@ -874,7 +874,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
874874
n == length(A) && return A
875875

876876
# TODO: add additional space to allow for quicker resizing
877-
maxsize = n * sizeof(T)
877+
maxsize = n * aligned_sizeof(T)
878878
bufsize = if isbitstype(T)
879879
maxsize
880880
else

src/compiler/compilation.jl

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -242,11 +242,6 @@ end
242242
CompilerConfig(target, params; kernel, name, always_inline)
243243
end
244244

245-
# a version of `sizeof` that returns the size of the argument we'll pass.
246-
# for example, it supports Symbols where `sizeof(Symbol)` would fail.
247-
argsize(x::Any) = sizeof(x)
248-
argsize(::Type{Symbol}) = sizeof(Ptr{Cvoid})
249-
250245
# compile to executable machine code
251246
function compile(@nospecialize(job::CompilerJob))
252247
# lower to PTX
@@ -286,7 +281,7 @@ function compile(@nospecialize(job::CompilerJob))
286281
argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
287282
!isghosttype(dt) && !Core.Compiler.isconstType(dt)
288283
end
289-
param_usage = sum(argsize, argtypes)
284+
param_usage = sum(aligned_sizeof, argtypes)
290285
param_limit = 4096
291286
if cap >= v"7.0" && ptx >= v"8.1"
292287
param_limit = 32764
@@ -310,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
310305
continue
311306
end
312307
name = source_argnames[i]
313-
details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(sizeof(typ)))"
308+
details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
314309
end
315310
details *= "\n"
316311

src/device/array.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
2929

3030
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
3131
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
32-
maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
32+
maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
3333
new(ptr, maxsize, dims, prod(dims))
3434
end
3535

@@ -39,7 +39,7 @@ const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
3939

4040
## array interface
4141

42-
Base.elsize(::Type{<:CuDeviceArray{T}}) where {T} = sizeof(T)
42+
Base.elsize(::Type{<:CuDeviceArray{T}}) where {T} = aligned_sizeof(T)
4343

4444
Base.size(g::CuDeviceArray) = g.dims
4545
Base.sizeof(x::CuDeviceArray) = Base.elsize(x) * length(x)
@@ -239,12 +239,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
239239
err = GPUArrays._reinterpret_exception(T, a)
240240
err === nothing || throw(err)
241241

242-
if sizeof(T) == sizeof(S) # fast case
242+
if aligned_sizeof(T) == aligned_sizeof(S) # fast case
243243
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
244244
end
245245

246246
isize = size(a)
247-
size1 = div(isize[1]*sizeof(S), sizeof(T))
247+
size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
248248
osize = tuple(size1, Base.tail(isize)...)
249249
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
250250
end

src/device/texture.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Base.convert(::Type{CUtexObject}, t::CuDeviceTexture) = t.handle
3535

3636
## array interface
3737

38-
Base.elsize(::Type{<:CuDeviceTexture{T}}) where {T} = sizeof(T)
38+
Base.elsize(::Type{<:CuDeviceTexture{T}}) where {T} = aligned_sizeof(T)
3939

4040
Base.size(tm::CuDeviceTexture) = tm.dims
4141
# Total size in bytes of the texture: element size times number of elements.
# The body must use the method's own argument `tm`; the previous body referred to an
# undefined `x`, which raised an `UndefVarError` on every call.
Base.sizeof(tm::CuDeviceTexture) = Base.elsize(tm) * length(tm)

src/refpointer.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ mutable struct CuRefValue{T} <: AbstractCuRef{T}
4242

4343
function CuRefValue{T}() where {T}
4444
check_eltype("CuRef", T)
45-
buf = pool_alloc(DeviceMemory, sizeof(T))
45+
buf = pool_alloc(DeviceMemory, aligned_sizeof(T))
4646
obj = new(buf)
4747
finalizer(obj) do _
4848
pool_free(buf)

src/texture.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ Base.size(tm::CuTextureArray) = tm.dims
5959
Base.length(tm::CuTextureArray) = prod(size(tm))
6060

6161
Base.eltype(tm::CuTextureArray{T,N}) where {T,N} = T
62+
Base.elsize(tm::CuTextureArray) = aligned_sizeof(eltype(tm))
6263

63-
Base.sizeof(tm::CuTextureArray) = sizeof(eltype(tm)) * length(tm)
64+
Base.sizeof(tm::CuTextureArray) = Base.elsize(tm) * length(tm)
6465

6566
Base.pointer(t::CuTextureArray) = t.mem.ptr
6667

0 commit comments

Comments
 (0)