Skip to content

Commit 5774411

Browse files
committed
Prefer aligned_sizeof
1 parent d2f8da2 commit 5774411

File tree

10 files changed

+38
-39
lines changed

10 files changed

+38
-39
lines changed

lib/cudadrv/CUDAdrv.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ using Printf
66

77
using LazyArtifacts
88

9+
import Base: aligned_sizeof
910

1011
# low-level wrappers
1112
include("libcuda.jl")

lib/cudadrv/memory.jl

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
411411
@eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
412412
stream::CuStream=stream(),
413413
async::Bool=false) where T
414-
$(getproperty(CUDA, Symbol(fn)))(dst, src, N*sizeof(T), stream)
414+
$(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
415415
async || synchronize(stream)
416416
return dst
417417
end
@@ -423,11 +423,11 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
423423
dst_dev = device(dst)
424424
src_dev = device(src)
425425
if dst_dev == src_dev
426-
cuMemcpyDtoDAsync_v2(dst, src, N*sizeof(T), stream)
426+
cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
427427
else
428428
cuMemcpyPeerAsync(dst, context(dst_dev),
429429
src, context(src_dev),
430-
N*sizeof(T), stream)
430+
N*aligned_sizeof(T), stream)
431431
end
432432
async || synchronize(stream)
433433
return dst
@@ -436,24 +436,24 @@ end
436436
function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
437437
stream::CuStream=stream(),
438438
async::Bool=false) where T
439-
cuMemcpyHtoAAsync_v2(dst, doffs, src, N*sizeof(T), stream)
439+
cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
440440
async || synchronize(stream)
441441
return dst
442442
end
443443

444444
function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
445445
stream::CuStream=stream(),
446446
async::Bool=false) where T
447-
cuMemcpyAtoHAsync_v2(dst, src, soffs, N*sizeof(T), stream)
447+
cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
448448
async || synchronize(stream)
449449
return dst
450450
end
451451

452452
Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
453-
cuMemcpyDtoA_v2(dst, doffs, src, N*sizeof(T))
453+
cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
454454

455455
Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
456-
cuMemcpyAtoD_v2(dst, src, soffs, N*sizeof(T))
456+
cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
457457

458458
Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
459459
Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +529,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
529529

530530
params_ref = Ref(CUDA_MEMCPY2D(
531531
# source
532-
(srcPos.x-1)*sizeof(T), srcPos.y-1,
532+
(srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
533533
srcMemoryType, srcHost, srcDevice, srcArray,
534534
srcPitch,
535535
# destination
536-
(dstPos.x-1)*sizeof(T), dstPos.y-1,
536+
(dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
537537
dstMemoryType, dstHost, dstDevice, dstArray,
538538
dstPitch,
539539
# extent
540-
width*sizeof(T), height
540+
width*aligned_sizeof(T), height
541541
))
542542
cuMemcpy2DAsync_v2(params_ref, stream)
543543
async || synchronize(stream)
@@ -569,8 +569,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
569569
# when using the stream-ordered memory allocator
570570
# NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
571571
if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
572-
srcOffset = (srcPos.x-1)*sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
573-
dstOffset = (dstPos.x-1)*sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
572+
srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
573+
dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
574574
else
575575
srcOffset = 0
576576
dstOffset = 0
@@ -622,23 +622,23 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
622622

623623
params_ref = Ref(CUDA_MEMCPY3D(
624624
# source
625-
srcOffset==0 ? (srcPos.x-1)*sizeof(T) : 0,
625+
srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
626626
srcOffset==0 ? srcPos.y-1 : 0,
627627
srcOffset==0 ? srcPos.z-1 : 0,
628628
0, # LOD
629629
srcMemoryType, srcHost, srcDevice, srcArray,
630630
C_NULL, # reserved
631631
srcPitch, srcHeight,
632632
# destination
633-
dstOffset==0 ? (dstPos.x-1)*sizeof(T) : 0,
633+
dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
634634
dstOffset==0 ? dstPos.y-1 : 0,
635635
dstOffset==0 ? dstPos.z-1 : 0,
636636
0, # LOD
637637
dstMemoryType, dstHost, dstDevice, dstArray,
638638
C_NULL, # reserved
639639
dstPitch, dstHeight,
640640
# extent
641-
width*sizeof(T), height, depth
641+
width*aligned_sizeof(T), height, depth
642642
))
643643
cuMemcpy3DAsync_v2(params_ref, stream)
644644
async || synchronize(stream)
@@ -698,7 +698,7 @@ function pin(ref::Base.RefValue{T}) where T
698698
ctx = context()
699699
ptr = Base.unsafe_convert(Ptr{T}, ref)
700700

701-
__pin(ptr, sizeof(T))
701+
__pin(ptr, aligned_sizeof(T))
702702
finalizer(ref) do _
703703
__unpin(ptr, ctx)
704704
end

lib/cudadrv/module/global.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ struct CuGlobal{T}
1919
ptr_ref = Ref{CuPtr{Cvoid}}()
2020
nbytes_ref = Ref{Csize_t}()
2121
cuModuleGetGlobal_v2(ptr_ref, nbytes_ref, mod, name)
22-
if nbytes_ref[] != sizeof(T)
22+
if nbytes_ref[] != aligned_sizeof(T)
2323
throw(ArgumentError("size of global '$name' does not match type parameter type $T"))
2424
end
2525
buf = DeviceMemory(device(), context(), ptr_ref[], nbytes_ref[], false)

src/CUDA.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ import NVTX
4848
using Printf
4949

5050

51+
import Base: aligned_sizeof
52+
5153
## source code includes
5254

5355
include("pointer.jl")

src/array.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
6464

6565
function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
6666
check_eltype("CuArray", T)
67-
maxsize = prod(dims) * sizeof(T)
67+
maxsize = prod(dims) * aligned_sizeof(T)
6868
bufsize = if Base.isbitsunion(T)
6969
# type tag array past the data
7070
maxsize + prod(dims)
@@ -81,7 +81,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
8181
end
8282

8383
function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
84-
maxsize::Int=prod(dims) * sizeof(T), offset::Int=0) where {T,N,M}
84+
maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
8585
check_eltype("CuArray", T)
8686
obj = new{T,N,M}(data, maxsize, offset, dims)
8787
finalizer(unsafe_free!, obj)
@@ -232,7 +232,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
232232
ptr::CuPtr{T}, dims::NTuple{N,Int};
233233
own::Bool=false, ctx::CuContext=context()) where {T,N,M}
234234
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
235-
sz = prod(dims) * sizeof(T)
235+
sz = prod(dims) * aligned_sizeof(T)
236236

237237
# create a memory object
238238
mem = if M == UnifiedMemory
@@ -287,7 +287,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
287287
function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
288288
ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
289289
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
290-
sz = prod(dims) * sizeof(T)
290+
sz = prod(dims) * aligned_sizeof(T)
291291

292292
data = if M == UnifiedMemory
293293
# HMM extends unified memory to include system memory
@@ -335,7 +335,7 @@ Base.unsafe_wrap(::Type{CuArray{T,N,M}}, a::Array{T,N}) where {T,N,M} =
335335

336336
## array interface
337337

338-
Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T)
338+
Base.elsize(::Type{<:CuArray{T}}) where {T} = aligned_sizeof(T)
339339

340340
Base.size(x::CuArray) = x.dims
341341
Base.sizeof(x::CuArray) = Base.elsize(x) * length(x)
@@ -834,7 +834,7 @@ end
834834
## derived arrays
835835

836836
function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
837-
offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset
837+
offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
838838
CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
839839
end
840840

@@ -848,7 +848,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
848848
end
849849
function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
850850
return Base.unsafe_convert(CuPtr{T}, parent(V)) +
851-
(Base.first_index(V)-1)*sizeof(T)
851+
(Base.first_index(V)-1)*aligned_sizeof(T)
852852
end
853853

854854

@@ -871,7 +871,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
871871
n == length(A) && return A
872872

873873
# TODO: add additional space to allow for quicker resizing
874-
maxsize = n * sizeof(T)
874+
maxsize = n * aligned_sizeof(T)
875875
bufsize = if isbitstype(T)
876876
maxsize
877877
else

src/compiler/compilation.jl

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -242,11 +242,6 @@ end
242242
CompilerConfig(target, params; kernel, name, always_inline)
243243
end
244244

245-
# a version of `sizeof` that returns the size of the argument we'll pass.
246-
# for example, it supports Symbols where `sizeof(Symbol)` would fail.
247-
argsize(x::Any) = sizeof(x)
248-
argsize(::Type{Symbol}) = sizeof(Ptr{Cvoid})
249-
250245
# compile to executable machine code
251246
function compile(@nospecialize(job::CompilerJob))
252247
# lower to PTX
@@ -286,7 +281,7 @@ function compile(@nospecialize(job::CompilerJob))
286281
argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
287282
!isghosttype(dt) && !Core.Compiler.isconstType(dt)
288283
end
289-
param_usage = sum(argsize, argtypes)
284+
param_usage = sum(aligned_sizeof, argtypes)
290285
param_limit = 4096
291286
if cap >= v"7.0" && ptx >= v"8.1"
292287
param_limit = 32764
@@ -310,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
310305
continue
311306
end
312307
name = source_argnames[i]
313-
details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(sizeof(typ)))"
308+
details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
314309
end
315310
details *= "\n"
316311

src/device/array.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
2929

3030
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
3131
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
32-
maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
32+
maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
3333
new(ptr, maxsize, dims, prod(dims))
3434
end
3535

@@ -39,7 +39,7 @@ const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
3939

4040
## array interface
4141

42-
Base.elsize(::Type{<:CuDeviceArray{T}}) where {T} = sizeof(T)
42+
Base.elsize(::Type{<:CuDeviceArray{T}}) where {T} = aligned_sizeof(T)
4343

4444
Base.size(g::CuDeviceArray) = g.dims
4545
Base.sizeof(x::CuDeviceArray) = Base.elsize(x) * length(x)
@@ -239,12 +239,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
239239
err = GPUArrays._reinterpret_exception(T, a)
240240
err === nothing || throw(err)
241241

242-
if sizeof(T) == sizeof(S) # fast case
242+
if aligned_sizeof(T) == aligned_sizeof(S) # fast case
243243
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
244244
end
245245

246246
isize = size(a)
247-
size1 = div(isize[1]*sizeof(S), sizeof(T))
247+
size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
248248
osize = tuple(size1, Base.tail(isize)...)
249249
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
250250
end

src/device/texture.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Base.convert(::Type{CUtexObject}, t::CuDeviceTexture) = t.handle
3535

3636
## array interface
3737

38-
Base.elsize(::Type{<:CuDeviceTexture{T}}) where {T} = sizeof(T)
38+
Base.elsize(::Type{<:CuDeviceTexture{T}}) where {T} = aligned_sizeof(T)
3939

4040
Base.size(tm::CuDeviceTexture) = tm.dims
4141
# Total byte size of the texture: per-element size times the number of elements.
# The element size is looked up through the type-level `Base.elsize` method defined above;
# the argument is `tm` (the previous body referenced an unbound `x`).
Base.sizeof(tm::CuDeviceTexture) = Base.elsize(typeof(tm)) * length(tm)

src/refpointer.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ mutable struct CuRefValue{T} <: AbstractCuRef{T}
4242

4343
function CuRefValue{T}() where {T}
4444
check_eltype("CuRef", T)
45-
buf = pool_alloc(DeviceMemory, sizeof(T))
45+
buf = pool_alloc(DeviceMemory, aligned_sizeof(T))
4646
obj = new(buf)
4747
finalizer(obj) do _
4848
pool_free(buf)

src/texture.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ Base.size(tm::CuTextureArray) = tm.dims
5959
Base.length(tm::CuTextureArray) = prod(size(tm))
6060

6161
Base.eltype(tm::CuTextureArray{T,N}) where {T,N} = T
62+
Base.elsize(tm::CuTextureArray) = aligned_sizeof(eltype(tm))
6263

63-
Base.sizeof(tm::CuTextureArray) = sizeof(eltype(tm)) * length(tm)
64+
# Total byte size of the texture array: element size times element count.
# `elsize` is not exported from Base, so the call is qualified explicitly.
Base.sizeof(tm::CuTextureArray) = Base.elsize(tm) * length(tm)
6465

6566
Base.pointer(t::CuTextureArray) = t.mem.ptr
6667

0 commit comments

Comments
 (0)