Prefer aligned_sizeof #2757
Conversation
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl
index 5981ad3d0..61000465c 100644
--- a/lib/cudadrv/memory.jl
+++ b/lib/cudadrv/memory.jl
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
@eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- $(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
+ $(getproperty(CUDA, Symbol(fn)))(dst, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -423,11 +423,12 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
dst_dev = device(dst)
src_dev = device(src)
if dst_dev == src_dev
- cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
+ cuMemcpyDtoDAsync_v2(dst, src, N * aligned_sizeof(T), stream)
else
cuMemcpyPeerAsync(dst, context(dst_dev),
src, context(src_dev),
- N*aligned_sizeof(T), stream)
+ N * aligned_sizeof(T), stream
+ )
end
async || synchronize(stream)
return dst
@@ -436,7 +437,7 @@ end
function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
+ cuMemcpyHtoAAsync_v2(dst, doffs, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -444,16 +445,16 @@ end
function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
+ cuMemcpyAtoHAsync_v2(dst, src, soffs, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
- cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
+ cuMemcpyDtoA_v2(dst, doffs, src, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
- cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
+ cuMemcpyAtoD_v2(dst, src, soffs, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +530,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY2D(
# source
- (srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
+ (srcPos.x - 1) * aligned_sizeof(T), srcPos.y - 1,
srcMemoryType, srcHost, srcDevice, srcArray,
srcPitch,
# destination
- (dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
+ (dstPos.x - 1) * aligned_sizeof(T), dstPos.y - 1,
dstMemoryType, dstHost, dstDevice, dstArray,
dstPitch,
# extent
- width*aligned_sizeof(T), height
+ width * aligned_sizeof(T), height
))
cuMemcpy2DAsync_v2(params_ref, stream)
async || synchronize(stream)
@@ -569,8 +570,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
# when using the stream-ordered memory allocator
# NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
- srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
- dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
+ srcOffset = (srcPos.x - 1) * aligned_sizeof(T) + srcPitch * ((srcPos.y - 1) + srcHeight * (srcPos.z - 1))
+ dstOffset = (dstPos.x - 1) * aligned_sizeof(T) + dstPitch * ((dstPos.y - 1) + dstHeight * (dstPos.z - 1))
else
srcOffset = 0
dstOffset = 0
@@ -622,7 +623,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY3D(
# source
- srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
+ srcOffset == 0 ? (srcPos.x - 1) * aligned_sizeof(T) : 0,
srcOffset==0 ? srcPos.y-1 : 0,
srcOffset==0 ? srcPos.z-1 : 0,
0, # LOD
@@ -630,7 +631,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
srcPitch, srcHeight,
# destination
- dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
+ dstOffset == 0 ? (dstPos.x - 1) * aligned_sizeof(T) : 0,
dstOffset==0 ? dstPos.y-1 : 0,
dstOffset==0 ? dstPos.z-1 : 0,
0, # LOD
@@ -638,7 +639,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
dstPitch, dstHeight,
# extent
- width*aligned_sizeof(T), height, depth
+ width * aligned_sizeof(T), height, depth
))
cuMemcpy3DAsync_v2(params_ref, stream)
async || synchronize(stream)
diff --git a/src/CUDA.jl b/src/CUDA.jl
index a524f4eac..a430cf439 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -53,9 +53,9 @@ using Printf
# Both of them are equivalent for immutable objects, but differ for mutable singtons and Symbol
# We use `aligned_sizeof` since we care about the size of a type in an array
@static if VERSION < v"1.11.0"
- @generated function aligned_sizeof(::Type{T}) where T
+ @generated function aligned_sizeof(::Type{T}) where {T}
return :($(Base.aligned_sizeof(T)))
- end
+ end
else
import Base: aligned_sizeof
end
diff --git a/src/array.jl b/src/array.jl
index dbf3949b6..e69a398d0 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -67,7 +67,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
check_eltype("CuArray", T)
- maxsize = prod(dims) * aligned_sizeof(T)
+ maxsize = prod(dims) * aligned_sizeof(T)
bufsize = if Base.isbitsunion(T)
# type tag array past the data
maxsize + prod(dims)
@@ -84,7 +84,8 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
end
function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
- maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
+ maxsize::Int = prod(dims) * aligned_sizeof(T), offset::Int = 0
+ ) where {T, N, M}
check_eltype("CuArray", T)
obj = new{T,N,M}(data, maxsize, offset, dims)
finalizer(unsafe_free!, obj)
@@ -235,7 +236,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
ptr::CuPtr{T}, dims::NTuple{N,Int};
own::Bool=false, ctx::CuContext=context()) where {T,N,M}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
# create a memory object
mem = if M == UnifiedMemory
@@ -290,7 +291,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
data = if M == UnifiedMemory
# HMM extends unified memory to include system memory
@@ -837,7 +838,7 @@ end
## derived arrays
function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
- offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
+ offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
end
@@ -851,7 +852,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
end
function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
return Base.unsafe_convert(CuPtr{T}, parent(V)) +
- (Base.first_index(V)-1)*aligned_sizeof(T)
+ (Base.first_index(V) - 1) * aligned_sizeof(T)
end
@@ -874,7 +875,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
n == length(A) && return A
# TODO: add additional space to allow for quicker resizing
- maxsize = n * aligned_sizeof(T)
+ maxsize = n * aligned_sizeof(T)
bufsize = if isbitstype(T)
maxsize
else
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 93a232d04..b48d7073b 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -305,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
continue
end
name = source_argnames[i]
- details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
+ details *= "\n [$(i - 1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
end
details *= "\n"
diff --git a/src/device/array.jl b/src/device/array.jl
index 59322349c..0adb40c85 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -29,7 +29,8 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
- maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
+ maxsize::Int = prod(dims) * aligned_sizeof(T)
+ ) where {T, A, N} =
new(ptr, maxsize, dims, prod(dims))
end
@@ -239,12 +240,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
err = GPUArrays._reinterpret_exception(T, a)
err === nothing || throw(err)
- if aligned_sizeof(T) == aligned_sizeof(S) # fast case
+ if aligned_sizeof(T) == aligned_sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end
isize = size(a)
- size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
+ size1 = div(isize[1] * aligned_sizeof(S), aligned_sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end
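For context, the pattern the formatter touches throughout `lib/cudadrv/memory.jl` is the byte count handed to the CUDA memcpy wrappers. A minimal sketch of that computation, using a hypothetical `copy_nbytes` helper that is not part of the PR and assuming `Base.aligned_sizeof` is available:

```julia
# Minimal sketch (not code from this PR). The byte count passed to cuMemcpy*Async
# is the element count times the *inline* element size, so element types whose
# memory sizeof differs from their array-slot size still copy the right amount.
copy_nbytes(::Type{T}, N::Integer) where {T} = N * Base.aligned_sizeof(T)

copy_nbytes(Float32, 1024)  # expected: 4096 bytes for 1024 Float32 elements
```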
CUDA.jl Benchmarks
| Benchmark suite | Current: 71da935 | Previous: c113666 | Ratio |
|---|---|---|---|
| latency/precompile | 49238463615.5 ns | 48835237613 ns | 1.01 |
| latency/ttfp | 7233454087 ns | 7263249839 ns | 1.00 |
| latency/import | 3466573189 ns | 3465493701 ns | 1.00 |
| integration/volumerhs | 9622632.5 ns | 9621812 ns | 1.00 |
| integration/byval/slices=1 | 146963 ns | 147442 ns | 1.00 |
| integration/byval/slices=3 | 425235 ns | 425693 ns | 1.00 |
| integration/byval/reference | 145230 ns | 145206 ns | 1.00 |
| integration/byval/slices=2 | 286099 ns | 286758 ns | 1.00 |
| integration/cudadevrt | 103434 ns | 103612 ns | 1.00 |
| kernel/indexing | 14000.5 ns | 14399 ns | 0.97 |
| kernel/indexing_checked | 14794 ns | 15068 ns | 0.98 |
| kernel/occupancy | 714.9477611940298 ns | 702.0419580419581 ns | 1.02 |
| kernel/launch | 2264 ns | 2258.4444444444443 ns | 1.00 |
| kernel/rand | 14502 ns | 16717 ns | 0.87 |
| array/reverse/1d | 19484.5 ns | 20195 ns | 0.96 |
| array/reverse/2d | 24764 ns | 24757 ns | 1.00 |
| array/reverse/1d_inplace | 10195 ns | 11401 ns | 0.89 |
| array/reverse/2d_inplace | 10844 ns | 13509 ns | 0.80 |
| array/copy | 20748 ns | 21451.5 ns | 0.97 |
| array/iteration/findall/int | 157574 ns | 159568 ns | 0.99 |
| array/iteration/findall/bool | 138648 ns | 140978 ns | 0.98 |
| array/iteration/findfirst/int | 153631.5 ns | 155188 ns | 0.99 |
| array/iteration/findfirst/bool | 154431.5 ns | 155942 ns | 0.99 |
| array/iteration/scalar | 71660 ns | 73335 ns | 0.98 |
| array/iteration/logical | 214107.5 ns | 214260.5 ns | 1.00 |
| array/iteration/findmin/1d | 41621 ns | 43021 ns | 0.97 |
| array/iteration/findmin/2d | 94009 ns | 94859 ns | 0.99 |
| array/reductions/reduce/1d | 44188 ns | 37155 ns | 1.19 |
| array/reductions/reduce/2d | 42692.5 ns | 41718 ns | 1.02 |
| array/reductions/mapreduce/1d | 39533.5 ns | 35382 ns | 1.12 |
| array/reductions/mapreduce/2d | 41897.5 ns | 52285.5 ns | 0.80 |
| array/broadcast | 20666 ns | 21333 ns | 0.97 |
| array/copyto!/gpu_to_gpu | 13740 ns | 11882 ns | 1.16 |
| array/copyto!/cpu_to_gpu | 208011 ns | 211668 ns | 0.98 |
| array/copyto!/gpu_to_cpu | 244959 ns | 244976.5 ns | 1.00 |
| array/accumulate/1d | 109544 ns | 110064 ns | 1.00 |
| array/accumulate/2d | 80451 ns | 81465 ns | 0.99 |
| array/construct | 1326.1 ns | 1277.4 ns | 1.04 |
| array/random/randn/Float32 | 49877 ns | 45327.5 ns | 1.10 |
| array/random/randn!/Float32 | 26410 ns | 27071 ns | 0.98 |
| array/random/rand!/Int64 | 27000 ns | 27414 ns | 0.98 |
| array/random/rand!/Float32 | 8538.333333333334 ns | 8820 ns | 0.97 |
| array/random/rand/Int64 | 38105 ns | 38724 ns | 0.98 |
| array/random/rand/Float32 | 13098 ns | 13425 ns | 0.98 |
| array/permutedims/4d | 61290 ns | 61677 ns | 0.99 |
| array/permutedims/2d | 55674 ns | 55974 ns | 0.99 |
| array/permutedims/3d | 56742 ns | 56818 ns | 1.00 |
| array/sorting/1d | 2775548 ns | 2778484 ns | 1.00 |
| array/sorting/by | 3366359 ns | 3369313 ns | 1.00 |
| array/sorting/2d | 1084810 ns | 1086275.5 ns | 1.00 |
| cuda/synchronization/stream/auto | 1035.090909090909 ns | 1003.9230769230769 ns | 1.03 |
| cuda/synchronization/stream/nonblocking | 6540.2 ns | 6655.2 ns | 0.98 |
| cuda/synchronization/stream/blocking | 797.0194174757281 ns | 783.8139534883721 ns | 1.02 |
| cuda/synchronization/context/auto | 1153.4 ns | 1147.9 ns | 1.00 |
| cuda/synchronization/context/nonblocking | 6778.2 ns | 6850.8 ns | 0.99 |
| cuda/synchronization/context/blocking | 898.94 ns | 947.1111111111111 ns | 0.95 |
This comment was automatically generated by workflow using github-action-benchmark.
The CI failure is related; it looks like this isn't optimized by codegen / const-propped away. Probably should be fixed upstream? In the meantime, we can use a shadowed copy that has the necessary const-prop annotations, or a …

Could you also document at the import site when we should use `aligned_sizeof`?
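A rough sketch of what such a shadowed copy could look like (an assumption for illustration, not necessarily what the PR ends up doing), using `Base.@assume_effects` to make the call constant-foldable:

```julia
# Hypothetical shadowed copy with const-prop annotations (illustration only).
# Marking the method :foldable lets expressions like `N * aligned_sizeof(T)`
# constant-fold when `T` is known at compile time.
Base.@assume_effects :foldable function aligned_sizeof(::Type{T}) where {T}
    return Base.aligned_sizeof(T)
end
```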
Force-pushed from d812291 to 2364bc0.
Codecov Report
Attention: Patch coverage is …

Additional details and impacted files

@@           Coverage Diff           @@
##           master   #2757    +/-  ##
=======================================
  Coverage   89.43%   89.43%
=======================================
  Files         153      153
  Lines       13185    13186     +1
=======================================
+ Hits        11792    11793     +1
  Misses       1393     1393

☔ View full report in Codecov by Sentry.
Force-pushed from 2364bc0 to 71da935.
Base itself uses `aligned_sizeof` for things like `elsize(::Array)`. `aligned_sizeof` calculates the inline sizeof, in contrast to the memory sizeof. This matters for `Symbol`s and `mutable` singleton objects.

x-ref #2753
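To make the distinction concrete, here is a small illustration; the commented values are what I would expect on a 64-bit machine, not verified against a particular Julia version:

```julia
mutable struct Singleton end  # hypothetical mutable singleton type

Base.aligned_sizeof(Int64)      # 8: isbits, stored inline in arrays
Base.aligned_sizeof(Singleton)  # 8: not stored inline; an array slot holds a pointer
Base.aligned_sizeof(Symbol)     # 8: likewise a pointer per array element
sizeof(Singleton)               # 0: the object itself carries no fields
```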