Skip to content

Conversation

@vchuravy
Copy link
Member

@vchuravy vchuravy commented Apr 22, 2025

Base itself uses aligned_sizeof for things like elsize(::Array). aligned_sizeof calculates the inline sizeof, in contrast to the memory sizeof.

Matters for Symbols and mutable singleton objects; x-ref #2753

@github-actions
Copy link
Contributor

github-actions bot commented Apr 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl
index 5981ad3d0..61000465c 100644
--- a/lib/cudadrv/memory.jl
+++ b/lib/cudadrv/memory.jl
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
     @eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
                                        stream::CuStream=stream(),
                                        async::Bool=false) where T
-        $(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
+        $(getproperty(CUDA, Symbol(fn)))(dst, src, N * aligned_sizeof(T), stream)
         async || synchronize(stream)
         return dst
     end
@@ -423,11 +423,12 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
     dst_dev = device(dst)
     src_dev = device(src)
     if dst_dev == src_dev
-        cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
+        cuMemcpyDtoDAsync_v2(dst, src, N * aligned_sizeof(T), stream)
     else
         cuMemcpyPeerAsync(dst, context(dst_dev),
                           src, context(src_dev),
-                          N*aligned_sizeof(T), stream)
+            N * aligned_sizeof(T), stream
+        )
     end
     async || synchronize(stream)
     return dst
@@ -436,7 +437,7 @@ end
 function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
                              stream::CuStream=stream(),
                              async::Bool=false) where T
-    cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
+    cuMemcpyHtoAAsync_v2(dst, doffs, src, N * aligned_sizeof(T), stream)
     async || synchronize(stream)
     return dst
 end
@@ -444,16 +445,16 @@ end
 function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
                              stream::CuStream=stream(),
                              async::Bool=false) where T
-    cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
+    cuMemcpyAtoHAsync_v2(dst, src, soffs, N * aligned_sizeof(T), stream)
     async || synchronize(stream)
     return dst
 end
 
 Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
-    cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
+    cuMemcpyDtoA_v2(dst, doffs, src, N * aligned_sizeof(T))
 
 Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
-    cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
+    cuMemcpyAtoD_v2(dst, src, soffs, N * aligned_sizeof(T))
 
 Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
     Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +530,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
 
     params_ref = Ref(CUDA_MEMCPY2D(
         # source
-        (srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
+            (srcPos.x - 1) * aligned_sizeof(T), srcPos.y - 1,
         srcMemoryType, srcHost, srcDevice, srcArray,
         srcPitch,
         # destination
-        (dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
+            (dstPos.x - 1) * aligned_sizeof(T), dstPos.y - 1,
         dstMemoryType, dstHost, dstDevice, dstArray,
         dstPitch,
         # extent
-        width*aligned_sizeof(T), height
+            width * aligned_sizeof(T), height
     ))
     cuMemcpy2DAsync_v2(params_ref, stream)
     async || synchronize(stream)
@@ -569,8 +570,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
     #                       when using the stream-ordered memory allocator
     # NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
     if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
-        srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
-        dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
+        srcOffset = (srcPos.x - 1) * aligned_sizeof(T) + srcPitch * ((srcPos.y - 1) + srcHeight * (srcPos.z - 1))
+        dstOffset = (dstPos.x - 1) * aligned_sizeof(T) + dstPitch * ((dstPos.y - 1) + dstHeight * (dstPos.z - 1))
     else
         srcOffset = 0
         dstOffset = 0
@@ -622,7 +623,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
 
     params_ref = Ref(CUDA_MEMCPY3D(
         # source
-        srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
+            srcOffset == 0 ? (srcPos.x - 1) * aligned_sizeof(T) : 0,
         srcOffset==0 ? srcPos.y-1             : 0,
         srcOffset==0 ? srcPos.z-1             : 0,
         0, # LOD
@@ -630,7 +631,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
         C_NULL, # reserved
         srcPitch, srcHeight,
         # destination
-        dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
+            dstOffset == 0 ? (dstPos.x - 1) * aligned_sizeof(T) : 0,
         dstOffset==0 ? dstPos.y-1             : 0,
         dstOffset==0 ? dstPos.z-1             : 0,
         0, # LOD
@@ -638,7 +639,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
         C_NULL, # reserved
         dstPitch, dstHeight,
         # extent
-        width*aligned_sizeof(T), height, depth
+            width * aligned_sizeof(T), height, depth
     ))
     cuMemcpy3DAsync_v2(params_ref, stream)
     async || synchronize(stream)
diff --git a/src/CUDA.jl b/src/CUDA.jl
index a524f4eac..a430cf439 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -53,9 +53,9 @@ using Printf
 # Both of them are equivalent for immutable objects, but differ for mutable singtons and Symbol
 # We use `aligned_sizeof` since we care about the size of a type in an array
 @static if VERSION < v"1.11.0"
-   @generated function aligned_sizeof(::Type{T}) where T
+    @generated function aligned_sizeof(::Type{T}) where {T}
         return :($(Base.aligned_sizeof(T)))
-   end
+    end
 else
     import Base: aligned_sizeof
 end
diff --git a/src/array.jl b/src/array.jl
index dbf3949b6..e69a398d0 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -67,7 +67,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
 
   function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
     check_eltype("CuArray", T)
-    maxsize = prod(dims) * aligned_sizeof(T)
+        maxsize = prod(dims) * aligned_sizeof(T)
     bufsize = if Base.isbitsunion(T)
       # type tag array past the data
       maxsize + prod(dims)
@@ -84,7 +84,8 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
   end
 
   function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
-                        maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
+            maxsize::Int = prod(dims) * aligned_sizeof(T), offset::Int = 0
+        ) where {T, N, M}
     check_eltype("CuArray", T)
     obj = new{T,N,M}(data, maxsize, offset, dims)
     finalizer(unsafe_free!, obj)
@@ -235,7 +236,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
                           ptr::CuPtr{T}, dims::NTuple{N,Int};
                           own::Bool=false, ctx::CuContext=context()) where {T,N,M}
   isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
-  sz = prod(dims) * aligned_sizeof(T)
+    sz = prod(dims) * aligned_sizeof(T)
 
   # create a memory object
   mem = if M == UnifiedMemory
@@ -290,7 +291,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
 function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
                           ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
   isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
-  sz = prod(dims) * aligned_sizeof(T)
+    sz = prod(dims) * aligned_sizeof(T)
 
   data = if M == UnifiedMemory
     # HMM extends unified memory to include system memory
@@ -837,7 +838,7 @@ end
 ## derived arrays
 
 function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
-  offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
+    offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
   CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
 end
 
@@ -851,7 +852,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
 end
 function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
    return Base.unsafe_convert(CuPtr{T}, parent(V)) +
-          (Base.first_index(V)-1)*aligned_sizeof(T)
+        (Base.first_index(V) - 1) * aligned_sizeof(T)
 end
 
 
@@ -874,7 +875,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
   n == length(A) && return A
 
   # TODO: add additional space to allow for quicker resizing
-  maxsize = n * aligned_sizeof(T)
+    maxsize = n * aligned_sizeof(T)
   bufsize = if isbitstype(T)
     maxsize
   else
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 93a232d04..b48d7073b 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -305,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
                     continue
                 end
                 name = source_argnames[i]
-                details *= "\n  [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
+                details *= "\n  [$(i - 1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
             end
             details *= "\n"
 
diff --git a/src/device/array.jl b/src/device/array.jl
index 59322349c..0adb40c85 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -29,7 +29,8 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
 
     # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
     CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
-                         maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
+        maxsize::Int = prod(dims) * aligned_sizeof(T)
+    ) where {T, A, N} =
         new(ptr, maxsize, dims, prod(dims))
 end
 
@@ -239,12 +240,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
   err = GPUArrays._reinterpret_exception(T, a)
   err === nothing || throw(err)
 
-  if aligned_sizeof(T) == aligned_sizeof(S) # fast case
+    if aligned_sizeof(T) == aligned_sizeof(S) # fast case
     return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
   end
 
   isize = size(a)
-  size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
+    size1 = div(isize[1] * aligned_sizeof(S), aligned_sizeof(T))
   osize = tuple(size1, Base.tail(isize)...)
   return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
 end

Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Benchmark suite Current: 71da935 Previous: c113666 Ratio
latency/precompile 49238463615.5 ns 48835237613 ns 1.01
latency/ttfp 7233454087 ns 7263249839 ns 1.00
latency/import 3466573189 ns 3465493701 ns 1.00
integration/volumerhs 9622632.5 ns 9621812 ns 1.00
integration/byval/slices=1 146963 ns 147442 ns 1.00
integration/byval/slices=3 425235 ns 425693 ns 1.00
integration/byval/reference 145230 ns 145206 ns 1.00
integration/byval/slices=2 286099 ns 286758 ns 1.00
integration/cudadevrt 103434 ns 103612 ns 1.00
kernel/indexing 14000.5 ns 14399 ns 0.97
kernel/indexing_checked 14794 ns 15068 ns 0.98
kernel/occupancy 714.9477611940298 ns 702.0419580419581 ns 1.02
kernel/launch 2264 ns 2258.4444444444443 ns 1.00
kernel/rand 14502 ns 16717 ns 0.87
array/reverse/1d 19484.5 ns 20195 ns 0.96
array/reverse/2d 24764 ns 24757 ns 1.00
array/reverse/1d_inplace 10195 ns 11401 ns 0.89
array/reverse/2d_inplace 10844 ns 13509 ns 0.80
array/copy 20748 ns 21451.5 ns 0.97
array/iteration/findall/int 157574 ns 159568 ns 0.99
array/iteration/findall/bool 138648 ns 140978 ns 0.98
array/iteration/findfirst/int 153631.5 ns 155188 ns 0.99
array/iteration/findfirst/bool 154431.5 ns 155942 ns 0.99
array/iteration/scalar 71660 ns 73335 ns 0.98
array/iteration/logical 214107.5 ns 214260.5 ns 1.00
array/iteration/findmin/1d 41621 ns 43021 ns 0.97
array/iteration/findmin/2d 94009 ns 94859 ns 0.99
array/reductions/reduce/1d 44188 ns 37155 ns 1.19
array/reductions/reduce/2d 42692.5 ns 41718 ns 1.02
array/reductions/mapreduce/1d 39533.5 ns 35382 ns 1.12
array/reductions/mapreduce/2d 41897.5 ns 52285.5 ns 0.80
array/broadcast 20666 ns 21333 ns 0.97
array/copyto!/gpu_to_gpu 13740 ns 11882 ns 1.16
array/copyto!/cpu_to_gpu 208011 ns 211668 ns 0.98
array/copyto!/gpu_to_cpu 244959 ns 244976.5 ns 1.00
array/accumulate/1d 109544 ns 110064 ns 1.00
array/accumulate/2d 80451 ns 81465 ns 0.99
array/construct 1326.1 ns 1277.4 ns 1.04
array/random/randn/Float32 49877 ns 45327.5 ns 1.10
array/random/randn!/Float32 26410 ns 27071 ns 0.98
array/random/rand!/Int64 27000 ns 27414 ns 0.98
array/random/rand!/Float32 8538.333333333334 ns 8820 ns 0.97
array/random/rand/Int64 38105 ns 38724 ns 0.98
array/random/rand/Float32 13098 ns 13425 ns 0.98
array/permutedims/4d 61290 ns 61677 ns 0.99
array/permutedims/2d 55674 ns 55974 ns 0.99
array/permutedims/3d 56742 ns 56818 ns 1.00
array/sorting/1d 2775548 ns 2778484 ns 1.00
array/sorting/by 3366359 ns 3369313 ns 1.00
array/sorting/2d 1084810 ns 1086275.5 ns 1.00
cuda/synchronization/stream/auto 1035.090909090909 ns 1003.9230769230769 ns 1.03
cuda/synchronization/stream/nonblocking 6540.2 ns 6655.2 ns 0.98
cuda/synchronization/stream/blocking 797.0194174757281 ns 783.8139534883721 ns 1.02
cuda/synchronization/context/auto 1153.4 ns 1147.9 ns 1.00
cuda/synchronization/context/nonblocking 6778.2 ns 6850.8 ns 0.99
cuda/synchronization/context/blocking 898.94 ns 947.1111111111111 ns 0.95

This comment was automatically generated by workflow using github-action-benchmark.

@maleadt
Copy link
Member

maleadt commented Apr 23, 2025

CI failure related; looks like this isn't optimized by codegen / const-propped away. Probably should be fixed upstream? In the meantime, we can use a shadowed copy that has the necessary const-prop annotations, or a @generated version that forces early evaluation.

Could you also document at the import site when we should use aligned_sizeof, and when sizeof suffices?

@vchuravy vchuravy force-pushed the vc/aligned_sizeof branch from d812291 to 2364bc0 Compare April 24, 2025 13:15
@codecov
Copy link

codecov bot commented Apr 24, 2025

Codecov Report

Attention: Patch coverage is 90.47619% with 2 lines in your changes missing coverage. Please review.

Project coverage is 89.43%. Comparing base (c113666) to head (71da935).
Report is 2 commits behind head on master.

Files with missing lines Patch % Lines
lib/cudadrv/memory.jl 71.42% 2 Missing ⚠️
Additional details and impacted files
@@           Coverage Diff           @@
##           master    #2757   +/-   ##
=======================================
  Coverage   89.43%   89.43%           
=======================================
  Files         153      153           
  Lines       13185    13186    +1     
=======================================
+ Hits        11792    11793    +1     
  Misses       1393     1393           

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@vchuravy vchuravy force-pushed the vc/aligned_sizeof branch from 2364bc0 to 71da935 Compare April 25, 2025 12:42
@vchuravy vchuravy enabled auto-merge (squash) April 25, 2025 12:42
@vchuravy vchuravy disabled auto-merge April 25, 2025 16:20
@vchuravy vchuravy merged commit 2807156 into master Apr 25, 2025
3 checks passed
@vchuravy vchuravy deleted the vc/aligned_sizeof branch April 25, 2025 16:20
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants