Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down Expand Up @@ -59,6 +60,7 @@ LazyArtifacts = "1"
Libdl = "1"
LinearAlgebra = "1"
Logging = "1"
PrecompileTools = "1"
Preferences = "1"
Printf = "1"
Random = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDACore/src/CUDACore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ using Libdl

using Printf

using PrecompileTools

# Julia has several notions of `sizeof`
# - Base.sizeof is the size of an object in memory
# - Base.aligned_sizeof is the size of an object in an array/inline alloced
Expand Down
2 changes: 1 addition & 1 deletion CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ function compile(@nospecialize(job::CompilerJob))
# lower to PTX
# TODO: on 1.9, this actually creates a context. cache those.
asm, meta = JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
invoke_frozen(GPUCompiler.compile, :asm, job)
end

# check if we'll need the device runtime
Expand Down
27 changes: 27 additions & 0 deletions CUDACore/src/initialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@
const _initialized = Ref{Bool}(false)
const _initialization_error = Ref{String}()

# World age captured at __init__ time. Running the GPU compiler infrastructure
# (typeinf_local, etc.) in this world avoids recompilation of native code that was
# cached during precompilation but invalidated by later method definitions.
# Default to typemax(UInt) so that during precompilation (before __init__ runs)
# invoke_in_world clamps to the current world and behaves normally.
const _initialization_world = Ref{UInt}(typemax(UInt))

"""
invoke_frozen(f, args...; kwargs...)

Invoke `f(args...; kwargs...)` in the world captured at `__init__` time.
This allows precompiled native code for the GPU compiler infrastructure
(typeinf_local, etc.) to be reused, avoiding expensive recompilation.
"""
function invoke_frozen(f, args...; kwargs...)
@inline
kwargs = merge(NamedTuple(), kwargs)
if isempty(kwargs)
return Base.invoke_in_world(_initialization_world[], f, args...)
end
return Base.invoke_in_world(_initialization_world[], Core.kwcall, kwargs, f, args...)
end

"""
functional(show_reason=false)

Expand Down Expand Up @@ -207,6 +230,10 @@ function __init__()
end
end

# capture the world age so that the compiler infrastructure can be invoked
# in this world, reusing precompiled native code for typeinf_local etc.
_initialization_world[] = Base.get_world_counter()

_initialized[] = true
end

Expand Down
50 changes: 37 additions & 13 deletions CUDACore/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,40 @@
@compile_workload begin
# compile a dummy kernel to PTX to precompile the GPUCompiler pipeline.
# this doesn't need a GPU — it only uses LLVM.
let
function _precompile_vadd(a)
i = threadIdx().x
@inbounds a[i] += 1f0
return nothing
end

# array
precompile(CuArray, (Vector{Int},))
llvm_support = llvm_compat()
llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))

# compilation
precompile(compiler_cache, (CuContext,))
#precompile(compiler_config, (CuDevice,))
precompile(compile, (CompilerJob,))
precompile(link, (CompilerJob,NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}))
precompile(create_exceptions!, (CuModule,))
precompile(run_and_collect, (Cmd,))
target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)

# launch
precompile(cudaconvert, (Function,))
precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
source = methodinstance(typeof(_precompile_vadd), tt)
job = CompilerJob(source, config)

JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
end
end
end

# kernel launch infrastructure
precompile(Tuple{typeof(cufunction), typeof(identity), Type{Tuple{Nothing}}})
precompile(Tuple{typeof(link), CompilerJob, NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}})

# GPUCompiler compilation pipeline (specialized for CUDACore's compile/link)
precompile(Tuple{typeof(GPUCompiler.actual_compilation),
Dict{Any, CuFunction}, Core.MethodInstance, UInt64,
CUDACompilerConfig, typeof(compile), typeof(link)})

# scalar reference (used by cuBLAS for alpha/beta parameters)
precompile(Tuple{Type{CuRefValue{Float32}}, Float32})
precompile(Tuple{typeof(pool_free), Managed{DeviceMemory}})
2 changes: 2 additions & 0 deletions CUDATools/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Expand All @@ -31,6 +32,7 @@ GPUCompiler = "1.4"
LLVM = "9.3.1"
NVML = "=6.0.0"
NVTX = "1"
PrecompileTools = "1"
Preferences = "1"
PrettyTables = "3"
Printf = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDATools/src/CUDATools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ include("reflection.jl")
include("profile.jl")
include("utilities.jl")

include("precompile.jl")

end
46 changes: 46 additions & 0 deletions CUDATools/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Precompilation workload for CUDATools.
# Fix: `using PrecompileTools` was placed mid-file, after unrelated statements;
# imports belong at the top of the file per Julia convention.
using PrecompileTools

# @profile infrastructure (GPU-dependent, can't execute during precompilation,
# so we only request method compilation rather than running a workload)
precompile(Tuple{typeof(Profile.detect_cupti)})
precompile(Tuple{typeof(Profile.profile_internally), Function})
precompile(Tuple{typeof(Profile.capture), CUPTI.ActivityConfig})

@compile_workload begin
    # exercise the @profile display path with a dummy result (no GPU needed).
    # the show method expects at least two cuCtxSynchronize entries in the host
    # trace to delimit the profiled region, and at least one event between them.
    dummy = Profile.ProfileResults(;
        host = (
            id = Int[1, 2, 3, 4],
            start = Float64[0.0, 0.001, 0.002, 0.010],
            stop = Float64[0.001, 0.002, 0.009, 0.011],
            name = String["cuCtxSynchronize", "cuCtxSynchronize",
                          "cuLaunchKernel", "cuCtxSynchronize"],
            tid = Int[1, 1, 1, 1],
        ),
        device = (
            id = Int[3],
            start = Float64[0.003],
            stop = Float64[0.008],
            name = String["kernel"],
            device = Int[0],
            context = Int[1],
            stream = Int[1],
            grid = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
            block = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
            registers = Union{Missing,Int64}[32],
            shared_mem = Union{Missing,@NamedTuple{static::Int64,dynamic::Int64}}[(static=0,dynamic=0)],
            local_mem = Union{Missing,@NamedTuple{thread::Int64,total::Int64}}[(thread=0,total=0)],
            size = Union{Missing,Int64}[missing],
        ),
        nvtx = (
            id = Int[],
            start = Float64[],
            type = Symbol[],
            tid = Int[],
            name = Union{Missing,String}[],
            domain = Union{Missing,String}[],
        ),
    )
    # rendering to devnull compiles the full table-formatting path
    show(devnull, dummy)
end
6 changes: 3 additions & 3 deletions CUDATools/src/profile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ end
# external profiler
#

function profile_externally(f)
function profile_externally(@nospecialize(f))
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

Expand Down Expand Up @@ -366,7 +366,7 @@ Base.@kwdef struct ProfileResults
raw::Bool=false
end

function profile_internally(f; concurrent=true, kwargs...)
function profile_internally(@nospecialize(f); concurrent=true, kwargs...)
activity_kinds = [
# API calls
CUPTI.CUPTI_ACTIVITY_KIND_DRIVER,
Expand All @@ -390,7 +390,7 @@ function profile_internally(f; concurrent=true, kwargs...)
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

CUPTI.enable!(cfg) do
CUPTI.@enable! cfg begin
# perform dummy operations to "warm up" the profiler, and avoid slow first calls.
# we'll skip everything up until the synchronization call during processing
CuArray([1])
Expand Down
25 changes: 10 additions & 15 deletions CUDATools/src/reflection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
Expand All @@ -78,9 +76,7 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
end

compiled = CUDACore.compile(job)
CUPTI.enable!(cfg) do
CUDACore.link(job, compiled)
end
CUPTI.@enable! cfg CUDACore.link(job, compiled)

return
end
Expand All @@ -96,11 +92,9 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

seen_modules = Set{UInt32}()
Expand All @@ -121,7 +115,7 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
disassemble_cubin(io, cubin; raw)
end

CUPTI.enable!(f, cfg)
CUPTI.@enable! cfg f()

return
end
Expand Down Expand Up @@ -177,7 +171,8 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
source = methodinstance(typeof(func), Base.to_tuple_type(types))
config = CUDACore.compiler_config(device(); kernel, compiler_kwargs...)
job = CompilerJob(source, config)
GPUCompiler.$method($(args...); kwargs...)
# use frozen world to avoid recompiling the compiler infrastructure
CUDACore.invoke_frozen(GPUCompiler.$method, $(args...); kwargs...)
end
$method(@nospecialize(func), @nospecialize(types); kwargs...) =
$method(stdout, func, types; kwargs...)
Expand Down
2 changes: 2 additions & 0 deletions lib/cublas/src/cuBLAS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUBLAS cuBLAS false

Expand Down
16 changes: 16 additions & 0 deletions lib/cublas/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Warm the cuBLAS handle constructor so first use is cheap.
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})

# In-place GEMM for the four standard BLAS element types.
for elty in (Float32, Float64, ComplexF32, ComplexF64)
    mat = CUDACore.CuMatrix{elty}
    precompile(Tuple{typeof(gemm!), Char, Char, elty, mat, mat, elty, mat})
end

# High-level dense matrix multiplication entry points.
for elty in (Float32, Float64)
    arr = CUDACore.CuArray{elty, 2, CUDACore.DeviceMemory}
    precompile(Tuple{typeof(*), arr, arr})
end
2 changes: 2 additions & 0 deletions lib/cudnn/src/cuDNN.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,6 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

end
2 changes: 2 additions & 0 deletions lib/cudnn/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Warm the cuDNN handle constructor so the first handle creation at runtime
# doesn't pay compilation latency.
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})
2 changes: 2 additions & 0 deletions lib/cufft/src/cuFFT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUFFT cuFFT false

Expand Down
9 changes: 9 additions & 0 deletions lib/cufft/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Precompile FFT plan construction for the common element types, for both
# 1-D and 2-D arrays.
for elty in (ComplexF32, ComplexF64), (rank, dims) in ((1, Tuple{Int}), (2, Tuple{Int, Int}))
    precompile(Tuple{typeof(plan_fft!), CUDACore.CuArray{elty, rank}, dims})
end
for elty in (Float32, Float64), (rank, dims) in ((1, Tuple{Int}), (2, Tuple{Int, Int}))
    precompile(Tuple{typeof(plan_rfft), CUDACore.CuArray{elty, rank}, dims})
end
2 changes: 2 additions & 0 deletions lib/cupti/src/CUPTI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,6 @@ function __init__()
end
end

include("precompile.jl")

end
1 change: 1 addition & 0 deletions lib/cupti/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
precompile(Tuple{typeof(version)})
Loading
Loading