Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down Expand Up @@ -59,6 +60,7 @@ LazyArtifacts = "1"
Libdl = "1"
LinearAlgebra = "1"
Logging = "1"
PrecompileTools = "1"
Preferences = "1"
Printf = "1"
Random = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDACore/src/CUDACore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ using Libdl

using Printf

using PrecompileTools

# Julia has several notions of `sizeof`
# - Base.sizeof is the size of an object in memory
# - Base.aligned_sizeof is the size of an object in an array/inline alloced
Expand Down
2 changes: 1 addition & 1 deletion CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ function compile(@nospecialize(job::CompilerJob))
# lower to PTX
# TODO: on 1.9, this actually creates a context. cache those.
asm, meta = JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
invoke_frozen(GPUCompiler.compile, :asm, job)
end

# check if we'll need the device runtime
Expand Down
27 changes: 27 additions & 0 deletions CUDACore/src/initialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@
const _initialized = Ref{Bool}(false)
const _initialization_error = Ref{String}()

# World age recorded by `__init__`. Invoking the GPU compiler infrastructure in
# this world lets native code that was cached during precompilation
# (typeinf_local, etc.) be reused even when later method definitions would
# otherwise have invalidated it. The default of typemax(UInt) makes
# `invoke_in_world` clamp to the current world, so calls made before `__init__`
# runs (e.g. during precompilation itself) behave like ordinary invocations.
const _initialization_world = Ref{UInt}(typemax(UInt))

"""
    invoke_frozen(f, args...; kwargs...)

Call `f(args...; kwargs...)` in the world age captured at `__init__` time.
Running in that frozen world allows precompiled native code for the GPU
compiler infrastructure (typeinf_local, etc.) to be reused, avoiding
expensive recompilation.
"""
@inline function invoke_frozen(f, args...; kwargs...)
    world = _initialization_world[]
    # normalize the keyword pairs into a NamedTuple, as `Core.kwcall` expects
    nt = merge(NamedTuple(), kwargs)
    if isempty(nt)
        return Base.invoke_in_world(world, f, args...)
    else
        return Base.invoke_in_world(world, Core.kwcall, nt, f, args...)
    end
end

"""
functional(show_reason=false)

Expand Down Expand Up @@ -207,6 +230,10 @@ function __init__()
end
end

# capture the world age so that the compiler infrastructure can be invoked
# in this world, reusing precompiled native code for typeinf_local etc.
_initialization_world[] = Base.get_world_counter()

_initialized[] = true
end

Expand Down
55 changes: 42 additions & 13 deletions CUDACore/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,45 @@
@compile_workload begin
# compile a dummy kernel to PTX to precompile the GPUCompiler pipeline.
# this doesn't need a GPU — it only uses LLVM.
let
function _precompile_vadd(a)
i = threadIdx().x
@inbounds a[i] += 1f0
return nothing
end

# array
precompile(CuArray, (Vector{Int},))
llvm_support = llvm_compat()
llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))

# compilation
precompile(compiler_cache, (CuContext,))
#precompile(compiler_config, (CuDevice,))
precompile(compile, (CompilerJob,))
precompile(link, (CompilerJob,NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}))
precompile(create_exceptions!, (CuModule,))
precompile(run_and_collect, (Cmd,))
target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)

# launch
precompile(cudaconvert, (Function,))
precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
source = methodinstance(typeof(_precompile_vadd), tt)
job = CompilerJob(source, config)

# On Julia < 1.12, GPU compilation during precompilation leaks foreign
# MIs into native compilation, causing LLVM errors
# (e.g. "Cannot select: intrinsic %llvm.nvvm.membar.sys").
@static if VERSION >= v"1.12-"
JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
end
end
end
end

# kernel launch infrastructure
precompile(Tuple{typeof(cufunction), typeof(identity), Type{Tuple{Nothing}}})
precompile(Tuple{typeof(link), CompilerJob, NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}})

# GPUCompiler compilation pipeline (specialized for CUDACore's compile/link)
precompile(Tuple{typeof(GPUCompiler.actual_compilation),
Dict{Any, CuFunction}, Core.MethodInstance, UInt64,
CUDACompilerConfig, typeof(compile), typeof(link)})

# scalar reference (used by cuBLAS for alpha/beta parameters)
precompile(Tuple{Type{CuRefValue{Float32}}, Float32})
# memory-pool release path — NOTE(review): presumably hit when freeing device
# arrays; grouped here for lack of a better heading, confirm against pool.jl
precompile(Tuple{typeof(pool_free), Managed{DeviceMemory}})
2 changes: 2 additions & 0 deletions CUDATools/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Expand All @@ -31,6 +32,7 @@ GPUCompiler = "1.4"
LLVM = "9.3.1"
NVML = "=6.0.0"
NVTX = "1"
PrecompileTools = "1"
Preferences = "1"
PrettyTables = "3"
Printf = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDATools/src/CUDATools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ include("reflection.jl")
include("profile.jl")
include("utilities.jl")

include("precompile.jl")

end
46 changes: 46 additions & 0 deletions CUDATools/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# @profile infrastructure (GPU-dependent, can't execute during precompilation)
precompile(Tuple{typeof(Profile.detect_cupti)})
precompile(Tuple{typeof(Profile.profile_internally), Function})
precompile(Tuple{typeof(Profile.capture), CUPTI.ActivityConfig})

# provides @compile_workload, which executes and caches the workload below
using PrecompileTools

@compile_workload begin
# exercise the @profile display path with a dummy result (no GPU needed).
# the show method expects at least two cuCtxSynchronize entries in the host trace
# to delimit the profiled region, and at least one event between them.
dummy = Profile.ProfileResults(;
host = (
id = Int[1, 2, 3, 4],
start = Float64[0.0, 0.001, 0.002, 0.010],
stop = Float64[0.001, 0.002, 0.009, 0.011],
name = String["cuCtxSynchronize", "cuCtxSynchronize",
"cuLaunchKernel", "cuCtxSynchronize"],
tid = Int[1, 1, 1, 1],
),
# a single device-side kernel record; its id (3) matches the host
# cuLaunchKernel entry — NOTE(review): assumed to be the correlation key,
# confirm against the ProfileResults show method
device = (
id = Int[3],
start = Float64[0.003],
stop = Float64[0.008],
name = String["kernel"],
device = Int[0],
context = Int[1],
stream = Int[1],
grid = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
block = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
registers = Union{Missing,Int64}[32],
shared_mem = Union{Missing,@NamedTuple{static::Int64,dynamic::Int64}}[(static=0,dynamic=0)],
local_mem = Union{Missing,@NamedTuple{thread::Int64,total::Int64}}[(thread=0,total=0)],
size = Union{Missing,Int64}[missing],
),
# empty NVTX trace: the dummy result contains no user ranges/marks
nvtx = (
id = Int[],
start = Float64[],
type = Symbol[],
tid = Int[],
name = Union{Missing,String}[],
domain = Union{Missing,String}[],
),
)
# render to devnull so only the display code path gets compiled, nothing is printed
show(devnull, dummy)
end
6 changes: 3 additions & 3 deletions CUDATools/src/profile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ end
# external profiler
#

function profile_externally(f)
function profile_externally(@nospecialize(f))
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

Expand Down Expand Up @@ -366,7 +366,7 @@ Base.@kwdef struct ProfileResults
raw::Bool=false
end

function profile_internally(f; concurrent=true, kwargs...)
function profile_internally(@nospecialize(f); concurrent=true, kwargs...)
activity_kinds = [
# API calls
CUPTI.CUPTI_ACTIVITY_KIND_DRIVER,
Expand All @@ -390,7 +390,7 @@ function profile_internally(f; concurrent=true, kwargs...)
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

CUPTI.enable!(cfg) do
CUPTI.@enable! cfg begin
# perform dummy operations to "warm up" the profiler, and avoid slow first calls.
# we'll skip everything up until the synchronization call during processing
CuArray([1])
Expand Down
25 changes: 10 additions & 15 deletions CUDATools/src/reflection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
Expand All @@ -78,9 +76,7 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
end

compiled = CUDACore.compile(job)
CUPTI.enable!(cfg) do
CUDACore.link(job, compiled)
end
CUPTI.@enable! cfg CUDACore.link(job, compiled)

return
end
Expand All @@ -96,11 +92,9 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

seen_modules = Set{UInt32}()
Expand All @@ -121,7 +115,7 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
disassemble_cubin(io, cubin; raw)
end

CUPTI.enable!(f, cfg)
CUPTI.@enable! cfg f()

return
end
Expand Down Expand Up @@ -177,7 +171,8 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
source = methodinstance(typeof(func), Base.to_tuple_type(types))
config = CUDACore.compiler_config(device(); kernel, compiler_kwargs...)
job = CompilerJob(source, config)
GPUCompiler.$method($(args...); kwargs...)
# use frozen world to avoid recompiling the compiler infrastructure
CUDACore.invoke_frozen(GPUCompiler.$method, $(args...); kwargs...)
end
$method(@nospecialize(func), @nospecialize(types); kwargs...) =
$method(stdout, func, types; kwargs...)
Expand Down
2 changes: 2 additions & 0 deletions lib/cublas/src/cuBLAS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUBLAS cuBLAS false

Expand Down
16 changes: 16 additions & 0 deletions lib/cublas/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Precompile cuBLAS entry points so first use doesn't pay compilation latency.

# per-context handle creation
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})

# in-place GEMM, specialized for each commonly used element type
for elty in (Float32, Float64, ComplexF32, ComplexF64)
    precompile(Tuple{typeof(gemm!), Char, Char, elty,
                     CUDACore.CuMatrix{elty}, CUDACore.CuMatrix{elty},
                     elty, CUDACore.CuMatrix{elty}})
end

# high-level matrix multiplication entry point
for elty in (Float32, Float64)
    precompile(Tuple{typeof(*),
                     CUDACore.CuArray{elty, 2, CUDACore.DeviceMemory},
                     CUDACore.CuArray{elty, 2, CUDACore.DeviceMemory}})
end
2 changes: 2 additions & 0 deletions lib/cudnn/src/cuDNN.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,6 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

end
2 changes: 2 additions & 0 deletions lib/cudnn/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# handle creation
# NOTE(review): mirrors the cuBLAS precompile file; handle_ctor is the only
# GPU-independent entry point worth caching here — confirm nothing else applies
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})
2 changes: 2 additions & 0 deletions lib/cufft/src/cuFFT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUFFT cuFFT false

Expand Down
9 changes: 9 additions & 0 deletions lib/cufft/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# FFT plan creation for common types/dimensions
for T in (ComplexF32, ComplexF64)
precompile(Tuple{typeof(plan_fft!), CUDACore.CuArray{T, 1}, Tuple{Int}})
precompile(Tuple{typeof(plan_fft!), CUDACore.CuArray{T, 2}, Tuple{Int, Int}})
end
for T in (Float32, Float64)
precompile(Tuple{typeof(plan_rfft), CUDACore.CuArray{T, 1}, Tuple{Int}})
precompile(Tuple{typeof(plan_rfft), CUDACore.CuArray{T, 2}, Tuple{Int, Int}})
end
2 changes: 2 additions & 0 deletions lib/cupti/src/CUPTI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,6 @@ function __init__()
end
end

include("precompile.jl")

end
1 change: 1 addition & 0 deletions lib/cupti/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# zero-argument CUPTI library version query — NOTE(review): presumably the
# first call made during initialization; confirm against __init__
precompile(Tuple{typeof(version)})
Loading
Loading