Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down Expand Up @@ -59,6 +60,7 @@ LazyArtifacts = "1"
Libdl = "1"
LinearAlgebra = "1"
Logging = "1"
PrecompileTools = "1"
Preferences = "1"
Printf = "1"
Random = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDACore/src/CUDACore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ using Libdl

using Printf

using PrecompileTools

# Julia has several notions of `sizeof`
# - Base.sizeof is the size of an object in memory
# - Base.aligned_sizeof is the size of an object in an array/inline alloced
Expand Down
2 changes: 1 addition & 1 deletion CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ function compile(@nospecialize(job::CompilerJob))
# lower to PTX
# TODO: on 1.9, this actually creates a context. cache those.
asm, meta = JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
invoke_frozen(GPUCompiler.compile, :asm, job)
end

# check if we'll need the device runtime
Expand Down
27 changes: 27 additions & 0 deletions CUDACore/src/initialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@
const _initialized = Ref{Bool}(false)
const _initialization_error = Ref{String}()

# World age recorded by `__init__`. Invoking the GPU compiler infrastructure in
# this world lets native code that was cached during precompilation
# (typeinf_local, etc.) be reused even when later method definitions would
# otherwise have invalidated it. The default of typemax(UInt) makes
# `invoke_in_world` clamp to the current world, so calls made before `__init__`
# runs (e.g. during precompilation itself) behave like ordinary invocations.
const _initialization_world = Ref{UInt}(typemax(UInt))

"""
    invoke_frozen(f, args...; kwargs...)

Call `f(args...; kwargs...)` in the world age captured at `__init__` time.
Running in that frozen world allows precompiled native code for the GPU
compiler infrastructure (typeinf_local, etc.) to be reused, avoiding
expensive recompilation.
"""
@inline function invoke_frozen(f, args...; kwargs...)
    world = _initialization_world[]
    # normalize the keyword pairs into a NamedTuple, as `Core.kwcall` expects
    nt = merge(NamedTuple(), kwargs)
    if isempty(nt)
        return Base.invoke_in_world(world, f, args...)
    else
        return Base.invoke_in_world(world, Core.kwcall, nt, f, args...)
    end
end

"""
functional(show_reason=false)

Expand Down Expand Up @@ -207,6 +230,10 @@ function __init__()
end
end

# capture the world age so that the compiler infrastructure can be invoked
# in this world, reusing precompiled native code for typeinf_local etc.
_initialization_world[] = Base.get_world_counter()

_initialized[] = true
end

Expand Down
55 changes: 42 additions & 13 deletions CUDACore/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,45 @@
@compile_workload begin
# compile a dummy kernel to PTX to precompile the GPUCompiler pipeline.
# this doesn't need a GPU — it only uses LLVM.
let
function _precompile_vadd(a)
i = threadIdx().x
@inbounds a[i] += 1f0
return nothing
end

# array
precompile(CuArray, (Vector{Int},))
llvm_support = llvm_compat()
llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))

# compilation
precompile(compiler_cache, (CuContext,))
#precompile(compiler_config, (CuDevice,))
precompile(compile, (CompilerJob,))
precompile(link, (CompilerJob,NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}))
precompile(create_exceptions!, (CuModule,))
precompile(run_and_collect, (Cmd,))
target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)

# launch
precompile(cudaconvert, (Function,))
precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
source = methodinstance(typeof(_precompile_vadd), tt)
job = CompilerJob(source, config)

# On Julia < 1.12, GPU compilation during precompilation leaks foreign
# MIs into native compilation, causing LLVM errors
# (e.g. "Cannot select: intrinsic %llvm.nvvm.membar.sys").
@static if VERSION >= v"1.12-"
JuliaContext() do ctx
GPUCompiler.compile(:asm, job)
end
end
end
end

# kernel launch infrastructure
precompile(Tuple{typeof(cufunction), typeof(identity), Type{Tuple{Nothing}}})
precompile(Tuple{typeof(link), CompilerJob, NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}})

# GPUCompiler compilation pipeline (specialized for CUDACore's compile/link)
precompile(Tuple{typeof(GPUCompiler.actual_compilation),
Dict{Any, CuFunction}, Core.MethodInstance, UInt64,
CUDACompilerConfig, typeof(compile), typeof(link)})

# scalar reference (used by cuBLAS for alpha/beta parameters)
precompile(Tuple{Type{CuRefValue{Float32}}, Float32})
# memory-pool release path — NOTE(review): presumably hit when freeing device
# arrays; grouped here for lack of a better heading, confirm against pool.jl
precompile(Tuple{typeof(pool_free), Managed{DeviceMemory}})
2 changes: 2 additions & 0 deletions CUDATools/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Expand All @@ -31,6 +32,7 @@ GPUCompiler = "1.4"
LLVM = "9.3.1"
NVML = "=6.0.0"
NVTX = "1"
PrecompileTools = "1"
Preferences = "1"
PrettyTables = "3"
Printf = "1"
Expand Down
2 changes: 2 additions & 0 deletions CUDATools/src/CUDATools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ include("reflection.jl")
include("profile.jl")
include("utilities.jl")

include("precompile.jl")

end
46 changes: 46 additions & 0 deletions CUDATools/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# @profile infrastructure (GPU-dependent, can't execute during precompilation)
precompile(Tuple{typeof(Profile.detect_cupti)})
precompile(Tuple{typeof(Profile.profile_internally), Function})
precompile(Tuple{typeof(Profile.capture), CUPTI.ActivityConfig})

# provides @compile_workload, which executes and caches the workload below
using PrecompileTools

@compile_workload begin
# exercise the @profile display path with a dummy result (no GPU needed).
# the show method expects at least two cuCtxSynchronize entries in the host trace
# to delimit the profiled region, and at least one event between them.
dummy = Profile.ProfileResults(;
host = (
id = Int[1, 2, 3, 4],
start = Float64[0.0, 0.001, 0.002, 0.010],
stop = Float64[0.001, 0.002, 0.009, 0.011],
name = String["cuCtxSynchronize", "cuCtxSynchronize",
"cuLaunchKernel", "cuCtxSynchronize"],
tid = Int[1, 1, 1, 1],
),
# a single device-side kernel record; its id (3) matches the host
# cuLaunchKernel entry — NOTE(review): assumed to be the correlation key,
# confirm against the ProfileResults show method
device = (
id = Int[3],
start = Float64[0.003],
stop = Float64[0.008],
name = String["kernel"],
device = Int[0],
context = Int[1],
stream = Int[1],
grid = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
block = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
registers = Union{Missing,Int64}[32],
shared_mem = Union{Missing,@NamedTuple{static::Int64,dynamic::Int64}}[(static=0,dynamic=0)],
local_mem = Union{Missing,@NamedTuple{thread::Int64,total::Int64}}[(thread=0,total=0)],
size = Union{Missing,Int64}[missing],
),
# empty NVTX trace: the dummy result contains no user ranges/marks
nvtx = (
id = Int[],
start = Float64[],
type = Symbol[],
tid = Int[],
name = Union{Missing,String}[],
domain = Union{Missing,String}[],
),
)
# render to devnull so only the display code path gets compiled, nothing is printed
show(devnull, dummy)
end
6 changes: 3 additions & 3 deletions CUDATools/src/profile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ end
# external profiler
#

function profile_externally(f)
function profile_externally(@nospecialize(f))
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

Expand Down Expand Up @@ -366,7 +366,7 @@ Base.@kwdef struct ProfileResults
raw::Bool=false
end

function profile_internally(f; concurrent=true, kwargs...)
function profile_internally(@nospecialize(f); concurrent=true, kwargs...)
activity_kinds = [
# API calls
CUPTI.CUPTI_ACTIVITY_KIND_DRIVER,
Expand All @@ -390,7 +390,7 @@ function profile_internally(f; concurrent=true, kwargs...)
# wait for the device to become idle
CUDACore.cuCtxSynchronize()

CUPTI.enable!(cfg) do
CUPTI.@enable! cfg begin
# perform dummy operations to "warm up" the profiler, and avoid slow first calls.
# we'll skip everything up until the synchronization call during processing
CuArray([1])
Expand Down
25 changes: 10 additions & 15 deletions CUDATools/src/reflection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
Expand All @@ -78,9 +76,7 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
end

compiled = CUDACore.compile(job)
CUPTI.enable!(cfg) do
CUDACore.link(job, compiled)
end
CUPTI.@enable! cfg CUDACore.link(job, compiled)

return
end
Expand All @@ -96,11 +92,9 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.enable!(cfg) do
# do nothing
end
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
CUPTI.@enable! warmup_cfg nothing
end

seen_modules = Set{UInt32}()
Expand All @@ -121,7 +115,7 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
disassemble_cubin(io, cubin; raw)
end

CUPTI.enable!(f, cfg)
CUPTI.@enable! cfg f()

return
end
Expand Down Expand Up @@ -177,7 +171,8 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
source = methodinstance(typeof(func), Base.to_tuple_type(types))
config = CUDACore.compiler_config(device(); kernel, compiler_kwargs...)
job = CompilerJob(source, config)
GPUCompiler.$method($(args...); kwargs...)
# use frozen world to avoid recompiling the compiler infrastructure
CUDACore.invoke_frozen(GPUCompiler.$method, $(args...); kwargs...)
end
$method(@nospecialize(func), @nospecialize(types); kwargs...) =
$method(stdout, func, types; kwargs...)
Expand Down
2 changes: 2 additions & 0 deletions lib/cublas/src/cuBLAS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUBLAS cuBLAS false

Expand Down
16 changes: 16 additions & 0 deletions lib/cublas/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Precompile cuBLAS entry points so first use doesn't pay compilation latency.

# per-context handle creation
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})

# in-place GEMM, specialized for each commonly used element type
for elty in (Float32, Float64, ComplexF32, ComplexF64)
    precompile(Tuple{typeof(gemm!), Char, Char, elty,
                     CUDACore.CuMatrix{elty}, CUDACore.CuMatrix{elty},
                     elty, CUDACore.CuMatrix{elty}})
end

# high-level matrix multiplication entry point
for elty in (Float32, Float64)
    precompile(Tuple{typeof(*),
                     CUDACore.CuArray{elty, 2, CUDACore.DeviceMemory},
                     CUDACore.CuArray{elty, 2, CUDACore.DeviceMemory}})
end
2 changes: 2 additions & 0 deletions lib/cudnn/src/cuDNN.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,6 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

end
2 changes: 2 additions & 0 deletions lib/cudnn/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# handle creation
# NOTE(review): mirrors the cuBLAS precompile file; handle_ctor is the only
# GPU-independent entry point worth caching here — confirm nothing else applies
precompile(Tuple{typeof(handle_ctor), CUDACore.CuContext})
2 changes: 2 additions & 0 deletions lib/cufft/src/cuFFT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ function __init__()
_initialized[] = true
end

include("precompile.jl")

# deprecated binding for backwards compatibility
Base.@deprecate_binding CUFFT cuFFT false

Expand Down
9 changes: 9 additions & 0 deletions lib/cufft/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# FFT plan creation for common types/dimensions
for T in (ComplexF32, ComplexF64)
precompile(Tuple{typeof(plan_fft!), CUDACore.CuArray{T, 1}, Tuple{Int}})
precompile(Tuple{typeof(plan_fft!), CUDACore.CuArray{T, 2}, Tuple{Int, Int}})
end
for T in (Float32, Float64)
precompile(Tuple{typeof(plan_rfft), CUDACore.CuArray{T, 1}, Tuple{Int}})
precompile(Tuple{typeof(plan_rfft), CUDACore.CuArray{T, 2}, Tuple{Int, Int}})
end
2 changes: 2 additions & 0 deletions lib/cupti/src/CUPTI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,6 @@ function __init__()
end
end

include("precompile.jl")

end
1 change: 1 addition & 0 deletions lib/cupti/src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# zero-argument CUPTI library version query — NOTE(review): presumably the
# first call made during initialization; confirm against __init__
precompile(Tuple{typeof(version)})
Loading
Loading