diff --git a/src/driver.jl b/src/driver.jl index 9e05eb63..3b372e01 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -43,53 +43,48 @@ end export compile -# NOTE: the keyword arguments to compile/codegen control those aspects of compilation that -# might have to be changed (e.g. set libraries=false when recursing, or set -# strip=true for reflection). What remains defines the compilation job itself, -# and those values are contained in the CompilerJob struct. - # (::CompilerJob) const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(target::Symbol, job::CompilerJob; kwargs...) - -Compile a function `f` invoked with types `tt` for device capability `cap` to one of the -following formats as specified by the `target` argument: `:julia` for Julia IR, `:llvm` for -LLVM IR and `:asm` for machine code. - -The following keyword arguments are supported: -- `toplevel`: indicates that this compilation is the outermost invocation of the compiler - (default: true) -- `libraries`: link the GPU runtime and `libdevice` libraries (default: true, if toplevel) -- `optimize`: optimize the code (default: true, if toplevel) -- `cleanup`: run cleanup passes on the code (default: true, if toplevel) -- `validate`: enable optional validation of input and outputs (default: true, if toplevel) -- `strip`: strip non-functional metadata and debug information (default: false) -- `only_entry`: only keep the entry function, remove all others (default: false). - This option is only for internal use, to implement reflection's `dump_module`. - -Other keyword arguments can be found in the documentation of [`cufunction`](@ref). + compile(target::Symbol, job::CompilerJob) + +Compile a `job` to one of the following formats as specified by the `target` argument: +`:julia` for Julia IR, `:llvm` for LLVM IR and `:asm` for machine code. """ function compile(target::Symbol, @nospecialize(job::CompilerJob); kwargs...) + # XXX: remove on next major version + if !isempty(kwargs) + Base.depwarn("The GPUCompiler `compile` API does not take keyword arguments anymore. Use CompilerConfig instead.", :compile) + config = CompilerConfig(job.config; kwargs...) + job = CompilerJob(job.source, config) + end + if compile_hook[] !== nothing compile_hook[](job) end - return codegen(target, job; kwargs...) + return compile_unhooked(target, job) end -function codegen(output::Symbol, @nospecialize(job::CompilerJob); toplevel::Bool=true, - libraries::Bool=toplevel, optimize::Bool=toplevel, cleanup::Bool=toplevel, - validate::Bool=toplevel, strip::Bool=false, only_entry::Bool=false, - parent_job::Union{Nothing, CompilerJob}=nothing) +# XXX: remove on next major version +function codegen(output::Symbol, @nospecialize(job::CompilerJob); kwargs...) + if !isempty(kwargs) + Base.depwarn("The GPUCompiler `codegen` function is an internal API. Use `GPUCompiler.compile` (with any kwargs passed to `CompilerConfig`) instead.", :codegen) + config = CompilerConfig(job.config; kwargs...) + job = CompilerJob(job.source, config) + end + compile_unhooked(output, job) +end + +function compile_unhooked(output::Symbol, @nospecialize(job::CompilerJob); kwargs...) if context(; throw_error=false) === nothing error("No active LLVM context. 
Use `JuliaContext()` do-block syntax to create one.") end @timeit_debug to "Validation" begin check_method(job) # not optional - validate && check_invocation(job) + job.config.validate && check_invocation(job) end prepare_job!(job) @@ -97,10 +92,10 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); toplevel::Bool ## LLVM IR - ir, ir_meta = emit_llvm(job; libraries, toplevel, optimize, cleanup, only_entry, validate) + ir, ir_meta = emit_llvm(job) if output == :llvm - if strip + if job.config.strip @timeit_debug to "strip debug info" strip_debuginfo!(ir) end @@ -117,7 +112,7 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); toplevel::Bool else error("Unknown assembly format $output") end - asm, asm_meta = emit_asm(job, ir; strip, validate, format) + asm, asm_meta = emit_asm(job, ir, format) if output == :asm || output == :obj return asm, (; asm_meta..., ir_meta..., ir) @@ -156,9 +151,14 @@ end const __llvm_initialized = Ref(false) -@locked function emit_llvm(@nospecialize(job::CompilerJob); toplevel::Bool, - libraries::Bool, optimize::Bool, cleanup::Bool, - validate::Bool, only_entry::Bool) +@locked function emit_llvm(@nospecialize(job::CompilerJob); kwargs...) + # XXX: remove on next major version + if !isempty(kwargs) + Base.depwarn("The GPUCompiler `emit_llvm` function is an internal API. Use `GPUCompiler.compile` (with any kwargs passed to `CompilerConfig`) instead.", :emit_llvm) + config = CompilerConfig(job.config; kwargs...) + job = CompilerJob(job.source, config) + end + if !__llvm_initialized[] InitializeAllTargets() InitializeAllTargetInfos() @@ -183,7 +183,8 @@ const __llvm_initialized = Ref(false) entry = finish_module!(job, ir, entry) # deferred code generation - has_deferred_jobs = toplevel && !only_entry && haskey(functions(ir), "deferred_codegen") + has_deferred_jobs = job.config.toplevel && !job.config.only_entry && + haskey(functions(ir), "deferred_codegen") jobs = Dict{CompilerJob, String}(job => entry_fn) if has_deferred_jobs dyn_marker = functions(ir)["deferred_codegen"] @@ -221,8 +222,8 @@ const __llvm_initialized = Ref(false) for dyn_job in keys(worklist) # cached compilation dyn_entry_fn = get!(jobs, dyn_job) do - dyn_ir, dyn_meta = codegen(:llvm, dyn_job; toplevel=false, - parent_job=job) + config = CompilerConfig(dyn_job.config; toplevel=false) + dyn_ir, dyn_meta = codegen(:llvm, CompilerJob(dyn_job; config)) dyn_entry_fn = LLVM.name(dyn_meta.entry) merge!(compiled, dyn_meta.compiled) @assert context(dyn_ir) == context(ir) @@ -258,7 +259,7 @@ const __llvm_initialized = Ref(false) erase!(dyn_marker) end - if libraries + if job.config.toplevel && job.config.libraries # load the runtime outside of a timing block (because it recurses into the compiler) if !uses_julia_runtime(job) runtime = load_runtime(job) @@ -284,7 +285,7 @@ const __llvm_initialized = Ref(false) # mark everything internal except for entrypoints and any exported # global variables. this makes sure that the optimizer can, e.g., # rewrite function signatures. 
- if toplevel + if job.config.toplevel preserved_gvs = collect(values(jobs)) for gvar in globals(ir) if linkage(gvar) == LLVM.API.LLVMExternalLinkage @@ -310,7 +311,7 @@ const __llvm_initialized = Ref(false) # so that we can reconstruct the CompileJob instead of setting it globally end - if optimize + if job.config.toplevel && job.config.optimize @timeit_debug to "optimization" begin optimize!(job, ir; job.config.opt_level) @@ -337,7 +338,7 @@ const __llvm_initialized = Ref(false) entry = functions(ir)[entry_fn] end - if cleanup + if job.config.toplevel && job.config.cleanup @timeit_debug to "clean-up" begin @dispose pb=NewPMPassBuilder() begin add!(pb, RecomputeGlobalsAAPass()) @@ -355,7 +356,7 @@ const __llvm_initialized = Ref(false) # we want to finish the module after optimization, so we cannot do so # during deferred code generation. instead, process the deferred jobs # here. - if toplevel + if job.config.toplevel entry = finish_ir!(job, ir, entry) for (job′, fn′) in jobs @@ -367,7 +368,7 @@ const __llvm_initialized = Ref(false) # replace non-entry function definitions with a declaration # NOTE: we can't do this before optimization, because the definitions of called # functions may affect optimization. - if only_entry + if job.config.only_entry for f in functions(ir) f == entry && continue isdeclaration(f) && continue @@ -377,7 +378,7 @@ const __llvm_initialized = Ref(false) end end - if validate + if job.config.toplevel && job.config.validate @timeit_debug to "Validation" begin check_ir(job, ir) end @@ -390,10 +391,10 @@ const __llvm_initialized = Ref(false) return ir, (; entry, compiled) end -@locked function emit_asm(@nospecialize(job::CompilerJob), ir::LLVM.Module; - strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType) +@locked function emit_asm(@nospecialize(job::CompilerJob), ir::LLVM.Module, + format::LLVM.API.LLVMCodeGenFileType) # NOTE: strip after validation to get better errors - if strip + if job.config.strip @timeit_debug to "Debug info removal" strip_debuginfo!(ir) end diff --git a/src/execution.jl b/src/execution.jl index 95fc7a24..9b4940a7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,12 +8,20 @@ export split_kwargs, assign_args! # split keyword arguments expressions into groups. returns vectors of keyword argument # values, one more than the number of groups (unmatched keywords in the last vector). # intended for use in macros; the resulting groups can be used in expressions. +# can be used at run time, but not in performance critical code. function split_kwargs(kwargs, kw_groups...) kwarg_groups = ntuple(_->[], length(kw_groups) + 1) for kwarg in kwargs # decode - Meta.isexpr(kwarg, :(=)) || throw(ArgumentError("non-keyword argument like option '$kwarg'")) - key, val = kwarg.args + if Meta.isexpr(kwarg, :(=)) + # use in macros + key, val = kwarg.args + elseif kwarg isa Pair{Symbol,<:Any} + # use in functions + key, val = kwarg + else + throw(ArgumentError("non-keyword argument like option '$kwarg'")) + end isa(key, Symbol) || throw(ArgumentError("non-symbolic keyword '$key'")) # find a matching group @@ -182,7 +190,7 @@ end end struct DiskCacheEntry - src::Type # Originally MethodInstance, but upon deserialize they were not uniqued... + src::Type # Originally MethodInstance, but upon deserialize they were not uniqued... 
cfg::CompilerConfig asm end @@ -262,7 +270,16 @@ end obj = linker(job, asm) if ci === nothing - ci = ci_cache_lookup(ci_cache(job), src, world, world)::CodeInstance + ci = ci_cache_lookup(ci_cache(job), src, world, world) + if ci === nothing + error("""Did not find CodeInstance for $job. + + Please make sure that the `compiler` function passed to `cached_compilation` + invokes GPUCompiler with exactly the same configuration as passed to the API. + + Note that you should do this by calling `GPUCompiler.compile`, and not by + using reflection functions (which alter the compiler configuration).""") + end key = (ci, cfg) end cache[key] = obj diff --git a/src/interface.jl b/src/interface.jl index 6de28906..f9c655bf 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -63,6 +63,9 @@ export CompilerConfig # the configuration of the compiler +const CONFIG_KWARGS = [:kernel, :name, :entry_abi, :always_inline, :opt_level, + :libraries, :optimize, :cleanup, :validate, :strip] + """ CompilerConfig(target, params; kernel=true, entry_abi=:specfunc, name=nothing, always_inline=false) @@ -72,20 +75,27 @@ and `params`. Several keyword arguments can be used to customize the compilation process: -- `kernel`: specifies if the function should be compiled as a kernel, or as a regular - function. This is used to determine the calling convention and for validation purposes. -- `entry_abi`: can be either `:specfunc` the default, or `:func`. `:specfunc` expects the - arguments to be passed in registers, simple return values are returned in registers as - well, and complex return values are returned on the stack using `sret`, the calling - convention is `fastcc`. The `:func` abi is simpler with a calling convention of the first - argument being the function itself (to support closures), the second argument being a - pointer to a vector of boxed Julia values and the third argument being the number of - values, the return value will also be boxed. The `:func` abi will internally call the - `:specfunc` abi, but is generally easier to invoke directly. +- `kernel`: specifies if the function should be compiled as a kernel (the default) or as a + plain function. This toggles certain optimizations, rewrites and validations. - `name`: the name that will be used for the entrypoint function. If `nothing` (the default), the name will be generated automatically. +- `entry_abi`: can be either `:specfunc` (the default), or `:func`. + - `:specfunc` expects the arguments to be passed in registers, simple return values are + returned in registers as well, and complex return values are returned on the stack + using `sret`, the calling convention is `fastcc`. + - The `:func` abi is simpler with a calling convention of the first argument being the + function itself (to support closures), the second argument being a pointer to a vector + of boxed Julia values and the third argument being the number of values, the return + value will also be boxed. The `:func` abi will internally call the `:specfunc` abi, but + is generally easier to invoke directly. - `always_inline` specifies if the Julia front-end should inline all functions into one if possible.
+- `opt_level`: the optimization level to use (default: 2) +- `libraries`: link the GPU runtime and `libdevice` libraries (default: true) +- `optimize`: optimize the code (default: true) +- `cleanup`: run cleanup passes on the code (default: true) +- `validate`: enable optional validation of input and outputs (default: true) +- `strip`: strip non-functional metadata and debug information (default: false) """ struct CompilerConfig{T,P} target::T @@ -96,27 +106,49 @@ struct CompilerConfig{T,P} entry_abi::Symbol always_inline::Bool opt_level::Int - - function CompilerConfig(target::AbstractCompilerTarget, - params::AbstractCompilerParams; - kernel=true, - name=nothing, - entry_abi=:specfunc, - always_inline=false, - opt_level=2) + libraries::Bool + optimize::Bool + cleanup::Bool + validate::Bool + strip::Bool + + # internal + toplevel::Bool + only_entry::Bool + + function CompilerConfig(target::AbstractCompilerTarget, params::AbstractCompilerParams; + kernel=true, name=nothing, entry_abi=:specfunc, toplevel=true, + always_inline=false, opt_level=2, optimize=toplevel, + libraries=toplevel, cleanup=toplevel, validate=toplevel, + strip=false, only_entry=false) if entry_abi ∉ (:specfunc, :func) error("Unknown entry_abi=$entry_abi") end new{typeof(target), typeof(params)}(target, params, kernel, name, entry_abi, - always_inline, opt_level) + always_inline, opt_level, libraries, optimize, + cleanup, validate, strip, toplevel, only_entry) end end # copy constructor -CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params, - kernel=cfg.kernel, name=cfg.name, entry_abi=cfg.entry_abi, - always_inline=cfg.always_inline, opt_level=cfg.opt_level) = - CompilerConfig(target, params; kernel, entry_abi, name, always_inline, opt_level) +function CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params, + kernel=cfg.kernel, name=cfg.name, entry_abi=cfg.entry_abi, + always_inline=cfg.always_inline, opt_level=cfg.opt_level, + libraries=cfg.libraries, optimize=cfg.optimize, cleanup=cfg.cleanup, + validate=cfg.validate, strip=cfg.strip, toplevel=cfg.toplevel, + only_entry=cfg.only_entry) + # deriving a non-toplevel job disables certain features + # XXX: should we keep track if any of these were set explicitly in the first place? + # see how PkgEval does that. + if !toplevel + optimize = false + libraries = false + cleanup = false + validate = false + end + CompilerConfig(target, params; kernel, entry_abi, name, always_inline, opt_level, + libraries, optimize, cleanup, validate, strip, toplevel, only_entry) +end function Base.show(io::IO, @nospecialize(cfg::CompilerConfig{T})) where {T} print(io, "CompilerConfig for ", T) @@ -131,6 +163,13 @@ function Base.hash(cfg::CompilerConfig, h::UInt) h = hash(cfg.entry_abi, h) h = hash(cfg.always_inline, h) h = hash(cfg.opt_level, h) + h = hash(cfg.libraries, h) + h = hash(cfg.optimize, h) + h = hash(cfg.cleanup, h) + h = hash(cfg.validate, h) + h = hash(cfg.strip, h) + h = hash(cfg.toplevel, h) + h = hash(cfg.only_entry, h) return h end @@ -144,16 +183,26 @@ using Core: MethodInstance # a specific invocation of the compiler, bundling everything needed to generate code +""" + CompilerJob(source::MethodInstance, config::CompilerConfig, [world=tls_world_age()]) + +Construct a `CompilerJob` that will be used to drive compilation for the given `source` and +`config` in a given `world`. 
+""" struct CompilerJob{T,P} source::MethodInstance config::CompilerConfig{T,P} world::UInt - CompilerJob(src::MethodInstance, cfg::CompilerConfig{T,P}, + CompilerJob(source::MethodInstance, config::CompilerConfig{T,P}, world=tls_world_age()) where {T,P} = - new{T,P}(src, cfg, world) + new{T,P}(source, config, world) end +# copy constructor +CompilerJob(job::CompilerJob; source=job.source, config=job.config, world=job.world) = + CompilerJob(source, config, world) + function Base.hash(job::CompilerJob, h::UInt) h = hash(job.source, h) h = hash(job.config, h) diff --git a/src/precompile.jl b/src/precompile.jl index 2921f24c..8f62451b 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -26,13 +26,13 @@ using PrecompileTools: @setup_workload, @compile_workload source = methodinstance(typeof(kernel), Tuple{}) target = NativeCompilerTarget() params = precompile_module.DummyCompilerParams() - config = CompilerConfig(target, params) + # XXX: on Windows, compiling the GPU runtime leaks GPU code in the native cache, + # so prevent building the runtime library (see JuliaGPU/GPUCompiler.jl#601) + config = CompilerConfig(target, params; libraries=false) job = CompilerJob(source, config) JuliaContext() do ctx - # XXX: on Windows, compiling the GPU runtime leaks GPU code in the native cache, - # so prevent building the runtime library (see JuliaGPU/GPUCompiler.jl#601) - GPUCompiler.compile(:asm, job; libraries=false) + GPUCompiler.compile(:asm, job) end end diff --git a/src/reflection.jl b/src/reflection.jl index 915e7af3..df1a0a43 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -186,8 +186,9 @@ See also: [`@device_code_llvm`](@ref), `InteractiveUtils.code_llvm` function code_llvm(io::IO, @nospecialize(job::CompilerJob); optimize::Bool=true, raw::Bool=false, debuginfo::Symbol=:default, dump_module::Bool=false, kwargs...) # NOTE: jl_dump_function_ir supports stripping metadata, so don't do it in the driver + config = CompilerConfig(job.config; validate=false, strip=false) str = JuliaContext() do ctx - ir, meta = compile(:llvm, job; optimize=optimize, strip=false, validate=false, kwargs...) 
+ ir, meta = compile(:llvm, CompilerJob(job; config)) ts_mod = ThreadSafeModule(ir) entry_fn = meta.entry GC.@preserve ts_mod entry_fn begin @@ -214,9 +215,11 @@ The following keyword arguments are supported: See also: [`@device_code_native`](@ref), `InteractiveUtils.code_native` """ -function code_native(io::IO, @nospecialize(job::CompilerJob); raw::Bool=false, dump_module::Bool=false) +function code_native(io::IO, @nospecialize(job::CompilerJob); + raw::Bool=false, dump_module::Bool=false) + config = CompilerConfig(job.config; strip=!raw, only_entry=!dump_module, validate=false) asm, meta = JuliaContext() do ctx - compile(:asm, job; strip=!raw, only_entry=!dump_module, validate=false) + compile(:asm, CompilerJob(job; config)) end highlight(io, asm, source_code(job.config.target)) end diff --git a/src/rtlib.jl b/src/rtlib.jl index 88f366b9..42faaebf 100644 --- a/src/rtlib.jl +++ b/src/rtlib.jl @@ -68,7 +68,7 @@ end function emit_function!(mod, config::CompilerConfig, f, method) tt = Base.to_tuple_type(method.types) source = generic_methodinstance(f, tt) - new_mod, meta = codegen(:llvm, CompilerJob(source, config); toplevel=false) + new_mod, meta = compile_unhooked(:llvm, CompilerJob(source, config)) ft = function_type(meta.entry) expected_ft = convert(LLVM.FunctionType, method) if return_type(ft) != return_type(expected_ft) @@ -99,7 +99,7 @@ function build_runtime(@nospecialize(job::CompilerJob)) # the compiler job passed into here identifies the job that requires the runtime. # derive a job that represents the runtime itself (notably with kernel=false). - config = CompilerConfig(job.config; kernel=false) + config = CompilerConfig(job.config; kernel=false, toplevel=false) for method in values(Runtime.methods) def = if isa(method.def, Symbol) diff --git a/src/spirv.jl b/src/spirv.jl index 45212173..7e27c346 100644 --- a/src/spirv.jl +++ b/src/spirv.jl @@ -184,8 +184,9 @@ end # reimplementation that uses `spirv-dis`, giving much more pleasant output function code_native(io::IO, job::CompilerJob{SPIRVCompilerTarget}; raw::Bool=false, dump_module::Bool=false) + config = CompilerConfig(job.config; strip=!raw, only_entry=!dump_module, validate=false) obj, _ = JuliaContext() do ctx - compile(:obj, job; strip=!raw, only_entry=!dump_module, validate=false) + compile(:obj, CompilerJob(job; config)) end mktemp() do input_path, input_io write(input_io, obj) diff --git a/test/helpers/bpf.jl b/test/helpers/bpf.jl index 49d9d6e4..d66b6b48 100644 --- a/test/helpers/bpf.jl +++ b/test/helpers/bpf.jl @@ -6,12 +6,12 @@ import ..TestRuntime struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime -function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) +function create_job(@nospecialize(func), @nospecialize(types); kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = BPFCompilerTarget() params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...)
CompilerJob(source, config), kwargs end diff --git a/test/helpers/gcn.jl b/test/helpers/gcn.jl index 2cb371e9..f7f54f85 100644 --- a/test/helpers/gcn.jl +++ b/test/helpers/gcn.jl @@ -6,12 +6,12 @@ import ..TestRuntime struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime -function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) +function create_job(@nospecialize(func), @nospecialize(types); kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = GCNCompilerTarget(dev_isa="gfx900") params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...) CompilerJob(source, config), kwargs end diff --git a/test/helpers/metal.jl b/test/helpers/metal.jl index c45ba4c4..d46f9a89 100644 --- a/test/helpers/metal.jl +++ b/test/helpers/metal.jl @@ -6,12 +6,12 @@ import ..TestRuntime struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime -function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) +function create_job(@nospecialize(func), @nospecialize(types); kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = MetalCompilerTarget(; macos=v"12.2", metal=v"3.0", air=v"3.0") params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...) CompilerJob(source, config), kwargs end diff --git a/test/helpers/native.jl b/test/helpers/native.jl index c1c39ba0..d53ff172 100644 --- a/test/helpers/native.jl +++ b/test/helpers/native.jl @@ -20,13 +20,13 @@ GPUCompiler.runtime_module(::NativeCompilerJob) = TestRuntime GPUCompiler.method_table(@nospecialize(job::NativeCompilerJob)) = job.config.params.method_table GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.params.entry_safepoint -function create_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, - entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false, - method_table=test_method_table, kwargs...) +function create_job(@nospecialize(func), @nospecialize(types); + entry_safepoint::Bool=false, method_table=test_method_table, kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = NativeCompilerTarget() params = CompilerParams(entry_safepoint, method_table) - config = CompilerConfig(target, params; kernel, entry_abi, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...) CompilerJob(source, config), kwargs end @@ -71,7 +71,7 @@ const runtime_cache = Dict{Any, Any}() function compiler(job) JuliaContext() do ctx - GPUCompiler.compile(:asm, job, validate=false) + GPUCompiler.compile(:asm, job) end end @@ -81,7 +81,7 @@ end # simulates cached codegen function cached_execution(@nospecialize(func), @nospecialize(types); kwargs...) - job, kwargs = create_job(func, types; kwargs...) 
+ job, kwargs = create_job(func, types; validate=false, kwargs...) GPUCompiler.cached_compilation(runtime_cache, job.source, job.config, compiler, linker) end diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl index a9f58871..5f8a3c48 100644 --- a/test/helpers/ptx.jl +++ b/test/helpers/ptx.jl @@ -35,15 +35,15 @@ module PTXTestRuntime end GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime -function create_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, - minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, - maxregs=nothing, always_inline=false, kwargs...) +function create_job(@nospecialize(func), @nospecialize(types); + minthreads=nothing, maxthreads=nothing, + blocks_per_sm=nothing, maxregs=nothing, + kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) - target = PTXCompilerTarget(;cap=v"7.0", - minthreads, maxthreads, - blocks_per_sm, maxregs) + target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs) params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...) CompilerJob(source, config), kwargs end diff --git a/test/helpers/spirv.jl b/test/helpers/spirv.jl index 761cd495..73d030d1 100644 --- a/test/helpers/spirv.jl +++ b/test/helpers/spirv.jl @@ -7,14 +7,14 @@ struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, - supports_fp16=true, supports_fp64=true, - backend::Symbol, kwargs...) + supports_fp16=true, supports_fp64=true, backend::Symbol, + kwargs...) + config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = SPIRVCompilerTarget(; backend, validate=true, optimize=true, supports_fp16, supports_fp64) params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel=false, config_kwargs...) 
CompilerJob(source, config), kwargs end diff --git a/test/metal.jl b/test/metal.jl index db626435..ae854353 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -114,7 +114,7 @@ end return end - @test_throws_message(InvalidIRError, Metal.code_llvm(devnull, kernel2, Tuple{Core.LLVMPtr{Float64,1}}; validate=true)) do msg + @test_throws_message(InvalidIRError, Metal.code_execution(kernel2, Tuple{Core.LLVMPtr{Float64,1}})) do msg occursin("unsupported use of double value", msg) end end diff --git a/test/native.jl b/test/native.jl index 4772c49e..19a6297c 100644 --- a/test/native.jl +++ b/test/native.jl @@ -50,10 +50,10 @@ end @noinline inner(x) = x+1 foo(x) = sum(inner, fill(x, 10, 10)) - job, _ = Native.create_job(foo, (Float64,)) + job, _ = Native.create_job(foo, (Float64,); validate=false) JuliaContext() do ctx # shouldn't segfault - ir, meta = GPUCompiler.compile(:llvm, job; validate=false) + ir, meta = GPUCompiler.compile(:llvm, job) meth = only(methods(foo, (Float64,))) @@ -87,8 +87,10 @@ end invocations = Ref(0) function compiler(job) invocations[] += 1 - ir = sprint(io->GPUCompiler.code_llvm(io, job)) - return ir + JuliaContext() do ctx + ir, ir_meta = GPUCompiler.compile(:llvm, job) + string(ir) + end end linker(job, compiled) = compiled cache = Dict() diff --git a/test/native/precompile.jl b/test/native/precompile.jl index c2648701..6fe981a5 100644 --- a/test/native/precompile.jl +++ b/test/native/precompile.jl @@ -56,7 +56,7 @@ precompile_test_harness("Inference caching") do load_path GPUCompiler.enable_disk_cache!() @test GPUCompiler.disk_cache_enabled() == true - job, _ = NativeCompiler.Native.create_job(NativeBackend.kernel, (Vector{Int}, Int)) + job, _ = NativeCompiler.Native.create_job(NativeBackend.kernel, (Vector{Int}, Int); validate=false) @assert job.source == kernel_mi ci = GPUCompiler.ci_cache_lookup(GPUCompiler.ci_cache(job), job.source, job.world, job.world) @assert ci !== nothing diff --git a/test/spirv.jl b/test/spirv.jl index 2d7fb841..e14ccf77 100644 --- a/test/spirv.jl +++ b/test/spirv.jl @@ -48,28 +48,28 @@ end end ir = sprint(io->SPIRV.code_llvm(io, mod.kernel, Tuple{Ptr{Float16}, Float16}; - backend, validate=true)) + backend)) @test occursin("store half", ir) ir = sprint(io->SPIRV.code_llvm(io, mod.kernel, Tuple{Ptr{Float32}, Float32}; - backend, validate=true)) + backend)) @test occursin("store float", ir) ir = sprint(io->SPIRV.code_llvm(io, mod.kernel, Tuple{Ptr{Float64}, Float64}; - backend, validate=true)) + backend)) @test occursin("store double", ir) @test_throws_message(InvalidIRError, - SPIRV.code_llvm(devnull, mod.kernel, Tuple{Ptr{Float16}, Float16}; - backend, supports_fp16=false, validate=true)) do msg + SPIRV.code_execution(mod.kernel, Tuple{Ptr{Float16}, Float16}; + backend, supports_fp16=false)) do msg occursin("unsupported use of half value", msg) && occursin("[1] unsafe_store!", msg) && occursin("[2] kernel", msg) end @test_throws_message(InvalidIRError, - SPIRV.code_llvm(devnull, mod.kernel, Tuple{Ptr{Float64}, Float64}; - backend, supports_fp64=false, validate=true)) do msg + SPIRV.code_execution(mod.kernel, Tuple{Ptr{Float64}, Float64}; + backend, supports_fp64=false)) do msg occursin("unsupported use of double value", msg) && occursin("[1] unsafe_store!", msg) && occursin("[2] kernel", msg)
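
Migration note: the practical effect of this change is that per-compilation options such as `optimize`, `strip`, `validate`, `libraries` and `only_entry` move from keyword arguments on `compile`/`codegen` onto the `CompilerConfig`, with variants derived through the new copy constructors. A minimal sketch of the migration, assuming a native target roughly like the one set up in `test/helpers/native.jl`; the `SketchParams` type and the `libraries=false` shortcut are illustrative assumptions, and a real target would typically also define `GPUCompiler.runtime_module` and related hooks:

```julia
using GPUCompiler, LLVM

# trivial kernel and throwaway params type, purely for illustration
kernel() = nothing
struct SketchParams <: GPUCompiler.AbstractCompilerParams end

source = GPUCompiler.methodinstance(typeof(kernel), Tuple{})
target = GPUCompiler.NativeCompilerTarget()

# before: GPUCompiler.compile(:asm, job; strip=true, validate=false, libraries=false)
# after: the same options are carried by the (immutable) CompilerConfig
config = CompilerConfig(target, SketchParams();
                        kernel=false, strip=true, validate=false, libraries=false)
job = CompilerJob(source, config)

asm, meta = JuliaContext() do ctx
    GPUCompiler.compile(:asm, job)
end

# tweaked variants are derived via the copy constructors instead of extra kwargs
debug_config = CompilerConfig(config; strip=false)
debug_job = CompilerJob(job; config=debug_config)
```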