diff --git a/Project.toml b/Project.toml
index fc258555..36d25dcb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,16 +13,21 @@ OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Random123 = "74087812-796a-5b5d-8853-05524746bad3"
+RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c"
 SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
+[sources]
+SPIRVIntrinsics = {path = "lib/intrinsics"}
+
 [compat]
 Adapt = "4"
 GPUArrays = "11.2.1"
-GPUCompiler = "1.6"
+GPUCompiler = "1.7.1"
 KernelAbstractions = "0.9.2"
 LLVM = "9.1"
 LinearAlgebra = "1"
@@ -30,12 +35,11 @@ OpenCL_jll = "=2024.10.24"
 Preferences = "1"
 Printf = "1"
 Random = "1"
+Random123 = "1.7.1"
+RandomNumbers = "1.6.0"
 Reexport = "1"
 SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
 julia = "1.10"
-
-[sources]
-SPIRVIntrinsics = {path="lib/intrinsics"}
diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl
index c8994813..d0243130 100644
--- a/lib/cl/kernel.jl
+++ b/lib/cl/kernel.jl
@@ -106,11 +106,59 @@ function Base.getproperty(ki::KernelWorkGroupInfo, s::Symbol)
     end
 end
 
+struct KernelSubGroupInfo
+    kernel::Kernel
+    device::Device
+    local_work_size::Vector{Csize_t}
+end
+sub_group_info(k::Kernel, d::Device, l) = KernelSubGroupInfo(k, d, Vector{Csize_t}(l))
+
+# Helper function for getting local size for a specific sub-group count
+function local_size_for_sub_group_count(ki::KernelSubGroupInfo, sub_group_count::Integer)
+    k = getfield(ki, :kernel)
+    d = getfield(ki, :device)
+    input_value = Ref{Csize_t}(sub_group_count)
+    result = Ref{NTuple{3, Csize_t}}()
+    clGetKernelSubGroupInfo(k, d, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT,
+                           sizeof(Csize_t), input_value, sizeof(NTuple{3, Csize_t}), result, C_NULL)
+    return Int.(result[])
+end
+
+function Base.getproperty(ki::KernelSubGroupInfo, s::Symbol)
+    k = getfield(ki, :kernel)
+    d = getfield(ki, :device)
+    lws = getfield(ki, :local_work_size)
+
+    function get(val, typ)
+        result = Ref{typ}()
+        clGetKernelSubGroupInfo(k, d, val, sizeof(lws), lws, sizeof(typ), result, C_NULL)
+        return result[]
+    end
+
+    if s == :max_sub_group_size
+        Int(get(CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, Csize_t))
+    elseif s == :sub_group_count
+        Int(get(CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, Csize_t))
+    elseif s == :local_size_for_sub_group_count
+        # This requires input_value to be the desired sub-group count
+        error("local_size_for_sub_group_count requires specifying desired sub-group count")
+    elseif s == :max_num_sub_groups
+        Int(get(CL_KERNEL_MAX_NUM_SUB_GROUPS, Csize_t))
+    elseif s == :compile_num_sub_groups
+        Int(get(CL_KERNEL_COMPILE_NUM_SUB_GROUPS, Csize_t))
+    elseif s == :compile_sub_group_size
+        Int(get(CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL, Csize_t))
+    else
+        getfield(ki, s)
+    end
+end
+
 
 ## kernel calling
 
 function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing;
-                        global_work_offset=nothing, wait_on::Vector{Event}=Event[])
+                        global_work_offset=nothing, wait_on::Vector{Event}=Event[],
+                        rng_state=false, nargs=nothing)
     max_work_dim = device().max_work_item_dims
     work_dim     = length(global_work_size)
     if work_dim > max_work_dim
@@ -153,6 +201,20 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing;
         # null local size means OpenCL decides
     end
 
+    if rng_state
+        if local_work_size !== nothing
+            num_sub_groups = KernelSubGroupInfo(k, device(), lsize).sub_group_count
+        else
+            num_sub_groups = KernelSubGroupInfo(k, device(), Csize_t[]).max_num_sub_groups
+        end
+        if nargs === nothing
+            nargs = k.num_args - 2
+        end
+        rng_state_size = sizeof(UInt32) * num_sub_groups
+        set_arg!(k, nargs + 1, LocalMem(UInt32, rng_state_size))
+        set_arg!(k, nargs + 2, LocalMem(UInt32, rng_state_size))
+    end
+
     if !isempty(wait_on)
         n_events = length(wait_on)
         wait_event_ids = [evt.id for evt in wait_on]
@@ -189,7 +251,8 @@ end
 function call(
         k::Kernel, args...; global_size = (1,), local_size = nothing,
         global_work_offset = nothing, wait_on::Vector{Event} = Event[],
-        indirect_memory::Vector{AbstractMemory} = AbstractMemory[]
+        indirect_memory::Vector{AbstractMemory} = AbstractMemory[],
+        rng_state=false,
     )
     set_args!(k, args...)
     if !isempty(indirect_memory)
@@ -243,7 +306,7 @@ function call(
             clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(usm_pointers), usm_pointers)
         end
     end
-    enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on)
+    enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on, rng_state, nargs=length(args))
 end
 
 # From `julia/base/reflection.jl`, adjusted to add specialization on `t`.
diff --git a/src/OpenCL.jl b/src/OpenCL.jl
index 4fe5c863..d4f32c60 100644
--- a/src/OpenCL.jl
+++ b/src/OpenCL.jl
@@ -24,6 +24,7 @@ Base.Experimental.@MethodTable(method_table)
 include("device/runtime.jl")
 include("device/array.jl")
 include("device/quirks.jl")
+include("device/random.jl")
 
 # high level implementation
 include("memory.jl")
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 1e6fc506..1ab715d6 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -18,6 +18,86 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
     in(fn, known_intrinsics) ||
     contains(fn, "__spirv_")
 
+GPUCompiler.kernel_state_type(::OpenCLCompilerJob) = KernelState
+
+function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
+                                    mod::LLVM.Module, entry::LLVM.Function)
+    entry = invoke(GPUCompiler.finish_module!,
+                   Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+                   job, mod, entry)
+
+    # if this kernel uses our RNG, we should prime the shared state.
+    # XXX: these transformations should really happen at the Julia IR level...
+    if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
+        # insert call to `initialize_rng_state`
+        f = initialize_rng_state
+        ft = typeof(f)
+        tt = Tuple{}
+
+        # create a deferred compilation job for `initialize_rng_state`
+        src = methodinstance(ft, tt, GPUCompiler.tls_world_age())
+        cfg = CompilerConfig(job.config; kernel=false, name=nothing)
+        job = CompilerJob(src, cfg, job.world)
+        id = length(GPUCompiler.deferred_codegen_jobs) + 1
+        GPUCompiler.deferred_codegen_jobs[id] = job
+
+        # generate IR for calls to `deferred_codegen` and the resulting function pointer
+        top_bb = first(blocks(entry))
+        bb = BasicBlock(top_bb, "initialize_rng")
+        @dispose builder=IRBuilder() begin
+            position!(builder, bb)
+            subprogram = LLVM.subprogram(entry)
+            if subprogram !== nothing
+                loc = DILocation(0, 0, subprogram)
+                debuglocation!(builder, loc)
+            end
+            debuglocation!(builder, first(instructions(top_bb)))
+
+            # call the `deferred_codegen` marker function
+            T_ptr = if LLVM.version() >= v"17"
+                LLVM.PointerType()
+            elseif VERSION >= v"1.12.0-DEV.225"
+                LLVM.PointerType(LLVM.Int8Type())
+            else
+                LLVM.Int64Type()
+            end
+            T_id = convert(LLVMType, Int)
+            deferred_codegen_ft = LLVM.FunctionType(T_ptr, [T_id])
+            deferred_codegen = if haskey(functions(mod), "deferred_codegen")
+                functions(mod)["deferred_codegen"]
+            else
+                LLVM.Function(mod, "deferred_codegen", deferred_codegen_ft)
+            end
+            fptr = call!(builder, deferred_codegen_ft, deferred_codegen, [ConstantInt(id)])
+
+            # call the `initialize_rng_state` function
+            rt = Core.Compiler.return_type(f, tt)
+            llvm_rt = convert(LLVMType, rt)
+            llvm_ft = LLVM.FunctionType(llvm_rt)
+            fptr = inttoptr!(builder, fptr, LLVM.PointerType(llvm_ft))
+            call!(builder, llvm_ft, fptr)
+            br!(builder, top_bb)
+
+            # note the use of the device-side RNG in this kernel
+            push!(function_attributes(entry), StringAttribute("julia.opencl.rng", ""))
+        end
+
+        # XXX: put some of the above behind GPUCompiler abstractions
+        #      (e.g., a compile-time version of `deferred_codegen`)
+    end
+    return entry
+end
+
+function GPUCompiler.finish_linked_module!(@nospecialize(job::OpenCLCompilerJob), mod::LLVM.Module)
+    for f in GPUCompiler.kernels(mod)
+        kernel_intrinsics = Dict(
+            "julia.opencl.random_keys" => (; name = "random_keys", typ = LLVMPtr{UInt32, AS.Workgroup}),
+            "julia.opencl.random_counters" => (; name = "random_counters", typ = LLVMPtr{UInt32, AS.Workgroup}),
+        )
+        GPUCompiler.add_input_arguments!(job, mod, f, kernel_intrinsics)
+    end
+    return
+end
 
 ## compiler implementation (cache, configure, compile, and link)
 
@@ -57,13 +137,17 @@ end
 # compile to executable machine code
 const compilations = Threads.Atomic{Int}(0)
 function compile(@nospecialize(job::CompilerJob))
+    compilations[] += 1
+
     # TODO: this creates a context; cache those.
     obj, meta = JuliaContext() do ctx
-        GPUCompiler.compile(:obj, job)
-    end
-    compilations[] += 1
+        obj, meta = GPUCompiler.compile(:obj, job)
 
-    (obj, entry=LLVM.name(meta.entry))
+        entry = LLVM.name(meta.entry)
+        device_rng = StringAttribute("julia.opencl.rng", "") in collect(function_attributes(meta.entry))
+
+        (; obj, entry, device_rng)
+    end
 end
 
 # link into an executable kernel
@@ -82,5 +166,5 @@ function link(@nospecialize(job::CompilerJob), compiled)
         cl.Program(; source)
     end
     cl.build!(prog)
-    cl.Kernel(prog, compiled.entry)
+    (; kernel=cl.Kernel(prog, compiled.entry), compiled.device_rng)
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index a840f402..68885b8f 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -132,14 +132,15 @@ kernel_convert(arg, indirect_memory::Vector{cl.AbstractMemory} = cl.AbstractMemo
 
 abstract type AbstractKernel{F, TT} end
 
+pass_arg(@nospecialize dt) = !(isghosttype(dt) || Core.Compiler.isconstType(dt))
+
 @inline @generated function (kernel::AbstractKernel{F,TT})(args...;
                                                            call_kwargs...) where {F,TT}
     sig = Tuple{F, TT.parameters...}    # Base.signature_type with a function type
     args = (:(kernel.f), (:(kernel_convert(args[$i], indirect_memory)) for i in 1:length(args))...)
 
     # filter out ghost arguments that shouldn't be passed
-    predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
-    to_pass = map(!predicate, sig.parameters)
+    to_pass = map(pass_arg, sig.parameters)
     call_t =                  Type[x[1] for x in zip(sig.parameters,  to_pass) if x[2]]
     call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass)            if x[2]]
 
@@ -151,12 +152,15 @@ abstract type AbstractKernel{F, TT} end
         end
     end
 
+    pushfirst!(call_t, KernelState)
+    pushfirst!(call_args, :(KernelState(kernel.rng_state ? Base.rand(UInt32) : UInt32(0))))
+
     # finalize types
     call_tt = Base.to_tuple_type(call_t)
 
     quote
         indirect_memory = cl.AbstractMemory[]
-        clcall(kernel.fun, $call_tt, $(call_args...); indirect_memory, call_kwargs...)
+        clcall(kernel.fun, $call_tt, $(call_args...); indirect_memory, kernel.rng_state, call_kwargs...)
     end
 end
 
@@ -167,6 +171,7 @@ end
 struct HostKernel{F,TT} <: AbstractKernel{F,TT}
     f::F
     fun::cl.Kernel
+    rng_state::Bool
 end
 
 
@@ -183,15 +188,15 @@ function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
         cache = compiler_cache(ctx)
         source = methodinstance(F, tt)
         config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig
-        fun = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+        linked = GPUCompiler.cached_compilation(cache, source, config, compile, link)
 
         # create a callable object that captures the function instance. we don't need to think
         # about world age here, as GPUCompiler already does and will return a different object
-        h = hash(fun, hash(f, hash(tt)))
+        h = hash(linked.kernel, hash(f, hash(tt)))
         kernel = get(_kernel_instances, h, nothing)
         if kernel === nothing
             # create the kernel state object
-            kernel = HostKernel{F,tt}(f, fun)
+            kernel = HostKernel{F,tt}(f, linked.kernel, linked.device_rng)
             _kernel_instances[h] = kernel
         end
         return kernel::HostKernel{F,tt}
diff --git a/src/device/random.jl b/src/device/random.jl
new file mode 100644
index 00000000..9b9917fe
--- /dev/null
+++ b/src/device/random.jl
@@ -0,0 +1,234 @@
+## random number generation
+
+using Random
+import RandomNumbers
+
+# local memory with the actual seed, per subgroup, set by `initialize_rng_state`` or overridden by calling `seed!`
+@inline function global_random_keys()
+    n = get_num_sub_groups()
+    ptr = random_keys()::LLVMPtr{UInt32, AS.Workgroup}
+    return CLDeviceArray{UInt32, 1, AS.Workgroup}((n,), ptr)
+end
+
+# local memory with per-subgroup counters, incremented when generating numbers
+@inline function global_random_counters()
+    n = get_num_sub_groups()
+    ptr = random_counters()::LLVMPtr{UInt32, AS.Workgroup}
+    return CLDeviceArray{UInt32, 1, AS.Workgroup}((n,), ptr)
+end
+
+# initialization function, called automatically at the start of each kernel
+function initialize_rng_state()
+    subgroup_id = get_sub_group_id()
+    @inbounds global_random_keys()[subgroup_id] = kernel_state().random_seed
+    @inbounds global_random_counters()[subgroup_id] = 0
+end
+
+# generators
+
+using Random123: philox2x_round, philox2x_bumpkey
+
+# GPU-compatible/optimized version of the generator from Random123.jl
+struct Philox2x32{R} <: RandomNumbers.AbstractRNG{UInt64} end
+
+# default to 7 rounds; enough to pass SmallCrush
+@inline Philox2x32() = Philox2x32{7}()
+
+@inline function Base.getproperty(rng::Philox2x32, field::Symbol)
+    subgroup_id = get_sub_group_id()
+
+    if field === :key
+        @inbounds global_random_keys()[subgroup_id]
+    elseif field === :ctr1
+        @inbounds global_random_counters()[subgroup_id]
+    elseif field === :ctr2
+        unsafe_trunc(UInt32, get_global_linear_id())
+    end
+end
+
+@inline function Base.setproperty!(rng::Philox2x32, field::Symbol, x)
+    subgroup_id = get_sub_group_id()
+
+    if field === :key
+        @inbounds global_random_keys()[subgroup_id] = x
+    elseif field === :ctr1
+        @inbounds global_random_counters()[subgroup_id] = x
+    end
+    return rng
+end
+
+@device_override @inline Random.default_rng() = Philox2x32()
+
+"""
+    Random.seed!(rng::Philox2x32, seed::Integer, [counter::Integer=0])
+
+Seed the on-device Philox2x32 generator with an UInt32 number.
+Should be called by at least one thread per warp.
+"""
+function Random.seed!(rng::Philox2x32, seed::Integer, counter::Integer=UInt32(0))
+    rng.key = seed % UInt32
+    rng.ctr1 = counter
+    return
+end
+
+# seeding the implicit default RNG
+@static if VERSION >= v"1.11-"
+    @device_override Random.seed!(seed) =
+        Random.seed!(Random.default_rng(), seed)
+else
+    @device_override Random.seed!(::Random._GLOBAL_RNG, seed) =
+        Random.seed!(Random.default_rng(), seed)
+end
+
+@static if VERSION >= v"1.11-"
+    # `Random.seed!(::AbstractRNG)` now passes a `nothing` seed value
+    # TODO: I don't think there is any way in OpenCL to make this nondeterministic
+    Random.seed!(rng::Philox2x32, seed::Nothing) =
+        Random.seed!(rng, kernel_state().random_seed)
+else
+    # ... where it used to call `Random_make_seed()`
+    @device_override Random.make_seed() = kernel_state().random_seed
+end
+
+"""
+    Random.rand(rng::Philox2x32, UInt32)
+
+Generate a byte of random data using the on-device Tausworthe generator.
+"""
+function Random.rand(rng::Philox2x32{R},::Type{UInt64}) where {R}
+    ctr1, ctr2, key = rng.ctr1, rng.ctr2, rng.key
+
+    if R > 0                               ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 1  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 2  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 3  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 4  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 5  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 6  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 7  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 8  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 9  key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 10 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 11 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 12 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 13 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 14 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+    if R > 15 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end
+
+    # update the warp counter
+    # NOTE: this performs the same update on every thread in the warp, but each warp writes
+    #       to a unique location so the duplicate writes are innocuous
+    # XXX: what if this overflows? we can't increment ctr2. bump the key?
+    rng.ctr1 += Int32(1)
+
+    # NOTE: it's too expensive to keep both numbers around in case the user only wanted one,
+    #       so just make our 2x32 generator return 64-bit numbers by default.
+    return (ctr1 % UInt64) << 32 | (ctr2 % UInt64)
+end
+
+
+
+# normally distributed random numbers using Ziggurat algorithm
+#
+# copied from Base because we don't support its global tables
+
+# a hacky method of exposing constant tables as constant GPU memory
+function emit_constant_array(name::Symbol, data::AbstractArray{T}) where {T}
+    @dispose ctx=Context() begin
+        T_val = convert(LLVMType, T)
+        T_ptr = convert(LLVMType, LLVMPtr{T,AS.UniformConstant})
+
+        # define function and get LLVM module
+        llvm_f, _ = create_function(T_ptr)
+        mod = LLVM.parent(llvm_f)
+
+        # create a global memory global variable
+        # TODO: global_var alignment?
+        T_global = LLVM.ArrayType(T_val, length(data))
+        # XXX: why can't we use a single name like emit_shmem
+        gv = GlobalVariable(mod, T_global, "gpu_$(name)_data", AS.UniformConstant)
+        linkage!(gv, LLVM.API.LLVMInternalLinkage)
+        initializer!(gv, ConstantArray(data))
+        alignment!(gv, 16)
+
+        # generate IR
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            ptr = gep!(builder, T_global, gv, [ConstantInt(0), ConstantInt(0)])
+
+            untyped_ptr = bitcast!(builder, ptr, T_ptr)
+
+            ret!(builder, untyped_ptr)
+        end
+
+        call_function(llvm_f, LLVMPtr{T,AS.UniformConstant})
+    end
+end
+
+for var in [:ki, :wi, :fi, :ke, :we, :fe]
+    val = getfield(Random, var)
+    gpu_var = Symbol("gpu_$var")
+    arr_typ = :(CLDeviceArray{$(eltype(val)),$(ndims(val)),AS.UniformConstant})
+    @eval @inline @generated function $gpu_var()
+        ptr = emit_constant_array($(QuoteNode(var)), $val)
+        Expr(:call, $arr_typ, $(size(val)), ptr)
+    end
+end
+
+## randn
+
+@device_override @inline function Random.randn(rng::AbstractRNG)
+    @label retry
+    r = Random.rand(rng, Random.UInt52Raw())
+    @inbounds begin
+        r &= 0x000fffffffffffff
+        rabs = Int64(r >> 1) # One bit for the sign
+        idx = rabs & 0xFF
+        x = ifelse(r % Bool, -rabs, rabs)*gpu_wi()[idx+1]
+        rabs < gpu_ki()[idx+1] && return x # 99.3% of the time we return here 1st try
+        # TODO: This code could be outlined once LLVM supports LDS access in recursively-called functions
+        @inbounds if idx == 0
+            while true
+                xx = -Random.ziggurat_nor_inv_r*log(Random.rand(rng))
+                yy = -log(Random.rand(rng))
+                yy+yy > xx*xx &&
+                    return (rabs >> 8) % Bool ? -Random.ziggurat_nor_r-xx : Random.ziggurat_nor_r+xx
+            end
+        elseif (gpu_fi()[idx] - gpu_fi()[idx+1])*Random.rand(rng) + gpu_fi()[idx+1] < exp(-0.5*x*x)
+            return x # return from the triangular area
+        else
+            @goto retry
+        end
+    end
+end
+
+@device_override @inline function Random.randn(rng::AbstractRNG, ::Type{T}) where {T <: Union{Float16, Float32}}
+    @invoke Random.randn(rng::AbstractRNG, T::Type{<:AbstractFloat})
+end
+
+## randexp
+
+@device_override @inline function Random.randexp(rng::AbstractRNG)
+    @label retry
+    ri = Random.rand(rng, Random.UInt52Raw())
+    @inbounds begin
+        ri &= 0x000fffffffffffff
+        idx = ri & 0xFF
+        x = ri*gpu_we()[idx+1]
+        ri < gpu_ke()[idx+1] && return x # 98.9% of the time we return here 1st try
+        # TODO: This code could be outlined once LLVM supports LDS access in recursively-called functions
+        @inbounds if idx == 0
+            return Random.ziggurat_exp_r - log(Random.rand(rng))
+        elseif (gpu_fe()[idx] - gpu_fe()[idx+1])*Random.rand(rng) + gpu_fe()[idx+1] < exp(-x)
+            return x # return from the triangular area
+        else
+            @goto retry
+        end
+    end
+end
+
+@device_override @inline function Random.randexp(rng::AbstractRNG, ::Type{T}) where {T <: Union{Float16, Float32}}
+    @invoke Random.randexp(rng::AbstractRNG, T::Type{<:AbstractFloat})
+end
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index b2a78a29..ef2f6083 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -12,3 +12,62 @@ report_exception(ex) = return
 report_exception_name(ex) = return
 
 report_exception_frame(idx, func, file, line) = return
+
+## kernel state
+
+struct KernelState
+    random_seed::UInt32
+end
+
+@inline @generated kernel_state() = GPUCompiler.kernel_state_value(KernelState)
+
+## intrinsics for adding and accessing additional kernel arguments
+
+# The amount of local shared memory we need for storing RNG state is determined
+# dynamically at kernel launch time, so needs to be passed as additional arguments
+# to the kernel.
+# We define intrinsics that get transformed into additional kernel arguments which
+# then get propagated across function calls to the caller.
+
+function additional_arg_intr(mod::LLVM.Module, T_state, name)
+    state_intr = if haskey(functions(mod), "julia.opencl.$name")
+        functions(mod)["julia.opencl.$name"]
+    else
+        LLVM.Function(mod, "julia.opencl.$name", LLVM.FunctionType(T_state))
+    end
+    push!(function_attributes(state_intr), EnumAttribute("readnone", 0))
+
+    return state_intr
+end
+
+# run-time equivalent
+function additional_arg_value(state, name)
+    @dispose ctx=Context() begin
+        T_state = convert(LLVMType, state)
+
+        # create function
+        llvm_f, _ = create_function(T_state)
+        mod = LLVM.parent(llvm_f)
+
+        # get intrinsic
+        state_intr = additional_arg_intr(mod, T_state, name)
+        state_intr_ft = function_type(state_intr)
+
+        # generate IR
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            val = call!(builder, state_intr_ft, state_intr, Value[], name)
+
+            ret!(builder, val)
+        end
+
+        call_function(llvm_f, state)
+    end
+end
+
+for name in [:random_keys, :random_counters]
+    @eval @inline @generated $name() =
+        additional_arg_value(LLVMPtr{UInt32, AS.Workgroup}, $(String(name)))
+end
diff --git a/test/device/random.jl b/test/device/random.jl
new file mode 100644
index 00000000..7a40df54
--- /dev/null
+++ b/test/device/random.jl
@@ -0,0 +1,137 @@
+using Random
+
+const n = 256
+
+function apply_seed(seed)
+    if seed === missing
+        # should result in different numbers across launches
+        Random.seed!()
+        # XXX: this currently doesn't work, because of the definition in Base,
+        #      `seed!(r::MersenneTwister=default_rng())`, which breaks overriding
+        #      `default_rng` with a non-MersenneTwister RNG.
+    elseif seed !== nothing
+        # should result in the same numbers
+        Random.seed!(seed)
+    elseif seed === nothing
+        # should result in different numbers across launches,
+        # as determined by the seed set during module loading.
+    end
+end
+
+eltypes = [filter(x -> !(x <: Complex), GPUArraysTestSuite.supported_eltypes(CLArray)); Bool; UInt16; UInt32; UInt64]
+
+@testset "rand($T), seed $seed" for T in eltypes, seed in (nothing, #=missing,=# 1234)
+    # different kernel invocations should get different numbers
+    @testset "across launches" begin
+        function kernel(A::AbstractArray{T}, seed) where {T}
+            apply_seed(seed)
+            tid = get_global_id(1)
+            A[tid] = rand(T)
+            return nothing
+        end
+
+        a = OpenCL.zeros(T, n)
+        b = OpenCL.zeros(T, n)
+
+        @opencl global_size=n local_size=n kernel(a, seed)
+        @opencl global_size=n local_size=n kernel(b, seed)
+
+        if seed === nothing || seed === missing
+            @test Array(a) != Array(b)
+        else
+            @test Array(a) == Array(b)
+        end
+    end
+
+    # multiple calls to rand should get different numbers
+    @testset "across calls" begin
+        function kernel(A::AbstractArray{T}, B::AbstractArray{T}, seed) where {T}
+            apply_seed(seed)
+            tid = get_global_id(1)
+            A[tid] = rand(T)
+            B[tid] = rand(T)
+            return nothing
+        end
+
+        a = OpenCL.zeros(T, n)
+        b = OpenCL.zeros(T, n)
+
+        @opencl global_size=n local_size=n kernel(a, b, seed)
+
+        @test Array(a) != Array(b)
+    end
+
+    if T != Bool
+        # different threads should get different numbers
+        @testset "across threads, dim $active_dim" for active_dim in 1:6
+            function kernel(A::AbstractArray{T}, seed) where {T}
+                apply_seed(seed)
+                id = get_local_id(1) * get_local_id(2) * get_local_id(3) *
+                     get_group_id(1) * get_group_id(2) * get_group_id(3)
+                if 1 <= id <= length(A)
+                    A[id] = rand(T)
+                end
+                return nothing
+            end
+
+            tx, ty, tz, bx, by, bz = [dim == active_dim ? 3 : 1 for dim in 1:6]
+            gx, gy, gz = tx*bx, ty*by, tz*bz
+            a = OpenCL.zeros(T, 3)
+
+            @opencl local_size=(tx, ty, tz) global_size=(gx, gy, gz) kernel(a, seed)
+
+            # NOTE: we don't just generate two numbers and compare them, instead generating a
+            #       couple more and checking they're not all the same, in order to avoid
+            #       occasional collisions with lower-precision types (i.e., Float16).
+            @test length(unique(Array(a))) > 1
+        end
+    end
+end
+
+@testset "basic randn($T), seed $seed" for T in filter(x -> x <: Base.IEEEFloat, eltypes), seed in (nothing, #=missing,=# 1234)
+    function kernel(A::AbstractArray{T}, seed) where {T}
+        apply_seed(seed)
+        tid = get_global_id(1)
+        A[tid] = randn(T)
+        return
+    end
+
+    a = OpenCL.zeros(T, n)
+    b = OpenCL.zeros(T, n)
+
+    @opencl global_size=n local_size=n kernel(a, seed)
+    @opencl global_size=n local_size=n kernel(b, seed)
+
+    if seed === nothing || seed === missing
+        @test Array(a) != Array(b)
+    else
+        @test Array(a) == Array(b)
+    end
+end
+
+randexp_eltypes = filter(x -> x <: Base.IEEEFloat, eltypes)
+if cl.platform().name == "Portable Computing Language"
+    # POCL doesn't support log1p for Float16
+    filter!(x -> x != Float16, randexp_eltypes)
+end
+
+@testset "basic randexp($T), seed $seed" for T in randexp_eltypes, seed in (nothing, #=missing,=# 1234)
+    function kernel(A::AbstractArray{T}, seed) where {T}
+        apply_seed(seed)
+        tid = get_global_id(1)
+        A[tid] = randexp(T)
+        return
+    end
+
+    a = OpenCL.zeros(T, n)
+    b = OpenCL.zeros(T, n)
+
+    @opencl global_size=n local_size=n kernel(a, seed)
+    @opencl global_size=n local_size=n kernel(b, seed)
+
+    if seed === nothing || seed === missing
+        @test Array(a) != Array(b)
+    else
+        @test Array(a) == Array(b)
+    end
+end
diff --git a/test/setup.jl b/test/setup.jl
index 066424c6..90337d36 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -91,7 +91,7 @@ function runtests(f, name, platform_filter)
 
         # some tests require native execution capabilities
         requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] ||
-                      startswith(name, "gpuarrays/")
+                      startswith(name, "gpuarrays/") || startswith(name, "device/")
 
         ex = quote
             GC.gc(true)