diff --git a/Project.toml b/Project.toml index fc258555..36d25dcb 100644 --- a/Project.toml +++ b/Project.toml @@ -13,16 +13,21 @@ OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Random123 = "74087812-796a-5b5d-8853-05524746bad3" +RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +[sources] +SPIRVIntrinsics = {path = "lib/intrinsics"} + [compat] Adapt = "4" GPUArrays = "11.2.1" -GPUCompiler = "1.6" +GPUCompiler = "1.7.1" KernelAbstractions = "0.9.2" LLVM = "9.1" LinearAlgebra = "1" @@ -30,12 +35,11 @@ OpenCL_jll = "=2024.10.24" Preferences = "1" Printf = "1" Random = "1" +Random123 = "1.7.1" +RandomNumbers = "1.6.0" Reexport = "1" SPIRVIntrinsics = "0.5" SPIRV_LLVM_Backend_jll = "20" SPIRV_Tools_jll = "2025.1" StaticArrays = "1" julia = "1.10" - -[sources] -SPIRVIntrinsics = {path="lib/intrinsics"} diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index c8994813..d0243130 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -106,11 +106,59 @@ function Base.getproperty(ki::KernelWorkGroupInfo, s::Symbol) end end +struct KernelSubGroupInfo + kernel::Kernel + device::Device + local_work_size::Vector{Csize_t} +end +sub_group_info(k::Kernel, d::Device, l) = KernelSubGroupInfo(k, d, Vector{Csize_t}(l)) + +# Helper function for getting local size for a specific sub-group count +function local_size_for_sub_group_count(ki::KernelSubGroupInfo, sub_group_count::Integer) + k = getfield(ki, :kernel) + d = getfield(ki, :device) + input_value = Ref{Csize_t}(sub_group_count) + result = Ref{NTuple{3, Csize_t}}() + clGetKernelSubGroupInfo(k, d, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, + sizeof(Csize_t), input_value, sizeof(NTuple{3, Csize_t}), result, C_NULL) + return Int.(result[]) +end + +function Base.getproperty(ki::KernelSubGroupInfo, s::Symbol) + k = getfield(ki, :kernel) + d = getfield(ki, :device) + lws = getfield(ki, :local_work_size) + + function get(val, typ) + result = Ref{typ}() + clGetKernelSubGroupInfo(k, d, val, sizeof(lws), lws, sizeof(typ), result, C_NULL) + return result[] + end + + if s == :max_sub_group_size + Int(get(CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, Csize_t)) + elseif s == :sub_group_count + Int(get(CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, Csize_t)) + elseif s == :local_size_for_sub_group_count + # This requires input_value to be the desired sub-group count + error("local_size_for_sub_group_count requires specifying desired sub-group count") + elseif s == :max_num_sub_groups + Int(get(CL_KERNEL_MAX_NUM_SUB_GROUPS, Csize_t)) + elseif s == :compile_num_sub_groups + Int(get(CL_KERNEL_COMPILE_NUM_SUB_GROUPS, Csize_t)) + elseif s == :compile_sub_group_size + Int(get(CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL, Csize_t)) + else + getfield(ki, s) + end +end + ## kernel calling function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; - global_work_offset=nothing, wait_on::Vector{Event}=Event[]) + global_work_offset=nothing, wait_on::Vector{Event}=Event[], + rng_state=false, nargs=nothing) max_work_dim = device().max_work_item_dims work_dim = length(global_work_size) if work_dim > max_work_dim @@ -153,6 +201,20 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; # null local size means OpenCL decides end + if rng_state + if local_work_size !== nothing + num_sub_groups = KernelSubGroupInfo(k, device(), lsize).sub_group_count + else + num_sub_groups = KernelSubGroupInfo(k, device(), Csize_t[]).max_num_sub_groups + end + if nargs === nothing + nargs = k.num_args - 2 + end + rng_state_size = sizeof(UInt32) * num_sub_groups + set_arg!(k, nargs + 1, LocalMem(UInt32, rng_state_size)) + set_arg!(k, nargs + 2, LocalMem(UInt32, rng_state_size)) + end + if !isempty(wait_on) n_events = length(wait_on) wait_event_ids = [evt.id for evt in wait_on] @@ -189,7 +251,8 @@ end function call( k::Kernel, args...; global_size = (1,), local_size = nothing, global_work_offset = nothing, wait_on::Vector{Event} = Event[], - indirect_memory::Vector{AbstractMemory} = AbstractMemory[] + indirect_memory::Vector{AbstractMemory} = AbstractMemory[], + rng_state=false, ) set_args!(k, args...) if !isempty(indirect_memory) @@ -243,7 +306,7 @@ function call( clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(usm_pointers), usm_pointers) end end - enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on) + enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on, rng_state, nargs=length(args)) end # From `julia/base/reflection.jl`, adjusted to add specialization on `t`. diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 4fe5c863..d4f32c60 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -24,6 +24,7 @@ Base.Experimental.@MethodTable(method_table) include("device/runtime.jl") include("device/array.jl") include("device/quirks.jl") +include("device/random.jl") # high level implementation include("memory.jl") diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 1e6fc506..1ab715d6 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -18,6 +18,86 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = in(fn, known_intrinsics) || contains(fn, "__spirv_") +GPUCompiler.kernel_state_type(::OpenCLCompilerJob) = KernelState + +function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob), + mod::LLVM.Module, entry::LLVM.Function) + entry = invoke(GPUCompiler.finish_module!, + Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function}, + job, mod, entry) + + # if this kernel uses our RNG, we should prime the shared state. + # XXX: these transformations should really happen at the Julia IR level... + if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel + # insert call to `initialize_rng_state` + f = initialize_rng_state + ft = typeof(f) + tt = Tuple{} + + # create a deferred compilation job for `initialize_rng_state` + src = methodinstance(ft, tt, GPUCompiler.tls_world_age()) + cfg = CompilerConfig(job.config; kernel=false, name=nothing) + job = CompilerJob(src, cfg, job.world) + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + + # generate IR for calls to `deferred_codegen` and the resulting function pointer + top_bb = first(blocks(entry)) + bb = BasicBlock(top_bb, "initialize_rng") + @dispose builder=IRBuilder() begin + position!(builder, bb) + subprogram = LLVM.subprogram(entry) + if subprogram !== nothing + loc = DILocation(0, 0, subprogram) + debuglocation!(builder, loc) + end + debuglocation!(builder, first(instructions(top_bb))) + + # call the `deferred_codegen` marker function + T_ptr = if LLVM.version() >= v"17" + LLVM.PointerType() + elseif VERSION >= v"1.12.0-DEV.225" + LLVM.PointerType(LLVM.Int8Type()) + else + LLVM.Int64Type() + end + T_id = convert(LLVMType, Int) + deferred_codegen_ft = LLVM.FunctionType(T_ptr, [T_id]) + deferred_codegen = if haskey(functions(mod), "deferred_codegen") + functions(mod)["deferred_codegen"] + else + LLVM.Function(mod, "deferred_codegen", deferred_codegen_ft) + end + fptr = call!(builder, deferred_codegen_ft, deferred_codegen, [ConstantInt(id)]) + + # call the `initialize_rng_state` function + rt = Core.Compiler.return_type(f, tt) + llvm_rt = convert(LLVMType, rt) + llvm_ft = LLVM.FunctionType(llvm_rt) + fptr = inttoptr!(builder, fptr, LLVM.PointerType(llvm_ft)) + call!(builder, llvm_ft, fptr) + br!(builder, top_bb) + + # note the use of the device-side RNG in this kernel + push!(function_attributes(entry), StringAttribute("julia.opencl.rng", "")) + end + + # XXX: put some of the above behind GPUCompiler abstractions + # (e.g., a compile-time version of `deferred_codegen`) + end + return entry +end + +function GPUCompiler.finish_linked_module!(@nospecialize(job::OpenCLCompilerJob), mod::LLVM.Module) + for f in GPUCompiler.kernels(mod) + kernel_intrinsics = Dict( + "julia.opencl.random_keys" => (; name = "random_keys", typ = LLVMPtr{UInt32, AS.Workgroup}), + "julia.opencl.random_counters" => (; name = "random_counters", typ = LLVMPtr{UInt32, AS.Workgroup}), + ) + GPUCompiler.add_input_arguments!(job, mod, f, kernel_intrinsics) + end + return +end ## compiler implementation (cache, configure, compile, and link) @@ -57,13 +137,17 @@ end # compile to executable machine code const compilations = Threads.Atomic{Int}(0) function compile(@nospecialize(job::CompilerJob)) + compilations[] += 1 + # TODO: this creates a context; cache those. obj, meta = JuliaContext() do ctx - GPUCompiler.compile(:obj, job) - end - compilations[] += 1 + obj, meta = GPUCompiler.compile(:obj, job) - (obj, entry=LLVM.name(meta.entry)) + entry = LLVM.name(meta.entry) + device_rng = StringAttribute("julia.opencl.rng", "") in collect(function_attributes(meta.entry)) + + (; obj, entry, device_rng) + end end # link into an executable kernel @@ -82,5 +166,5 @@ function link(@nospecialize(job::CompilerJob), compiled) cl.Program(; source) end cl.build!(prog) - cl.Kernel(prog, compiled.entry) + (; kernel=cl.Kernel(prog, compiled.entry), compiled.device_rng) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index a840f402..68885b8f 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -132,14 +132,15 @@ kernel_convert(arg, indirect_memory::Vector{cl.AbstractMemory} = cl.AbstractMemo abstract type AbstractKernel{F, TT} end +pass_arg(@nospecialize dt) = !(isghosttype(dt) || Core.Compiler.isconstType(dt)) + @inline @generated function (kernel::AbstractKernel{F,TT})(args...; call_kwargs...) where {F,TT} sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type args = (:(kernel.f), (:(kernel_convert(args[$i], indirect_memory)) for i in 1:length(args))...) # filter out ghost arguments that shouldn't be passed - predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt) - to_pass = map(!predicate, sig.parameters) + to_pass = map(pass_arg, sig.parameters) call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] @@ -151,12 +152,15 @@ abstract type AbstractKernel{F, TT} end end end + pushfirst!(call_t, KernelState) + pushfirst!(call_args, :(KernelState(kernel.rng_state ? Base.rand(UInt32) : UInt32(0)))) + # finalize types call_tt = Base.to_tuple_type(call_t) quote indirect_memory = cl.AbstractMemory[] - clcall(kernel.fun, $call_tt, $(call_args...); indirect_memory, call_kwargs...) + clcall(kernel.fun, $call_tt, $(call_args...); indirect_memory, kernel.rng_state, call_kwargs...) end end @@ -167,6 +171,7 @@ end struct HostKernel{F,TT} <: AbstractKernel{F,TT} f::F fun::cl.Kernel + rng_state::Bool end @@ -183,15 +188,15 @@ function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} cache = compiler_cache(ctx) source = methodinstance(F, tt) config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig - fun = GPUCompiler.cached_compilation(cache, source, config, compile, link) + linked = GPUCompiler.cached_compilation(cache, source, config, compile, link) # create a callable object that captures the function instance. we don't need to think # about world age here, as GPUCompiler already does and will return a different object - h = hash(fun, hash(f, hash(tt))) + h = hash(linked.kernel, hash(f, hash(tt))) kernel = get(_kernel_instances, h, nothing) if kernel === nothing # create the kernel state object - kernel = HostKernel{F,tt}(f, fun) + kernel = HostKernel{F,tt}(f, linked.kernel, linked.device_rng) _kernel_instances[h] = kernel end return kernel::HostKernel{F,tt} diff --git a/src/device/random.jl b/src/device/random.jl new file mode 100644 index 00000000..9b9917fe --- /dev/null +++ b/src/device/random.jl @@ -0,0 +1,234 @@ +## random number generation + +using Random +import RandomNumbers + +# local memory with the actual seed, per subgroup, set by `initialize_rng_state`` or overridden by calling `seed!` +@inline function global_random_keys() + n = get_num_sub_groups() + ptr = random_keys()::LLVMPtr{UInt32, AS.Workgroup} + return CLDeviceArray{UInt32, 1, AS.Workgroup}((n,), ptr) +end + +# local memory with per-subgroup counters, incremented when generating numbers +@inline function global_random_counters() + n = get_num_sub_groups() + ptr = random_counters()::LLVMPtr{UInt32, AS.Workgroup} + return CLDeviceArray{UInt32, 1, AS.Workgroup}((n,), ptr) +end + +# initialization function, called automatically at the start of each kernel +function initialize_rng_state() + subgroup_id = get_sub_group_id() + @inbounds global_random_keys()[subgroup_id] = kernel_state().random_seed + @inbounds global_random_counters()[subgroup_id] = 0 +end + +# generators + +using Random123: philox2x_round, philox2x_bumpkey + +# GPU-compatible/optimized version of the generator from Random123.jl +struct Philox2x32{R} <: RandomNumbers.AbstractRNG{UInt64} end + +# default to 7 rounds; enough to pass SmallCrush +@inline Philox2x32() = Philox2x32{7}() + +@inline function Base.getproperty(rng::Philox2x32, field::Symbol) + subgroup_id = get_sub_group_id() + + if field === :key + @inbounds global_random_keys()[subgroup_id] + elseif field === :ctr1 + @inbounds global_random_counters()[subgroup_id] + elseif field === :ctr2 + unsafe_trunc(UInt32, get_global_linear_id()) + end +end + +@inline function Base.setproperty!(rng::Philox2x32, field::Symbol, x) + subgroup_id = get_sub_group_id() + + if field === :key + @inbounds global_random_keys()[subgroup_id] = x + elseif field === :ctr1 + @inbounds global_random_counters()[subgroup_id] = x + end + return rng +end + +@device_override @inline Random.default_rng() = Philox2x32() + +""" + Random.seed!(rng::Philox2x32, seed::Integer, [counter::Integer=0]) + +Seed the on-device Philox2x32 generator with an UInt32 number. +Should be called by at least one thread per warp. +""" +function Random.seed!(rng::Philox2x32, seed::Integer, counter::Integer=UInt32(0)) + rng.key = seed % UInt32 + rng.ctr1 = counter + return +end + +# seeding the implicit default RNG +@static if VERSION >= v"1.11-" + @device_override Random.seed!(seed) = + Random.seed!(Random.default_rng(), seed) +else + @device_override Random.seed!(::Random._GLOBAL_RNG, seed) = + Random.seed!(Random.default_rng(), seed) +end + +@static if VERSION >= v"1.11-" + # `Random.seed!(::AbstractRNG)` now passes a `nothing` seed value + # TODO: I don't think there is any way in OpenCL to make this nondeterministic + Random.seed!(rng::Philox2x32, seed::Nothing) = + Random.seed!(rng, kernel_state().random_seed) +else + # ... where it used to call `Random_make_seed()` + @device_override Random.make_seed() = kernel_state().random_seed +end + +""" + Random.rand(rng::Philox2x32, UInt32) + +Generate a byte of random data using the on-device Tausworthe generator. +""" +function Random.rand(rng::Philox2x32{R},::Type{UInt64}) where {R} + ctr1, ctr2, key = rng.ctr1, rng.ctr2, rng.key + + if R > 0 ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 1 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 2 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 3 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 4 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 5 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 6 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 7 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 8 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 9 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 10 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 11 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 12 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 13 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 14 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + if R > 15 key = philox2x_bumpkey(key); ctr1, ctr2 = philox2x_round(ctr1, ctr2, key); end + + # update the warp counter + # NOTE: this performs the same update on every thread in the warp, but each warp writes + # to a unique location so the duplicate writes are innocuous + # XXX: what if this overflows? we can't increment ctr2. bump the key? + rng.ctr1 += Int32(1) + + # NOTE: it's too expensive to keep both numbers around in case the user only wanted one, + # so just make our 2x32 generator return 64-bit numbers by default. + return (ctr1 % UInt64) << 32 | (ctr2 % UInt64) +end + + + +# normally distributed random numbers using Ziggurat algorithm +# +# copied from Base because we don't support its global tables + +# a hacky method of exposing constant tables as constant GPU memory +function emit_constant_array(name::Symbol, data::AbstractArray{T}) where {T} + @dispose ctx=Context() begin + T_val = convert(LLVMType, T) + T_ptr = convert(LLVMType, LLVMPtr{T,AS.UniformConstant}) + + # define function and get LLVM module + llvm_f, _ = create_function(T_ptr) + mod = LLVM.parent(llvm_f) + + # create a global memory global variable + # TODO: global_var alignment? + T_global = LLVM.ArrayType(T_val, length(data)) + # XXX: why can't we use a single name like emit_shmem + gv = GlobalVariable(mod, T_global, "gpu_$(name)_data", AS.UniformConstant) + linkage!(gv, LLVM.API.LLVMInternalLinkage) + initializer!(gv, ConstantArray(data)) + alignment!(gv, 16) + + # generate IR + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") + position!(builder, entry) + + ptr = gep!(builder, T_global, gv, [ConstantInt(0), ConstantInt(0)]) + + untyped_ptr = bitcast!(builder, ptr, T_ptr) + + ret!(builder, untyped_ptr) + end + + call_function(llvm_f, LLVMPtr{T,AS.UniformConstant}) + end +end + +for var in [:ki, :wi, :fi, :ke, :we, :fe] + val = getfield(Random, var) + gpu_var = Symbol("gpu_$var") + arr_typ = :(CLDeviceArray{$(eltype(val)),$(ndims(val)),AS.UniformConstant}) + @eval @inline @generated function $gpu_var() + ptr = emit_constant_array($(QuoteNode(var)), $val) + Expr(:call, $arr_typ, $(size(val)), ptr) + end +end + +## randn + +@device_override @inline function Random.randn(rng::AbstractRNG) + @label retry + r = Random.rand(rng, Random.UInt52Raw()) + @inbounds begin + r &= 0x000fffffffffffff + rabs = Int64(r >> 1) # One bit for the sign + idx = rabs & 0xFF + x = ifelse(r % Bool, -rabs, rabs)*gpu_wi()[idx+1] + rabs < gpu_ki()[idx+1] && return x # 99.3% of the time we return here 1st try + # TODO: This code could be outlined once LLVM supports LDS access in recursively-called functions + @inbounds if idx == 0 + while true + xx = -Random.ziggurat_nor_inv_r*log(Random.rand(rng)) + yy = -log(Random.rand(rng)) + yy+yy > xx*xx && + return (rabs >> 8) % Bool ? -Random.ziggurat_nor_r-xx : Random.ziggurat_nor_r+xx + end + elseif (gpu_fi()[idx] - gpu_fi()[idx+1])*Random.rand(rng) + gpu_fi()[idx+1] < exp(-0.5*x*x) + return x # return from the triangular area + else + @goto retry + end + end +end + +@device_override @inline function Random.randn(rng::AbstractRNG, ::Type{T}) where {T <: Union{Float16, Float32}} + @invoke Random.randn(rng::AbstractRNG, T::Type{<:AbstractFloat}) +end + +## randexp + +@device_override @inline function Random.randexp(rng::AbstractRNG) + @label retry + ri = Random.rand(rng, Random.UInt52Raw()) + @inbounds begin + ri &= 0x000fffffffffffff + idx = ri & 0xFF + x = ri*gpu_we()[idx+1] + ri < gpu_ke()[idx+1] && return x # 98.9% of the time we return here 1st try + # TODO: This code could be outlined once LLVM supports LDS access in recursively-called functions + @inbounds if idx == 0 + return Random.ziggurat_exp_r - log(Random.rand(rng)) + elseif (gpu_fe()[idx] - gpu_fe()[idx+1])*Random.rand(rng) + gpu_fe()[idx+1] < exp(-x) + return x # return from the triangular area + else + @goto retry + end + end +end + +@device_override @inline function Random.randexp(rng::AbstractRNG, ::Type{T}) where {T <: Union{Float16, Float32}} + @invoke Random.randexp(rng::AbstractRNG, T::Type{<:AbstractFloat}) +end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index b2a78a29..ef2f6083 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -12,3 +12,62 @@ report_exception(ex) = return report_exception_name(ex) = return report_exception_frame(idx, func, file, line) = return + +## kernel state + +struct KernelState + random_seed::UInt32 +end + +@inline @generated kernel_state() = GPUCompiler.kernel_state_value(KernelState) + +## intrinsics for adding and accessing additional kernel arguments + +# The amount of local shared memory we need for storing RNG state is determined +# dynamically at kernel launch time, so needs to be passed as additional arguments +# to the kernel. +# We define intrinsics that get transformed into additional kernel arguments which +# then get propagated across function calls to the caller. + +function additional_arg_intr(mod::LLVM.Module, T_state, name) + state_intr = if haskey(functions(mod), "julia.opencl.$name") + functions(mod)["julia.opencl.$name"] + else + LLVM.Function(mod, "julia.opencl.$name", LLVM.FunctionType(T_state)) + end + push!(function_attributes(state_intr), EnumAttribute("readnone", 0)) + + return state_intr +end + +# run-time equivalent +function additional_arg_value(state, name) + @dispose ctx=Context() begin + T_state = convert(LLVMType, state) + + # create function + llvm_f, _ = create_function(T_state) + mod = LLVM.parent(llvm_f) + + # get intrinsic + state_intr = additional_arg_intr(mod, T_state, name) + state_intr_ft = function_type(state_intr) + + # generate IR + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") + position!(builder, entry) + + val = call!(builder, state_intr_ft, state_intr, Value[], name) + + ret!(builder, val) + end + + call_function(llvm_f, state) + end +end + +for name in [:random_keys, :random_counters] + @eval @inline @generated $name() = + additional_arg_value(LLVMPtr{UInt32, AS.Workgroup}, $(String(name))) +end diff --git a/test/device/random.jl b/test/device/random.jl new file mode 100644 index 00000000..7a40df54 --- /dev/null +++ b/test/device/random.jl @@ -0,0 +1,137 @@ +using Random + +const n = 256 + +function apply_seed(seed) + if seed === missing + # should result in different numbers across launches + Random.seed!() + # XXX: this currently doesn't work, because of the definition in Base, + # `seed!(r::MersenneTwister=default_rng())`, which breaks overriding + # `default_rng` with a non-MersenneTwister RNG. + elseif seed !== nothing + # should result in the same numbers + Random.seed!(seed) + elseif seed === nothing + # should result in different numbers across launches, + # as determined by the seed set during module loading. + end +end + +eltypes = [filter(x -> !(x <: Complex), GPUArraysTestSuite.supported_eltypes(CLArray)); Bool; UInt16; UInt32; UInt64] + +@testset "rand($T), seed $seed" for T in eltypes, seed in (nothing, #=missing,=# 1234) + # different kernel invocations should get different numbers + @testset "across launches" begin + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = get_global_id(1) + A[tid] = rand(T) + return nothing + end + + a = OpenCL.zeros(T, n) + b = OpenCL.zeros(T, n) + + @opencl global_size=n local_size=n kernel(a, seed) + @opencl global_size=n local_size=n kernel(b, seed) + + if seed === nothing || seed === missing + @test Array(a) != Array(b) + else + @test Array(a) == Array(b) + end + end + + # multiple calls to rand should get different numbers + @testset "across calls" begin + function kernel(A::AbstractArray{T}, B::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = get_global_id(1) + A[tid] = rand(T) + B[tid] = rand(T) + return nothing + end + + a = OpenCL.zeros(T, n) + b = OpenCL.zeros(T, n) + + @opencl global_size=n local_size=n kernel(a, b, seed) + + @test Array(a) != Array(b) + end + + if T != Bool + # different threads should get different numbers + @testset "across threads, dim $active_dim" for active_dim in 1:6 + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + id = get_local_id(1) * get_local_id(2) * get_local_id(3) * + get_group_id(1) * get_group_id(2) * get_group_id(3) + if 1 <= id <= length(A) + A[id] = rand(T) + end + return nothing + end + + tx, ty, tz, bx, by, bz = [dim == active_dim ? 3 : 1 for dim in 1:6] + gx, gy, gz = tx*bx, ty*by, tz*bz + a = OpenCL.zeros(T, 3) + + @opencl local_size=(tx, ty, tz) global_size=(gx, gy, gz) kernel(a, seed) + + # NOTE: we don't just generate two numbers and compare them, instead generating a + # couple more and checking they're not all the same, in order to avoid + # occasional collisions with lower-precision types (i.e., Float16). + @test length(unique(Array(a))) > 1 + end + end +end + +@testset "basic randn($T), seed $seed" for T in filter(x -> x <: Base.IEEEFloat, eltypes), seed in (nothing, #=missing,=# 1234) + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = get_global_id(1) + A[tid] = randn(T) + return + end + + a = OpenCL.zeros(T, n) + b = OpenCL.zeros(T, n) + + @opencl global_size=n local_size=n kernel(a, seed) + @opencl global_size=n local_size=n kernel(b, seed) + + if seed === nothing || seed === missing + @test Array(a) != Array(b) + else + @test Array(a) == Array(b) + end +end + +randexp_eltypes = filter(x -> x <: Base.IEEEFloat, eltypes) +if cl.platform().name == "Portable Computing Language" + # POCL doesn't support log1p for Float16 + filter!(x -> x != Float16, randexp_eltypes) +end + +@testset "basic randexp($T), seed $seed" for T in randexp_eltypes, seed in (nothing, #=missing,=# 1234) + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = get_global_id(1) + A[tid] = randexp(T) + return + end + + a = OpenCL.zeros(T, n) + b = OpenCL.zeros(T, n) + + @opencl global_size=n local_size=n kernel(a, seed) + @opencl global_size=n local_size=n kernel(b, seed) + + if seed === nothing || seed === missing + @test Array(a) != Array(b) + else + @test Array(a) == Array(b) + end +end diff --git a/test/setup.jl b/test/setup.jl index 066424c6..90337d36 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -91,7 +91,7 @@ function runtests(f, name, platform_filter) # some tests require native execution capabilities requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] || - startswith(name, "gpuarrays/") + startswith(name, "gpuarrays/") || startswith(name, "device/") ex = quote GC.gc(true)