
Commit b6d3169

Author: William Moses (committed)
Commit message: wip
1 parent ad2fc22 commit b6d3169

File tree: 2 files changed (+192, -9 lines)


Project.toml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ version = "0.2.9"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
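Note: the only change here is registering CUDA.jl as a direct dependency. A minimal sketch of the equivalent Pkg operation, assuming a local checkout of the package (the activate path is illustrative):

    using Pkg
    Pkg.activate(".")   # the Reactant.jl checkout containing this Project.toml
    Pkg.add("CUDA")     # records the [deps] entry with the UUID shown above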

ext/ReactantCUDAExt.jl

Lines changed: 191 additions & 9 deletions
@@ -8,35 +8,217 @@ using ReactantCore: @trace
 using Adapt

 function Adapt.adapt_storage(::CUDA.KernelAdaptor, xs::TracedRArray{T,N}) where {T,N}
-    CuDeviceArray{T,N,CUDA.AS.Global}(pointer(xs.mlir_data.value), size(xs))
+    res = CuDeviceArray{T,N,CUDA.AS.Global}(Base.reinterpret(Core.LLVMPtr{T,CUDA.AS.Global}, xs.mlir_data.value.ptr), size(xs))
+    @show res, xs
+    return res
 end

 const _kernel_instances = Dict{Any, Any}()

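The rewritten `adapt_storage` method now builds the `CuDeviceArray` from the raw pointer carried in `xs.mlir_data.value.ptr`, reinterpreted as an `LLVMPtr` in CUDA's global address space, with a debugging `@show` of the result. A minimal runnable sketch of the Adapt pattern it relies on, using a toy wrapper in place of `TracedRArray` (all names below are hypothetical, not part of the commit):

    using Adapt

    struct ToyAdaptor end                    # stand-in for CUDA.KernelAdaptor
    struct PtrWrapper{T,N}                   # stand-in for TracedRArray
        ptr::Ptr{T}
        dims::NTuple{N,Int}
    end

    # Like the method above: turn the wrapper into a pointer-backed view.
    Adapt.adapt_storage(::ToyAdaptor, w::PtrWrapper{T,N}) where {T,N} =
        unsafe_wrap(Array, w.ptr, w.dims)

    a = Float32[1, 2, 3]
    w = PtrWrapper(pointer(a), size(a))
    adapt(ToyAdaptor(), w) == a              # true: adapt() dispatched to adapt_storage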
+
+
+# compile to executable machine code
+function compile(job)
+    # lower to PTX
+    # TODO: on 1.9, this actually creates a context. cache those.
+    modstr = JuliaContext() do ctx
+        mod, meta = GPUCompiler.compile(:llvm, job)
+        string(mod)
+    end
+    return modstr
+    #=
+    # check if we'll need the device runtime
+    undefined_fs = filter(collect(functions(meta.ir))) do f
+        isdeclaration(f) && !LLVM.isintrinsic(f)
+    end
+    intrinsic_fns = ["vprintf", "malloc", "free", "__assertfail",
+                     "__nvvm_reflect" #= TODO: should have been optimized away =#]
+    needs_cudadevrt = !isempty(setdiff(LLVM.name.(undefined_fs), intrinsic_fns))
+
+    # prepare invocations of CUDA compiler tools
+    ptxas_opts = String[]
+    nvlink_opts = String[]
+    ## debug flags
+    if Base.JLOptions().debug_level == 1
+        push!(ptxas_opts, "--generate-line-info")
+    elseif Base.JLOptions().debug_level >= 2
+        push!(ptxas_opts, "--device-debug")
+        push!(nvlink_opts, "--debug")
+    end
+    ## relocatable device code
+    if needs_cudadevrt
+        push!(ptxas_opts, "--compile-only")
+    end
+
+    ptx = job.config.params.ptx
+    cap = job.config.params.cap
+    arch = "sm_$(cap.major)$(cap.minor)"
+
+    # validate use of parameter memory
+    argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
+        !isghosttype(dt) && !Core.Compiler.isconstType(dt)
+    end
+    param_usage = sum(sizeof, argtypes)
+    param_limit = 4096
+    if cap >= v"7.0" && ptx >= v"8.1"
+        param_limit = 32764
+    end
+    if param_usage > param_limit
+        msg = """Kernel invocation uses too much parameter memory.
+                 $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by sm_$(cap.major)$(cap.minor) / PTX v$(ptx.major).$(ptx.minor)."""
+
+        try
+            details = "\n\nRelevant parameters:"
+
+            source_types = job.source.specTypes.parameters
+            source_argnames = Base.method_argnames(job.source.def)
+            while length(source_argnames) < length(source_types)
+                # this is probably due to a trailing vararg; repeat its name
+                push!(source_argnames, source_argnames[end])
+            end
+
+            for (i, typ) in enumerate(source_types)
+                if isghosttype(typ) || Core.Compiler.isconstType(typ)
+                    continue
+                end
+                name = source_argnames[i]
+                details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(sizeof(typ)))"
+            end
+            details *= "\n"
+
+            if cap >= v"7.0" && ptx < v"8.1" && param_usage < 32764
+                details *= "\nNote: use a newer CUDA to support more parameters on your device.\n"
+            end
+
+            msg *= details
+        catch err
+            @error "Failed to analyze kernel parameter usage; please file an issue with a reproducer."
+        end
+        error(msg)
+    end
+
+    # compile to machine code
+    # NOTE: we use tempname since mktemp doesn't support suffixes, and mktempdir is slow
+    ptx_input = tempname(cleanup=false) * ".ptx"
+    ptxas_output = tempname(cleanup=false) * ".cubin"
+    write(ptx_input, asm)
+
+    # we could use the driver's embedded JIT compiler, but that has several disadvantages:
+    # 1. fixes and improvements are slower to arrive, by using `ptxas` we only need to
+    #    upgrade the toolkit to get a newer compiler;
+    # 2. version checking is simpler, we otherwise need to use NVML to query the driver
+    #    version, which is hard to correlate to PTX JIT improvements;
+    # 3. if we want to be able to use newer (minor upgrades) of the CUDA toolkit on an
+    #    older driver, we should use the newer compiler to ensure compatibility.
+    append!(ptxas_opts, [
+        "--verbose",
+        "--gpu-name", arch,
+        "--output-file", ptxas_output,
+        ptx_input
+    ])
+    proc, log = run_and_collect(`$(ptxas()) $ptxas_opts`)
+    log = strip(log)
+    if !success(proc)
+        reason = proc.termsignal > 0 ? "ptxas received signal $(proc.termsignal)" :
+                                       "ptxas exited with code $(proc.exitcode)"
+        msg = "Failed to compile PTX code ($reason)"
+        msg *= "\nInvocation arguments: $(join(ptxas_opts, ' '))"
+        if !isempty(log)
+            msg *= "\n" * log
+        end
+        msg *= "\nIf you think this is a bug, please file an issue and attach $(ptx_input)"
+        if parse(Bool, get(ENV, "BUILDKITE", "false"))
+            run(`buildkite-agent artifact upload $(ptx_input)`)
+        end
+        error(msg)
+    elseif !isempty(log)
+        @debug "PTX compiler log:\n" * log
+    end
+    rm(ptx_input)
+    =#
+    #=
+    # link device libraries, if necessary
+    #
+    # this requires relocatable device code, which prevents certain optimizations and
+    # hurts performance. as such, we only do so when absolutely necessary.
+    # TODO: try LTO, `--link-time-opt --nvvmpath /opt/cuda/nvvm`.
+    #       fails with `Ignoring -lto option because no LTO objects found`
+    if needs_cudadevrt
+        nvlink_output = tempname(cleanup=false) * ".cubin"
+        append!(nvlink_opts, [
+            "--verbose", "--extra-warnings",
+            "--arch", arch,
+            "--library-path", dirname(libcudadevrt),
+            "--library", "cudadevrt",
+            "--output-file", nvlink_output,
+            ptxas_output
+        ])
+        proc, log = run_and_collect(`$(nvlink()) $nvlink_opts`)
+        log = strip(log)
+        if !success(proc)
+            reason = proc.termsignal > 0 ? "nvlink received signal $(proc.termsignal)" :
+                                           "nvlink exited with code $(proc.exitcode)"
+            msg = "Failed to link PTX code ($reason)"
+            msg *= "\nInvocation arguments: $(join(nvlink_opts, ' '))"
+            if !isempty(log)
+                msg *= "\n" * log
+            end
+            msg *= "\nIf you think this is a bug, please file an issue and attach $(ptxas_output)"
+            error(msg)
+        elseif !isempty(log)
+            @debug "PTX linker info log:\n" * log
+        end
+        rm(ptxas_output)
+
+        image = read(nvlink_output)
+        rm(nvlink_output)
+    else
+        image = read(ptxas_output)
+        rm(ptxas_output)
+    end
+    =#
+    return (image, entry=LLVM.name(meta.entry))
+end
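Only the code up to `return modstr` is live: `compile` currently stops at a textual LLVM module, and everything below is CUDA.jl's ptxas/nvlink pipeline preserved inside `#= =#` blocks for later use. If re-enabled as-is it would not run: `write(ptx_input, asm)` references `asm`, which is never defined in this scope (presumably it should be PTX emitted from `modstr`), and the final `return (image, ...)` is unreachable dead code. The parameter-memory check in the disabled block deserves a worked example; the limits below come straight from that code, while the argument types are hypothetical:

    # Kernel parameters live in .param space: 4096 bytes by default, raised to
    # 32764 bytes on sm_70+ when PTX >= 8.1 is available.
    argtypes = [Float32, NTuple{1024,Float64}]    # hypothetical kernel signature
    param_usage = sum(sizeof, argtypes)           # 4 + 8192 = 8196 bytes
    limit(cap, ptx) = (cap >= v"7.0" && ptx >= v"8.1") ? 32764 : 4096
    param_usage > limit(v"7.0", v"8.0")           # true  -> would raise the error
    param_usage > limit(v"7.0", v"8.1")           # false -> fits under 32764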
+
+# link into an executable kernel
+function link(job, compiled)
+    # load as an executable kernel object
+    return compiled
+end
+
+struct LLVMFunc{F,tt}
+    f::F
+    mod::String
+end
+
+function (func::LLVMFunc{F,tt})(args...) where {F,tt}
+
+end
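Since `link` is a passthrough, the `mod` field of `LLVMFunc` ends up holding the LLVM IR string produced by `compile`, and the call operator is still an empty stub in this WIP commit. For reference, a self-contained sketch of the callable-struct pattern it sets up (toy names, not part of the commit):

    struct ToyFunc{F}          # shaped like LLVMFunc: a function plus its module
        f::F
        mod::String
    end

    # The functor method LLVMFunc will eventually implement; here it just delegates.
    (tf::ToyFunc)(args...) = tf.f(args...)

    tf = ToyFunc(+, "; ModuleID = 'demo'")
    tf(1, 2)                   # 3: calls route through the struct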
+
 function recufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
     cuda = CUDA.active_state()
+    @show f, tt
+    flush(stdout)

     Base.@lock CUDA.cufunction_lock begin
         # compile the function
         cache = CUDA.compiler_cache(cuda.context)
         source = CUDA.methodinstance(F, tt)
         config = CUDA.compiler_config(cuda.device; kwargs...)::CUDA.CUDACompilerConfig
-        fun = CUDA.GPUCompiler.cached_compilation(cache, source, config, CUDA.compile, CUDA.link)
+        fun = CUDA.GPUCompiler.cached_compilation(cache, source, config, compile, link)

         @show fun
-        @show fun.mod
+        println(string(fun))
+        #@show fun.mod
         # create a callable object that captures the function instance. we don't need to think
         # about world age here, as GPUCompiler already does and will return a different object
-        key = (objectid(source), hash(fun), f)
+        key = (objectid(source))
         kernel = get(_kernel_instances, key, nothing)
         if kernel === nothing
-            # create the kernel state object
-            state = CUDA.KernelState(create_exceptions!(fun.mod), UInt32(0))
-
-            kernel = CUDA.HostKernel{F,tt}(f, fun, state)
+            kernel = LLVMFunc{F,tt}(f, fun)
             _kernel_instances[key] = kernel
         end
-        return kernel::CUDA.HostKernel{F,tt}
+        return kernel::LLVMFunc{F,tt}
     end
 end

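`recufunction` mirrors `CUDA.cufunction` but swaps in the local `compile`/`link` pair, so the cached object is an `LLVMFunc` wrapping IR text rather than a `CUDA.HostKernel` (dropping the `KernelState` machinery along the way). Two small caveats in the new code: `(objectid(source))` is just `objectid(source)`, not a one-tuple, and narrowing the key to the method instance means distinct closures `f` of the same type now share one cache entry. A hedged usage sketch by analogy with `CUDA.cufunction` (the kernel and signature are hypothetical, and calling the result is not yet implemented):

    # Hypothetical device kernel
    function vadd!(a, b, c)
        i = CUDA.threadIdx().x
        c[i] = a[i] + b[i]
        return nothing
    end

    # fun = recufunction(vadd!, Tuple{CuDeviceVector{Float32,1},
    #                                 CuDeviceVector{Float32,1},
    #                                 CuDeviceVector{Float32,1}})
    # fun isa LLVMFunc    # cached per method instance in _kernel_instances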