Skip to content

Commit de8eed5

Browse files
Remove device_code_agx (#512)
It doesn't work on M3 anyway, and the Python dependency is quite heavy.
1 parent cf21f9d commit de8eed5

File tree

10 files changed

+13
-206
lines changed

10 files changed

+13
-206
lines changed

Artifacts.toml

Lines changed: 0 additions & 6 deletions
This file was deleted.

Project.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ version = "1.4.0"
44

55
[deps]
66
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
7-
Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
87
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
98
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
109
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -14,12 +13,10 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
1413
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
1514
LLVMDowngrader_jll = "f52de702-fb25-5922-94ba-81dd59b07444"
1615
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
17-
ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92"
1816
ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
1917
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
2018
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
2119
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
22-
Python_jll = "93d3a430-8e7c-50da-8e8d-3dfcfb3baf05"
2320
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2421
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
2522
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -35,7 +32,6 @@ SpecialFunctionsExt = "SpecialFunctions"
3532

3633
[compat]
3734
Adapt = "4"
38-
Artifacts = "1"
3935
BFloat16s = "0.5"
4036
CEnum = "0.4, 0.5"
4137
CodecBzip2 = "0.8.5"
@@ -45,11 +41,14 @@ GPUCompiler = "0.26, 0.27, 1"
4541
KernelAbstractions = "0.9.1"
4642
LLVM = "7.2, 8, 9"
4743
LLVMDowngrader_jll = "0.6"
48-
ObjectFile = "0.4"
44+
LinearAlgebra = "1"
4945
ObjectiveC = "2.1, 3"
5046
PrecompileTools = "1"
5147
Preferences = "1"
48+
Printf = "1"
49+
Random = "1"
5250
SHA = "0.7"
5351
SpecialFunctions = "2"
5452
StaticArrays = "1"
53+
UUIDs = "1"
5554
julia = "1.10"

docs/src/api/compiler.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,9 @@ the InteractiveUtils standard library:
2626
@device_code_typed
2727
@device_code_warntype
2828
@device_code_llvm
29+
@device_code_air
2930
@device_code_native
30-
@device_code_agx
3131
@device_code
3232
```
3333

34-
For more information, please consult the GPUCompiler.jl documentation. `code_agx` is
35-
actually `code_native`:
34+
For more information, please consult the GPUCompiler.jl documentation.

perf/latency.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ function main()
2828
ttfp_cmd =
2929
`$base_cmd -e "using Metal
3030
kernel() = return
31-
Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
31+
Metal.code_native(devnull, kernel, Tuple{}; kernel=true)"`
3232
results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
3333

3434
results

src/Metal.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,7 @@ using LLVM
77
using LLVM.Interop
88
import LLVMDowngrader_jll
99
using Preferences: @load_preference, load_preference
10-
using Python_jll
11-
using ObjectFile
1210
using ExprTools: splitdef, combinedef
13-
using Artifacts
1411
using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
1512
import KernelAbstractions
1613

src/compiler/compilation.jl

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,7 @@ function compile(@nospecialize(job::CompilerJob))
176176
end
177177

178178
# link into an executable kernel
179-
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled;
180-
return_function=false)
179+
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
181180
@signpost_event log=log_compiler() "Link" "Job=$job"
182181

183182
@signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
@@ -211,7 +210,5 @@ end
211210
end
212211
end
213212

214-
# most of the time, we don't need the function object,
215-
# so don't keep it alive unconditionally in GPUCompiler's caches
216-
pipeline_state, return_function ? fun : nothing
213+
pipeline_state
217214
end

src/compiler/execution.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
186186
cache = compiler_cache(dev)
187187
source = methodinstance(F, tt)
188188
config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
189-
pipeline, _ = GPUCompiler.cached_compilation(cache, source, config, compile, link)
189+
pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
190190

191191
# create a callable object that captures the function instance. we don't need to think
192192
# about world age here, as GPUCompiler already does and will return a different object

src/compiler/reflection.jl

Lines changed: 3 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -19,161 +19,8 @@ function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
1919
return extracted, remaining
2020
end
2121

22-
"""
23-
code_agx([io], f, types, cap::VersionNumber)
24-
25-
Prints the AGX code generated for the method matching the given generic function and type
26-
signature to `io` which defaults to `stdout`.
27-
28-
See also: [`@device_code_agx`](@ref)
29-
"""
30-
function code_agx(io::IO, @nospecialize(func::Base.Callable), @nospecialize(types),
31-
kernel::Bool=true; kwargs...)
32-
compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
33-
source = methodinstance(typeof(func), Base.to_tuple_type(types))
34-
config = compiler_config(device(); kernel, compiler_kwargs...)
35-
job = CompilerJob(source, config)
36-
code_agx(io, job)
37-
end
38-
39-
@autoreleasepool function code_agx(io::IO, job::MetalCompilerJob)
40-
if !job.config.kernel
41-
error("Can only generate AGX code for kernel functions")
42-
end
43-
44-
# compile the kernel
45-
compiled = compile(job)
46-
pipeline, fun = link(job, compiled; return_function=true)
47-
# XXX: can we re-use this pipeline?
48-
49-
# register it with a pipeline descriptor
50-
pipeline_desc = MTLComputePipelineDescriptor()
51-
pipeline_desc.computeFunction = fun
52-
53-
# create a binary archive
54-
bin_desc = MTLBinaryArchiveDescriptor()
55-
bin = MTLBinaryArchive(device(), bin_desc)
56-
add_functions!(bin, pipeline_desc)
57-
58-
mktempdir() do dir
59-
# serialize the archive to a file
60-
binary = joinpath(dir, "kernel.macho")
61-
write(binary, bin)
62-
63-
# disassemble the main function
64-
first = true
65-
i = 0
66-
extract_gpu_code(binary) do name, code
67-
# skip all-zero functions
68-
all(code .== 0) && return
69-
70-
i += 1
71-
file = joinpath(dir, "function$(i).bin")
72-
write(file, code)
73-
74-
# disassemble the function
75-
first || println(io)
76-
println(io, "$name:")
77-
print(io, disassemble(file))
78-
79-
first = false
80-
end
81-
end
82-
end
83-
84-
@enum GPUMachineType::UInt32 begin
85-
AppleGPU = 0x1000013
86-
AMDGPU = 0x1000014
87-
IntelGPU = 0x1000015
88-
AIR64 = 0x1000017
89-
end
90-
91-
function extract_gpu_code(f, binary)
92-
fat_handle = readmeta(open(binary))
93-
fat_handle isa FatMachOHandle || error("Expected a universal binary, got a $(typeof(fat_handle))")
94-
95-
# the universal binary contains several architectures; extract the GPU one
96-
arch = findfirst(fat_handle) do arch
97-
arch.header isa MachO.MachOHeader64 && GPUMachineType(arch.header.cputype) == AppleGPU
98-
end
99-
arch === nothing && error("Could not find GPU architecture in universal binary")
100-
101-
# the GPU binary contains several sections...
102-
## ... extract the compute section, which is another Mach-O binary
103-
compute_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__compute")
104-
compute_section === nothing && error("Could not find __compute section in GPU binary")
105-
compute_binary = read(compute_section)
106-
native_handle = only(readmeta(IOBuffer(compute_binary)))
107-
## ... extract the metallib section, which is a Metal library
108-
metallib_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__metallib")
109-
metallib_section === nothing && error("Could not find __metallib section in GPU binary")
110-
metallib_binary = read(metallib_section)
111-
metallib = read(IOBuffer(metallib_binary), MetalLib)
112-
# TODO: use this to implement a do-block device_code_air like CUDA.jl?
113-
114-
# identify the kernel name
115-
kernel_name = "unknown_kernel"
116-
# XXX: does it happen that these metallibs contain multiple functions?
117-
if length(metallib.functions) == 1
118-
kernel_name = metallib.functions[1].name
119-
end
120-
# XXX: we used to be able to identify the kernel by looking at symbols in
121-
# the fat binary, one of which aliased with the start of the compute
122-
# section. these symbols have disappeared on macOS 15.
123-
#compute_symbol = nothing
124-
#for symbol in Symbols(fat_handle[arch])
125-
# symbol_value(symbol) == section_offset(compute_section) || continue
126-
# endswith(symbol_name(symbol), "_begin") || continue
127-
# compute_symbol = symbol
128-
#end
129-
#compute_symbol === nothing && error("Could not find symbol for __compute section")
130-
#kernel_name = symbol_name(compute_symbol)[1:end-6]
131-
132-
# within the native GPU binary, isolate the section containing code
133-
section = findfirst(Sections(native_handle), "__TEXT,__text")
134-
isnothing(section) && error("Could not find __TEXT,__text section")
135-
136-
# get all symbols, and sort them by address
137-
symbols = sort(collect(Symbols(native_handle)), by=symbol_value)
138-
139-
# extract relevant functions
140-
code = read(section)
141-
function extract_function(fn)
142-
# find the symbol
143-
symbol = findfirst(isequal(fn) , symbols)
144-
symbol === nothing && return nothing
145-
offset = symbol_value(symbols[symbol])
146-
147-
# extract the function
148-
size = if symbol < length(symbols)
149-
# up until the next symbol
150-
symbol_value(symbols[symbol + 1])
151-
else
152-
# up until the end of the section
153-
section_size(section)
154-
end - offset
155-
return code[offset + 1 : offset + size]
156-
end
157-
for sym in symbols
158-
f("$kernel_name.$(symbol_name(sym))", extract_function(sym))
159-
end
160-
return
161-
end
162-
163-
function disassemble(path)
164-
io = IOBuffer()
165-
disassembler = joinpath(only(readdir(artifact"applegpu"; join=true)), "disassemble.py")
166-
run(pipeline(`$(python()) $disassembler $path`, stdout=io))
167-
return String(take!(io))
168-
end
169-
170-
code_agx(@nospecialize(func::Base.Callable), @nospecialize(types); kwargs...) =
171-
code_agx(stdout, func, types; kwargs...)
172-
173-
const code_native = code_agx
174-
17522
# forward the rest to GPUCompiler with an appropriate CompilerJob
176-
for method in (:code_typed, :code_warntype, :code_llvm)
23+
for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
17724
# only code_typed doesn't take an io argument
17825
args = method === :code_typed ? (:job,) : (:io, :job)
17926

@@ -191,37 +38,19 @@ for method in (:code_typed, :code_warntype, :code_llvm)
19138
end
19239
end
19340

194-
19541
#
19642
# @device_code_* functions
19743
#
19844

19945
export @device_code_lowered, @device_code_typed, @device_code_warntype,
200-
@device_code_llvm, @device_code_native, @device_code_agx, @device_code
201-
202-
"""
203-
@device_code_agx [io::IO=stdout, ...] ex
204-
205-
Evaluates the expression `ex` and prints the result of [`Metal.code_agx`](@ref) to
206-
`io` for every compiled Metal kernel. For other supported keywords, see
207-
[`Metal.code_agx`](@ref).
208-
"""
209-
macro device_code_agx(ex...)
210-
function hook(job::MetalCompilerJob; io::IO=stdout, kwargs...)
211-
println(io, "; $job")
212-
println(io)
213-
code_agx(io, job; kwargs...)
214-
end
215-
GPUCompiler.emit_hooked_compilation(hook, ex...)
216-
end
217-
218-
const var"@device_code_native" = var"@device_code_agx"
46+
@device_code_llvm, @device_code_metal, @device_code
21947

22048
# forward to GPUCompiler
22149
@eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered")))
22250
@eval $(Symbol("@device_code_typed")) = $(getfield(GPUCompiler, Symbol("@device_code_typed")))
22351
@eval $(Symbol("@device_code_warntype")) = $(getfield(GPUCompiler, Symbol("@device_code_warntype")))
22452
@eval $(Symbol("@device_code_llvm")) = $(getfield(GPUCompiler, Symbol("@device_code_llvm")))
53+
@eval $(Symbol("@device_code_metal")) = $(getfield(GPUCompiler, Symbol("@device_code_native")))
22554
@eval $(Symbol("@device_code")) = $(getfield(GPUCompiler, Symbol("@device_code")))
22655

22756

src/initialization.jl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,6 @@ function __init__()
4040
@warn "Metal.jl has not been tested on macOS 16 or later, you may run into issues."
4141
end
4242

43-
# we use Python_jll, but don't actually want its environment to be active
44-
# (this breaks the call to pygmentize in GPUCompiler).
45-
# XXX: the JLL should only set PYTHONHOME when the executable is called
46-
delete!(ENV, "PYTHONHOME")
47-
4843
if Base.JLOptions().debug_level >= 2
4944
# enable Metal API validation
5045
ENV["MTL_DEBUG_LAYER"] = "1"

test/execution.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,11 @@ end
5959
Metal.code_typed(dummy, Tuple{})
6060
Metal.code_warntype(devnull, dummy, Tuple{})
6161
Metal.code_llvm(devnull, dummy, Tuple{})
62-
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})
6362

6463
@device_code_lowered @metal dummy()
6564
@device_code_typed @metal dummy()
6665
@device_code_warntype io=devnull @metal dummy()
6766
@device_code_llvm io=devnull @metal dummy()
68-
shader_validation || @device_code_agx io=devnull @metal dummy()
6967

7068
mktempdir() do dir
7169
@device_code dir=dir @metal dummy()
@@ -76,7 +74,6 @@ end
7674
# make sure kernel name aliases are preserved in the generated code
7775
@test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
7876
@test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
79-
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))
8077

8178
# make sure invalid kernels can be partially reflected upon
8279
let

0 commit comments

Comments
 (0)