@@ -85,13 +85,31 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
85
85
" -exitable=$(job. target. exitable) "
86
86
87
87
# PTX-specific post-processing of a freshly generated LLVM module.
#
# Arguments:
# - `job`: the compiler job; `job.target.cap` and `job.target.ptx` supply the
#   major/minor numbers that are written into the module.
# - `mod`: the LLVM module, modified in place.
#
# For each of the globals `sm_major`, `sm_minor`, `ptx_major` and `ptx_minor` that
# already exists in `mod`, the initializer is set to the matching 32-bit integer
# constant and the linkage is changed to private. Missing globals are not created.
function process_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
                         mod::LLVM.Module)
    ctx = context(mod)

    # calling convention
    if LLVM.version() >= v"8"
        for f in functions(mod)
            # currently disabled, see JuliaGPU/GPUCompiler.jl#97
            #callconv!(f, LLVM.API.LLVMPTXDeviceCallConv)
        end
    end

    # emit the device capability and ptx isa version as constants in the module. this
    # makes it possible to 'query' these in device code, relying on LLVM to optimize the
    # checks away and generate static code. note that we only do so if there are actual
    # uses of these variables; unconditionally creating a gvar would result in duplicate
    # declarations.
    properties = ("sm_major"  => job.target.cap.major,
                  "sm_minor"  => job.target.cap.minor,
                  "ptx_major" => job.target.ptx.major,
                  "ptx_minor" => job.target.ptx.minor)

    # look up the module's global set once instead of twice per property
    gvs = globals(mod)
    for (name, value) in properties
        # only touch globals that are already present in the module
        haskey(gvs, name) || continue

        gv = gvs[name]
        initializer!(gv, ConstantInt(LLVM.Int32Type(ctx), value))
        # change the linkage so that we can inline the value
        linkage!(gv, LLVM.API.LLVMPrivateLinkage)
    end

    return
end
96
114
97
115
function process_entry! (@nospecialize (job:: CompilerJob{PTXCompilerTarget} ),
@@ -161,6 +179,10 @@ function add_lowering_passes!(@nospecialize(job::CompilerJob{PTXCompilerTarget})
161
179
162
180
# even if we support `unreachable`, we still prefer `exit` to `trap`
163
181
add! (pm, ModulePass (" HideTrap" , hide_trap!))
182
+
183
+ # we emit properties (of the device and ptx isa) as private global constants,
184
+ # so run the optimizer so that they are inlined before the rest of the optimizer runs.
185
+ global_optimizer! (pm)
164
186
end
165
187
166
188
function optimize_module! (@nospecialize (job:: CompilerJob{PTXCompilerTarget} ),
0 commit comments