Skip to content

Commit 2a5eb69

Browse files
Feat: Implement Dead Code Elimination (no more gigantic PTX files 🎉 ) (#12)
* Feat: start implementing DCE/module merging * Feat: remove useless deps graph stuff and remove mentions in docs * Feat: rename path_tracer kernels name and commit ptx files * Feat: add #[externally_visible], ignore ptx files, disable trace!
1 parent 4caa722 commit 2a5eb69

File tree

17 files changed

+285
-148
lines changed

17 files changed

+285
-148
lines changed

‎crates/cuda_builder/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,10 @@ fn invoke_rustc(builder: &CudaBuilder) -> Result<PathBuf, CudaBuilderError> {
378378
cargo.arg("--release");
379379
}
380380

381+
// TODO(RDambrosio016): Remove this once we can get meaningful error messages in panic to work.
382+
// for now we enable it to remove some useless indirect calls in the ptx.
383+
cargo.arg("-Zbuild-std-features=panic_immediate_abort");
384+
381385
if builder.optix {
382386
cargo.arg("-Zbuild-std-features=panic_immediate_abort");
383387
cargo.arg("-Zunstable-options");

‎crates/cuda_std/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Changelog
2+
3+
Notable changes to this project will be documented in this file.
4+
5+
## Unreleased
6+
7+
- Added `#[externally_visible]` in conjunction with cg_nvvm dead code elimination changes to mark that
8+
a function is externally visible.

‎crates/cuda_std_macros/src/lib.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,27 @@ pub fn gpu_only(_attr: proc_macro::TokenStream, item: proc_macro::TokenStream) -
184184

185185
output.into()
186186
}
187+
188+
/// Notifies the codegen that this function is externally visible and should not be
189+
/// removed if it is not used by a kernel. Usually used for linking with other PTX/cubin files.
190+
///
191+
/// # Panics
192+
///
193+
/// Panics if the function is not also no_mangle.
194+
#[proc_macro_attribute]
195+
pub fn externally_visible(
196+
_attr: proc_macro::TokenStream,
197+
item: proc_macro::TokenStream,
198+
) -> TokenStream {
199+
let mut func = syn::parse_macro_input!(item as syn::ItemFn);
200+
201+
assert!(
202+
func.attrs.iter().any(|a| a.path.is_ident("no_mangle")),
203+
"#[externally_visible] function should also be #[no_mangle]"
204+
);
205+
206+
let new_attr = parse_quote!(#[cfg_attr(target_os = "cuda", nvvm_internal(used))]);
207+
func.attrs.push(new_attr);
208+
209+
func.into_token_stream().into()
210+
}

‎crates/rustc_codegen_nvvm/CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,17 @@ Notable changes to this project will be documented in this file.
44

55
## Unreleased
66

7+
### Dead Code Elimination
8+
9+
PTX files no longer include useless functions and globals; we have switched to an alternative
10+
method of codegen for the final steps of the codegen. We no longer lazily-load modules using dependency graphs;
11+
we instead merge all the modules into one, then run global DCE on it before giving it to libnvvm.
12+
13+
This means all of the dead code is gone before it gets to the libnvvm stage, drastically lowering the size of
14+
the built PTX and improving codegen performance.
15+
16+
- Trace-level debug is compiled out for release now, decreasing the size of the codegen dll and improving compile times.
17+
718
## 0.1.1 - 11/26/21
819

920
- Fix things using the `bswap` intrinsic panicking.

‎crates/rustc_codegen_nvvm/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ libc = "0.2.97"
1616
tar = "0.4.35"
1717
once_cell = "1.8.0"
1818
bitflags = "1.3.2"
19-
tracing = "0.1.29"
19+
tracing = { version = "0.1.29", features = ["release_max_level_debug"] }
2020
find_cuda_helper = { version = "0.1", path = "../find_cuda_helper" }
2121
tracing-subscriber = { version = "0.3.1", features = ["env-filter"] }
2222
rustc_codegen_nvvm_macros = { version = "0.1", path = "../rustc_codegen_nvvm_macros" }

‎crates/rustc_codegen_nvvm/src/attributes.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use rustc_ast::Attribute;
33
use rustc_attr::{InlineAttr, OptimizeAttr};
44
use rustc_middle::{middle::codegen_fn_attrs::CodegenFnAttrFlags, ty};
55
use rustc_session::{config::OptLevel, Session};
6-
use rustc_span::Symbol;
6+
use rustc_span::{sym, Symbol};
77

88
use crate::context::CodegenCx;
99

@@ -99,6 +99,7 @@ pub struct Symbols {
9999
#[derive(Default, Clone, PartialEq)]
100100
pub(crate) struct NvvmAttributes {
101101
pub kernel: bool,
102+
pub used: bool,
102103
}
103104

104105
impl NvvmAttributes {
@@ -112,6 +113,9 @@ impl NvvmAttributes {
112113
if arg.has_name(cx.symbols.kernel) {
113114
nvvm_attrs.kernel = true;
114115
}
116+
if arg.has_name(sym::used) {
117+
nvvm_attrs.used = true;
118+
}
115119
}
116120
}
117121
}

‎crates/rustc_codegen_nvvm/src/back.rs

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@ use rustc_data_structures::small_c_str::SmallCStr;
1414
use rustc_errors::{FatalError, Handler};
1515
use rustc_fs_util::path_to_c_string;
1616
use rustc_middle::bug;
17-
use rustc_middle::mir::mono::MonoItem;
1817
use rustc_middle::{dep_graph, ty::TyCtxt};
1918
use rustc_session::config::{self, DebugInfo, OutputType};
2019
use rustc_session::Session;
21-
use rustc_span::{sym, Symbol};
20+
use rustc_span::Symbol;
2221
use rustc_target::spec::{CodeModel, RelocModel};
2322
use std::ffi::CString;
2423
use std::sync::Arc;
@@ -265,22 +264,6 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol) -> (ModuleCodegen
265264
// ... and now that we have everything pre-defined, fill out those definitions.
266265
for &(mono_item, _) in &mono_items {
267266
mono_item.define::<Builder<'_, '_, '_>>(&cx);
268-
if let MonoItem::Fn(inst) = mono_item {
269-
let name = tcx.symbol_name(inst).name;
270-
let attrs = tcx.get_attrs(inst.def_id());
271-
let is_no_mangle = attrs.iter().any(|x| x.has_name(sym::no_mangle));
272-
273-
if name == "rust_begin_unwind"
274-
|| name.starts_with("__rg")
275-
|| name == "rust_oom"
276-
|| is_no_mangle
277-
{
278-
let func = cx.get_fn(inst);
279-
let llval =
280-
unsafe { llvm::LLVMConstBitCast(func, cx.type_ptr_to(cx.type_i8())) };
281-
cx.used_statics.borrow_mut().push(llval);
282-
}
283-
}
284267
}
285268

286269
// a main function for gpu kernels really makes no sense but

‎crates/rustc_codegen_nvvm/src/lib.rs

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ use rustc_middle::{
6969
use rustc_session::{cstore::MetadataLoaderDyn, Session};
7070
use tracing::debug;
7171

72-
use std::{ffi::CString, sync::Arc, sync::Mutex};
72+
use std::ffi::CString;
7373

7474
// codegen dylib entrypoint
7575
#[no_mangle]
@@ -79,13 +79,7 @@ pub fn __rustc_codegen_backend() -> Box<dyn CodegenBackend> {
7979
}
8080

8181
#[derive(Default, Clone)]
82-
pub struct NvvmCodegenBackend {
83-
/// HACK(RDambrosio016): To lazy load modules, we need to know the dependency graph
84-
/// of the crate, we can get this through cstore, but cstore is made using TyCtxt
85-
/// and rustc drops TyCtxt before linking to save memory. So we populate this field
86-
/// in codegen_crate so we can then use it in link.
87-
deps: Arc<Mutex<Option<Vec<String>>>>,
88-
}
82+
pub struct NvvmCodegenBackend(());
8983

9084
unsafe impl Send for NvvmCodegenBackend {}
9185
unsafe impl Sync for NvvmCodegenBackend {}
@@ -124,14 +118,6 @@ impl CodegenBackend for NvvmCodegenBackend {
124118
need_metadata_module: bool,
125119
) -> Box<dyn std::any::Any> {
126120
debug!("Codegen crate");
127-
let mut raw_deps = tcx.postorder_cnums(()).to_vec();
128-
raw_deps.reverse();
129-
let out = raw_deps
130-
.into_iter()
131-
.map(|x| tcx.crate_name(x).as_str().to_string())
132-
.collect::<Vec<_>>();
133-
134-
*self.deps.lock().unwrap() = Some(out);
135121
Box::new(rustc_codegen_ssa::base::codegen_crate(
136122
NvvmCodegenBackend::default(),
137123
tcx,
@@ -164,7 +150,6 @@ impl CodegenBackend for NvvmCodegenBackend {
164150
outputs: &rustc_session::config::OutputFilenames,
165151
) -> Result<(), rustc_errors::ErrorReported> {
166152
link::link(
167-
self.deps.lock().unwrap().clone(),
168153
sess,
169154
&codegen_results,
170155
outputs,

‎crates/rustc_codegen_nvvm/src/link.rs

Lines changed: 14 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use rustc_codegen_ssa::traits::ThinBufferMethods;
21
use rustc_codegen_ssa::CodegenResults;
32
use rustc_codegen_ssa::CompiledModule;
43
use rustc_codegen_ssa::NativeLib;
@@ -16,7 +15,6 @@ use rustc_session::{
1615
Session,
1716
};
1817
use rustc_target::spec::Target;
19-
use std::ffi::CString;
2018
use std::{
2119
ffi::OsStr,
2220
fs::File,
@@ -27,11 +25,6 @@ use tar::{Archive, Builder, Header};
2725
use tracing::{debug, trace};
2826

2927
use crate::context::CodegenArgs;
30-
use crate::create_module;
31-
use crate::llvm::Context;
32-
use crate::llvm::LLVMLinkModules2;
33-
use crate::llvm::LLVMRustParseBitcodeForLTO;
34-
use crate::lto::ThinBuffer;
3528
use crate::LlvmMod;
3629

3730
pub(crate) struct NvvmMetadataLoader;
@@ -73,13 +66,12 @@ fn read_metadata(rlib: &Path) -> Result<MetadataRef, String> {
7366
}
7467

7568
pub fn link<'tcx>(
76-
deps: Option<Vec<String>>,
7769
sess: &'tcx Session,
7870
codegen_results: &CodegenResults,
7971
outputs: &OutputFilenames,
8072
crate_name: &str,
8173
) {
82-
debug!("Linking crate `{}`, deps:\n{:?}", crate_name, deps);
74+
debug!("Linking crate `{}`", crate_name);
8375
// largely inspired by rust-gpu
8476
let output_metadata = sess.opts.output_types.contains_key(&OutputType::Metadata);
8577
for &crate_type in sess.crate_types().iter() {
@@ -107,7 +99,6 @@ pub fn link<'tcx>(
10799
CrateType::Executable | CrateType::Cdylib | CrateType::Dylib => {
108100
let _ = link_exe(
109101
&codegen_results.allocator_module,
110-
deps.clone(),
111102
sess,
112103
crate_type,
113104
&out_filename,
@@ -171,7 +162,6 @@ fn link_rlib(sess: &Session, codegen_results: &CodegenResults, out_filename: &Pa
171162

172163
fn link_exe(
173164
allocator: &Option<CompiledModule>,
174-
deps: Option<Vec<String>>,
175165
sess: &Session,
176166
crate_type: CrateType,
177167
out_filename: &Path,
@@ -201,22 +191,20 @@ fn link_exe(
201191
std::fs::create_dir_all(&out_dir)?;
202192
}
203193

204-
codegen_into_ptx_file(allocator, deps, sess, &objects, &rlibs, out_filename)
194+
codegen_into_ptx_file(allocator, sess, &objects, &rlibs, out_filename)
205195
}
206196

207197
/// This is the meat of the codegen, taking all of the llvm bitcode modules we have, and giving them to
208198
/// nvvm to make into a final PTX file.
209199
fn codegen_into_ptx_file(
210200
allocator: &Option<CompiledModule>,
211-
deps: Option<Vec<String>>,
212201
sess: &Session,
213202
objects: &[PathBuf],
214203
rlibs: &[PathBuf],
215204
out_filename: &Path,
216205
) -> io::Result<()> {
217-
debug!("Codegenning crate into PTX, allocator: {}, deps:\n{:#?}, objects:\n{:#?}, rlibs:\n{:#?}, out_filename:\n{:#?}",
206+
debug!("Codegenning crate into PTX, allocator: {}, objects:\n{:#?}, rlibs:\n{:#?}, out_filename:\n{:#?}",
218207
allocator.is_some(),
219-
deps,
220208
objects,
221209
rlibs,
222210
out_filename
@@ -226,27 +214,18 @@ fn codegen_into_ptx_file(
226214
// but we don't have our original one because rustc drops tyctxt and codegencx before linking.
227215
let cx = LlvmMod::new("link_tmp");
228216

229-
let deps = deps.unwrap_or_default();
230-
let mut main_modules = Vec::with_capacity(objects.len());
231-
let mut rlib_deps = Vec::with_capacity(rlibs.len());
217+
let mut modules = Vec::with_capacity(objects.len() + rlibs.len());
232218

233219
// object files (they're not object files, they are impostors ඞ) are the bitcode modules produced by this codegen session
234220
// they *should* be the final crate.
235221
for obj in objects {
236222
let bitcode = std::fs::read(obj)?;
237-
let name = obj
238-
.file_name()
239-
.unwrap()
240-
.to_str()
241-
.expect("non-utf8 bitcode file name")
242-
.to_string();
243-
main_modules.push((bitcode, name));
223+
modules.push(bitcode);
244224
}
245225

246226
// rlibs are archives that we made previously, they are usually made for crates that are referenced
247227
// in this crate. We must unpack them and devour their bitcode to link in.
248228
for rlib in rlibs {
249-
// every entry will be a CGU, we need to merge those CGUs into a single module so we can give it to libnvvm to load.
250229
let mut cgus = Vec::with_capacity(16);
251230
// just pick the first cgu name as the overall name for now.
252231
let mut name = String::new();
@@ -272,8 +251,7 @@ fn codegen_into_ptx_file(
272251
}
273252
}
274253

275-
let merged = merge_cgus(cgus, cx.llcx, name.clone());
276-
rlib_deps.push((merged, name));
254+
modules.extend(cgus);
277255
}
278256

279257
if let Some(alloc) = allocator {
@@ -283,56 +261,24 @@ fn codegen_into_ptx_file(
283261
.clone()
284262
.expect("expected obj path for allocator module"),
285263
)?;
286-
main_modules.push((bc, String::from("allocator")));
264+
modules.push(bc);
287265
}
288266

289-
let sorted_deps = deps.into_iter().filter_map(|x| {
290-
for (bc, name) in &rlib_deps {
291-
let new_name = name.split_once("-").expect("uh oh rustc changed the format of rlib file names, better go make an angry zulip thread.").0;
292-
if new_name == x.replace("-", "_") {
293-
return Some((bc.clone(), name.to_string()));
294-
}
295-
}
296-
// HACK(RDambrosio016): If a dep cannot be found then it is probably a proc macro crate.
297-
// in which case we should just ignore it and move on, but in the future we should filter out those
298-
// deps before linking.
299-
None
300-
}).collect::<Vec<_>>();
301-
302267
// now that we have our nice bitcode modules, we just need to find libdevice and give our
303268
// modules to nvvm to make a final ptx file
304269

305270
// we need to actually parse the codegen args again, because codegencx is not available at link time.
306271
let nvvm_opts = CodegenArgs::from_session(sess).nvvm_options;
307272

308-
let ptx_bytes =
309-
match crate::nvvm::codegen_bitcode_modules(&nvvm_opts, sess, main_modules, sorted_deps) {
310-
Ok(bytes) => bytes,
311-
Err(err) => {
312-
// TODO(RDambrosio016): maybe include the nvvm log with this fatal error
313-
sess.fatal(&err.to_string())
314-
}
315-
};
316-
317-
std::fs::write(out_filename, ptx_bytes)
318-
}
319-
320-
/// Merges multiple codegen units into a single codegen unit. This is needed because
321-
/// we lazy-load modules in dependency order, not sub-crate order, so we need to lazy load
322-
/// entire modules, not just individual CGUs.
323-
fn merge_cgus(cgus: Vec<Vec<u8>>, llcx: &Context, crate_name: String) -> Vec<u8> {
324-
let cstr = CString::new(crate_name.clone()).unwrap();
325-
let module = unsafe { create_module(llcx, &crate_name) };
326-
for cgu in cgus {
327-
unsafe {
328-
let tmp = LLVMRustParseBitcodeForLTO(llcx, cgu.as_ptr(), cgu.len(), cstr.as_ptr())
329-
.expect("Failed to parse CGU bitcode");
330-
LLVMLinkModules2(module, tmp);
273+
let ptx_bytes = match crate::nvvm::codegen_bitcode_modules(&nvvm_opts, sess, modules, cx.llcx) {
274+
Ok(bytes) => bytes,
275+
Err(err) => {
276+
// TODO(RDambrosio016): maybe include the nvvm log with this fatal error
277+
sess.fatal(&err.to_string())
331278
}
332-
}
279+
};
333280

334-
let thin = ThinBuffer::new(module);
335-
thin.data().to_vec()
281+
std::fs::write(out_filename, ptx_bytes)
336282
}
337283

338284
fn create_archive(sess: &Session, files: &[&Path], metadata: &[u8], out_filename: &Path) {

‎crates/rustc_codegen_nvvm/src/llvm.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,13 @@ extern "C" {
652652
// use rustc_codegen_nvvm_macros::trace_ffi_calls;
653653
// #[trace_ffi_calls]
654654
extern "C" {
655+
pub(crate) fn LLVMAddGlobalDCEPass(PM: &mut PassManager);
656+
pub(crate) fn LLVMGetNamedMetadataOperands(M: &Module, name: *const c_char, Dest: *mut &Value);
657+
pub(crate) fn LLVMGetNamedMetadataNumOperands(M: &Module, name: *const c_char) -> c_uint;
658+
pub(crate) fn LLVMGetMDNodeOperands(V: &Value, Dest: *mut &Value);
659+
pub(crate) fn LLVMGetMDNodeNumOperands(V: &Value) -> c_uint;
660+
pub(crate) fn LLVMGetFirstFunction(M: &Module) -> Option<&Value>;
661+
pub(crate) fn LLVMGetNextFunction(Fn: &Value) -> Option<&Value>;
655662
pub(crate) fn LLVMAddGlobalInAddressSpace<'a>(
656663
M: &'a Module,
657664
Ty: &'a Type,

0 commit comments

Comments
 (0)