From 892aaa339adc3a3d2cc9dac558b947b6ece019fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Sun, 19 Oct 2025 12:36:23 +0200 Subject: [PATCH 01/14] first definition of `offload` intrinsic (dirty code) --- .../src/builder/gpu_offload.rs | 138 ++++++++++++------ compiler/rustc_codegen_llvm/src/intrinsic.rs | 71 +++++++++ compiler/rustc_codegen_llvm/src/lib.rs | 2 + .../rustc_hir_analysis/src/check/intrinsic.rs | 2 + compiler/rustc_span/src/symbol.rs | 1 + library/core/src/intrinsics/mod.rs | 4 + .../gpu_offload/offload_intrinsic.rs | 37 +++++ 7 files changed, 210 insertions(+), 45 deletions(-) create mode 100644 tests/codegen-llvm/gpu_offload/offload_intrinsic.rs diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 5c2f8f700627e..c2df6489a726e 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -4,17 +4,18 @@ use llvm::Linkage::*; use rustc_abi::Align; use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; +use rustc_middle::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; use crate::builder::SBuilder; -use crate::common::AsCCharPtr; use crate::llvm::AttributePlace::Function; -use crate::llvm::{self, Linkage, Type, Value}; +use crate::llvm::{self, BasicBlock, Linkage, Type, Value}; use crate::{LlvmCodegenBackend, SimpleCx, attributes}; pub(crate) fn handle_gpu_code<'ll>( _cgcx: &CodegenContext, - cx: &'ll SimpleCx<'_>, + _cx: &'ll SimpleCx<'_>, ) { + /* // The offload memory transfer type for each kernel let mut memtransfer_types = vec![]; let mut region_ids = vec![]; @@ -32,6 +33,7 @@ pub(crate) fn handle_gpu_code<'ll>( } gen_call_handling(&cx, &memtransfer_types, ®ion_ids); + */ } // ; Function Attrs: nounwind @@ -79,7 +81,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { at_one } -struct 
TgtOffloadEntry { +pub(crate) struct TgtOffloadEntry { // uint64_t Reserved; // uint16_t Version; // uint16_t Kind; @@ -256,11 +258,14 @@ pub(crate) fn add_global<'ll>( // This function returns a memtransfer value which encodes how arguments to this kernel shall be // mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be // concatenated into the list of region_ids. -fn gen_define_handling<'ll>( - cx: &'ll SimpleCx<'_>, +pub(crate) fn gen_define_handling<'ll, 'tcx>( + cx: &SimpleCx<'ll>, + tcx: TyCtxt<'tcx>, kernel: &'ll llvm::Value, offload_entry_ty: &'ll llvm::Type, - num: i64, + // TODO(Sa4dUs): Define a typetree once i have a better idea of what do we exactly need + tt: Vec>, + symbol: &str, ) -> (&'ll llvm::Value, &'ll llvm::Value) { let types = cx.func_params_types(cx.get_type_of_global(kernel)); // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or @@ -270,11 +275,21 @@ fn gen_define_handling<'ll>( .filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer)) .count(); + // TODO(Sa4dUs): Add typetrees here + let ptr_sizes = types + .iter() + .zip(tt) + .filter_map(|(&x, ty)| match cx.type_kind(x) { + rustc_codegen_ssa::common::TypeKind::Pointer => Some(get_payload_size(tcx, ty)), + _ => None, + }) + .collect::>(); + // We do not know their size anymore at this level, so hardcode a placeholder. // A follow-up pr will track these from the frontend, where we still have Rust types. // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes. // I decided that 1024 bytes is a great placeholder value for now. - add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]); + add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2), // or both to and from the gpu (=3). 
Other values shouldn't affect us for now. // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten @@ -282,25 +297,28 @@ fn gen_define_handling<'ll>( // 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later). let memtransfer_types = add_priv_unnamed_arr( &cx, - &format!(".offload_maptypes.{num}"), + &format!(".offload_maptypes.{symbol}"), &vec![1 + 2 + 32; num_ptr_types], ); + // Next: For each function, generate these three entries. A weak constant, // the llvm.rodata entry name, and the llvm_offload_entries value - let name = format!(".kernel_{num}.region_id"); + let name = format!(".{symbol}.region_id"); let initializer = cx.get_const_i8(0); let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage); - let c_entry_name = CString::new(format!("kernel_{num}")).unwrap(); + let c_entry_name = CString::new(symbol).unwrap(); let c_val = c_entry_name.as_bytes_with_nul(); - let offload_entry_name = format!(".offloading.entry_name.{num}"); + let offload_entry_name = format!(".offloading.entry_name.{symbol}"); let initializer = crate::common::bytes_in_context(cx.llcx, c_val); let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage); llvm::set_alignment(llglobal, Align::ONE); llvm::set_section(llglobal, c".llvm.rodata.offloading"); - let name = format!(".offloading.entry.kernel_{num}"); + + // Not actively used yet, for calling real kernels + let name = format!(".offloading.entry.{symbol}"); // See the __tgt_offload_entry documentation above. 
let elems = TgtOffloadEntry::new(&cx, region_id, llglobal); @@ -317,7 +335,57 @@ fn gen_define_handling<'ll>( (memtransfer_types, region_id) } -pub(crate) fn declare_offload_fn<'ll>( +// TODO(Sa4dUs): move this to a proper place +fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { + match ty.kind() { + /* + rustc_middle::infer::canonical::ir::TyKind::Bool => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Char => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Int(int_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Uint(uint_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Float(float_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Adt(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Foreign(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Str => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Array(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Pat(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Slice(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::RawPtr(_, mutability) => todo!(), + */ + ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), + /* + rustc_middle::infer::canonical::ir::TyKind::FnDef(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::FnPtr(binder, fn_header) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::UnsafeBinder(unsafe_binder_inner) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Dynamic(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Closure(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::CoroutineClosure(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Coroutine(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::CoroutineWitness(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Never => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Tuple(_) => todo!(), + 
rustc_middle::infer::canonical::ir::TyKind::Alias(alias_ty_kind, alias_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Param(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Bound(bound_var_index_kind, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Placeholder(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Infer(infer_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Error(_) => todo!(), + */ + _ => { + tcx + // TODO(Sa4dUs): Maybe `.as_query_input()`? + .layout_of(PseudoCanonicalInput { + typing_env: TypingEnv::fully_monomorphized(), + value: ty, + }) + .unwrap() + .size + .bytes() + } + } +} + +fn declare_offload_fn<'ll>( cx: &'ll SimpleCx<'_>, name: &str, ty: &'ll llvm::Type, @@ -352,10 +420,13 @@ pub(crate) fn declare_offload_fn<'ll>( // 4. set insert point after kernel call. // 5. generate all the GEPS and stores, to be used in 6) // 6. generate __tgt_target_data_end calls to move data from the GPU -fn gen_call_handling<'ll>( - cx: &'ll SimpleCx<'_>, +pub(crate) fn gen_call_handling<'ll>( + cx: &SimpleCx<'ll>, + bb: &BasicBlock, + kernels: &[&'ll llvm::Value], memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], + llfn: &'ll Value, ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } @@ -368,27 +439,14 @@ fn gen_call_handling<'ll>( let tgt_kernel_decl = KernelArgsTy::new_decl(&cx); let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx); - let main_fn = cx.get_function("main"); - let Some(main_fn) = main_fn else { return }; - let kernel_name = "kernel_1"; - let call = unsafe { - llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len()) - }; - let Some(kernel_call) = call else { - return; - }; - let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) }; - let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() }; - let mut 
builder = SBuilder::build(cx, kernel_call_bb); - - let types = cx.func_params_types(cx.get_type_of_global(called)); + let mut builder = SBuilder::build(cx, bb); + + let types = cx.func_params_types(cx.get_type_of_global(kernels[0])); let num_args = types.len() as u64; // Step 0) // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // %6 = alloca %struct.__tgt_bin_desc, align 8 - unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) }; - let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc"); let ty = cx.type_array(cx.type_ptr(), num_args); @@ -404,15 +462,14 @@ fn gen_call_handling<'ll>( let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); // Step 1) - unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) }; builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT); // Now we allocate once per function param, a copy to be passed to one of our maps. let mut vals = vec![]; let mut geps = vec![]; let i32_0 = cx.get_const_i32(0); - for index in 0..types.len() { - let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() }; + for index in 0..num_args { + let v = unsafe { llvm::LLVMGetParam(llfn, index as u32) }; let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]); vals.push(v); geps.push(gep); @@ -504,13 +561,8 @@ fn gen_call_handling<'ll>( region_ids[0], a5, ]; - let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None); + builder.call(tgt_target_kernel_ty, tgt_decl, &args, None); // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) - unsafe { - let next = llvm::LLVMGetNextInstruction(offload_success).unwrap(); - llvm::LLVMRustPositionAfter(builder.llbuilder, next); - llvm::LLVMInstructionEraseFromParent(next); - } // Step 4) let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4); @@ -519,8 +571,4 @@ fn 
gen_call_handling<'ll>( builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None); drop(builder); - // FIXME(offload) The issue is that we right now add a call to the gpu version of the function, - // and then delete the call to the CPU version. In the future, we should use an intrinsic which - // directly resolves to a call to the GPU version. - unsafe { llvm::LLVMDeleteFunction(called) }; } diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 84fc6ebbc3172..2a650cc3c61ab 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -24,6 +24,7 @@ use tracing::debug; use crate::abi::FnAbiLlvmExt; use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; +use crate::builder::gpu_offload::TgtOffloadEntry; use crate::context::CodegenCx; use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto}; use crate::llvm::{self, Metadata, Type, Value}; @@ -196,6 +197,10 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { codegen_autodiff(self, tcx, instance, args, result); return Ok(()); } + sym::offload => { + codegen_offload(self, tcx, instance, args, result); + return Ok(()); + } sym::is_val_statically_known => { if let OperandValue::Immediate(imm) = args[0].val { self.call_intrinsic( @@ -1221,6 +1226,72 @@ fn codegen_autodiff<'ll, 'tcx>( ); } +fn codegen_offload<'ll, 'tcx>( + bx: &mut Builder<'_, 'll, 'tcx>, + tcx: TyCtxt<'tcx>, + instance: ty::Instance<'tcx>, + _args: &[OperandRef<'tcx, &'ll Value>], + _result: PlaceRef<'tcx, &'ll Value>, +) { + let cx = bx.cx; + let fn_args = instance.args; + + let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() { + ty::FnDef(def_id, params) => (def_id, params), + _ => bug!("invalid offload intrinsic arg"), + }; + + let fn_target = match Instance::try_resolve(tcx, cx.typing_env(), *target_id, target_args) { 
+ Ok(Some(instance)) => instance, + Ok(None) => bug!( + "could not resolve ({:?}, {:?}) to a specific offload instance", + target_id, + target_args + ), + Err(_) => { + // An error has already been emitted + return; + } + }; + + // TODO(Sa4dUs): Will need typetrees + let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target.clone(), LOCAL_CRATE); + let Some(kernel) = cx.get_function(&target_symbol) else { + bug!("could not find target function") + }; + + let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); + + // Build TypeTree (or something similar) + let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder(); + let inputs = sig.inputs(); + + // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory + let (memtransfer_type, region_id) = crate::builder::gpu_offload::gen_define_handling( + cx, + tcx, + kernel, + offload_entry_ty, + inputs.to_vec(), + &target_symbol, + ); + + let kernels = &[kernel]; + + let llfn = bx.llfn(); + + // TODO(Sa4dUs): this is a patch for delaying lifetime's issue fix + let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; + crate::builder::gpu_offload::gen_call_handling( + cx, + bb, + kernels, + &[memtransfer_type], + &[region_id], + llfn, + ); +} + fn get_args_from_tuple<'ll, 'tcx>( bx: &mut Builder<'_, 'll, 'tcx>, tuple_op: OperandRef<'tcx, &'ll Value>, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index 1b65a133d58c1..9406d5e0ca8c3 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -4,6 +4,8 @@ //! //! This API is completely unstable and subject to change. 
+// TODO(Sa4dUs): remove this once we have a great version, just to ignore unused LLVM wrappers +#![allow(unused)] // tidy-alphabetical-start #![cfg_attr(bootstrap, feature(slice_as_array))] #![feature(assert_matches)] diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index d87a154b0f1bb..f72dc0a466ccd 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -163,6 +163,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::minnumf128 | sym::mul_with_overflow | sym::needs_drop + | sym::offload | sym::overflow_checks | sym::powf16 | sym::powf32 @@ -311,6 +312,7 @@ pub(crate) fn check_intrinsic_type( let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity(); (0, 0, vec![type_id, type_id], tcx.types.bool) } + sym::offload => (2, 0, vec![param(0)], param(1)), sym::offset => (2, 0, vec![param(0), param(1)], param(0)), sym::arith_offset => ( 1, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 8ab8181833064..128384821610e 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1583,6 +1583,7 @@ symbols! { object_safe_for_dispatch, of, off, + offload, offset, offset_of, offset_of_enum, diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index c987d80be8b42..94f2cf322f3ee 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3304,6 +3304,10 @@ pub const fn copysignf128(x: f128, y: f128) -> f128; #[rustc_intrinsic] pub const fn autodiff(f: F, df: G, args: T) -> R; +#[rustc_nounwind] +#[rustc_intrinsic] +pub const fn offload(f: F) -> R; + /// Inform Miri that a given pointer definitely has a certain alignment. 
#[cfg(miri)] #[rustc_allow_const_fn_unstable(const_eval_select)] diff --git a/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs b/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs new file mode 100644 index 0000000000000..739186abc4f45 --- /dev/null +++ b/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs @@ -0,0 +1,37 @@ +//@ compile-flags: -Zoffload=Enable -Zunstable-options -C opt-level=0 -Clto=fat +//@ no-prefer-dynamic +//@ needs-enzyme + +// This test is verifying that we generate __tgt_target_data_*_mapper before and after a call to the +// kernel_1. Better documentation to what each global or variable means is available in the gpu +// offlaod code, or the LLVM offload documentation. This code does not launch any GPU kernels yet, +// and will be rewritten once a proper offload frontend has landed. +// +// We currently only handle memory transfer for specific calls to functions named `kernel_{num}`, +// when inside of a function called main. This, too, is a temporary workaround for not having a +// frontend. 
+ +// CHECK: ; +#![feature(core_intrinsics)] +#![no_main] + +#[unsafe(no_mangle)] +fn main() { + let mut x = [3.0; 256]; + kernel(&mut x); + core::hint::black_box(&x); +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn kernel(x: &mut [f32; 256]) { + core::intrinsics::offload(_kernel) +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn _kernel(x: &mut [f32; 256]) { + for i in 0..256 { + x[i] = 21.0; + } +} From 0ad8f431e8762de2cb09447eb8af6c943c813c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Tue, 21 Oct 2025 09:49:12 +0200 Subject: [PATCH 02/14] Add basic offload metadata --- .../src/builder/gpu_offload.rs | 65 ++--------------- compiler/rustc_codegen_llvm/src/intrinsic.rs | 12 ++-- compiler/rustc_middle/src/ty/mod.rs | 1 + compiler/rustc_middle/src/ty/offload_meta.rs | 70 +++++++++++++++++++ 4 files changed, 84 insertions(+), 64 deletions(-) create mode 100644 compiler/rustc_middle/src/ty/offload_meta.rs diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index c2df6489a726e..b5a15673d1833 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -4,6 +4,7 @@ use llvm::Linkage::*; use rustc_abi::Align; use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; +use rustc_middle::ty::offload_meta::OffloadMetadata; use rustc_middle::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; use crate::builder::SBuilder; @@ -263,8 +264,7 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( tcx: TyCtxt<'tcx>, kernel: &'ll llvm::Value, offload_entry_ty: &'ll llvm::Type, - // TODO(Sa4dUs): Define a typetree once i have a better idea of what do we exactly need - tt: Vec>, + metadata: Vec, symbol: &str, ) -> (&'ll llvm::Value, &'ll llvm::Value) { let types = cx.func_params_types(cx.get_type_of_global(kernel)); @@ -275,12 +275,11 @@ pub(crate) fn 
gen_define_handling<'ll, 'tcx>( .filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer)) .count(); - // TODO(Sa4dUs): Add typetrees here let ptr_sizes = types .iter() - .zip(tt) - .filter_map(|(&x, ty)| match cx.type_kind(x) { - rustc_codegen_ssa::common::TypeKind::Pointer => Some(get_payload_size(tcx, ty)), + .zip(metadata) + .filter_map(|(&x, meta)| match cx.type_kind(x) { + rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta.payload_size), _ => None, }) .collect::>(); @@ -335,56 +334,6 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( (memtransfer_types, region_id) } -// TODO(Sa4dUs): move this to a proper place -fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { - match ty.kind() { - /* - rustc_middle::infer::canonical::ir::TyKind::Bool => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Char => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Int(int_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Uint(uint_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Float(float_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Adt(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Foreign(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Str => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Array(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Pat(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Slice(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::RawPtr(_, mutability) => todo!(), - */ - ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), - /* - rustc_middle::infer::canonical::ir::TyKind::FnDef(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::FnPtr(binder, fn_header) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::UnsafeBinder(unsafe_binder_inner) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Dynamic(_, _) => todo!(), - 
rustc_middle::infer::canonical::ir::TyKind::Closure(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::CoroutineClosure(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Coroutine(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::CoroutineWitness(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Never => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Tuple(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Alias(alias_ty_kind, alias_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Param(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Bound(bound_var_index_kind, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Placeholder(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Infer(infer_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Error(_) => todo!(), - */ - _ => { - tcx - // TODO(Sa4dUs): Maybe `.as_query_input()`? - .layout_of(PseudoCanonicalInput { - typing_env: TypingEnv::fully_monomorphized(), - value: ty, - }) - .unwrap() - .size - .bytes() - } - } -} - fn declare_offload_fn<'ll>( cx: &'ll SimpleCx<'_>, name: &str, @@ -423,7 +372,7 @@ fn declare_offload_fn<'ll>( pub(crate) fn gen_call_handling<'ll>( cx: &SimpleCx<'ll>, bb: &BasicBlock, - kernels: &[&'ll llvm::Value], + kernel: &'ll llvm::Value, memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], llfn: &'ll Value, @@ -441,7 +390,7 @@ pub(crate) fn gen_call_handling<'ll>( let mut builder = SBuilder::build(cx, bb); - let types = cx.func_params_types(cx.get_type_of_global(kernels[0])); + let types = cx.func_params_types(cx.get_type_of_global(kernel)); let num_args = types.len() as u64; // Step 0) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 2a650cc3c61ab..5074916d1c394 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -13,6 +13,7 @@ 
use rustc_hir::def_id::LOCAL_CRATE; use rustc_hir::{self as hir}; use rustc_middle::mir::BinOp; use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf}; +use rustc_middle::ty::offload_meta::OffloadMetadata; use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv}; use rustc_middle::{bug, span_bug}; use rustc_span::{Span, Symbol, sym}; @@ -1254,7 +1255,6 @@ fn codegen_offload<'ll, 'tcx>( } }; - // TODO(Sa4dUs): Will need typetrees let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target.clone(), LOCAL_CRATE); let Some(kernel) = cx.get_function(&target_symbol) else { bug!("could not find target function") @@ -1266,26 +1266,26 @@ fn codegen_offload<'ll, 'tcx>( let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder(); let inputs = sig.inputs(); + let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::>(); + // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory let (memtransfer_type, region_id) = crate::builder::gpu_offload::gen_define_handling( cx, tcx, kernel, offload_entry_ty, - inputs.to_vec(), + metadata, &target_symbol, ); - let kernels = &[kernel]; - let llfn = bx.llfn(); - // TODO(Sa4dUs): this is a patch for delaying lifetime's issue fix + // TODO(Sa4dUs): this is just to a void lifetime's issues let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; crate::builder::gpu_offload::gen_call_handling( cx, bb, - kernels, + kernel, &[memtransfer_type], &[region_id], llfn, diff --git a/compiler/rustc_middle/src/ty/mod.rs b/compiler/rustc_middle/src/ty/mod.rs index 5eb8f1713a138..a6891c26d653e 100644 --- a/compiler/rustc_middle/src/ty/mod.rs +++ b/compiler/rustc_middle/src/ty/mod.rs @@ -129,6 +129,7 @@ pub mod fast_reject; pub mod inhabitedness; pub mod layout; pub mod normalize_erasing_regions; +pub mod offload_meta; pub mod pattern; pub mod print; pub mod relate; diff --git 
a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs new file mode 100644 index 0000000000000..e7159888a643d --- /dev/null +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -0,0 +1,70 @@ +use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; + +// TODO(Sa4dUs): it doesn't feel correct for me to place this on `rustc_ast::expand`, will look for a proper location +pub struct OffloadMetadata { + pub payload_size: u64, + pub mode: TransferKind, +} + +pub enum TransferKind { + FromGpu = 1, + ToGpu = 2, + Both = 3, +} + +impl OffloadMetadata { + pub fn new(payload_size: u64, mode: TransferKind) -> Self { + OffloadMetadata { payload_size, mode } + } + + pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { + OffloadMetadata { payload_size: get_payload_size(tcx, ty), mode: TransferKind::Both } + } +} + +// TODO(Sa4dUs): WIP, rn we just have a naive logic for references +fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { + match ty.kind() { + /* + rustc_middle::infer::canonical::ir::TyKind::Bool => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Char => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Int(int_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Uint(uint_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Float(float_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Adt(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Foreign(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Str => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Array(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Pat(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Slice(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::RawPtr(_, mutability) => todo!(), + */ + ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), + /* + rustc_middle::infer::canonical::ir::TyKind::FnDef(_, _) => todo!(), 
+ rustc_middle::infer::canonical::ir::TyKind::FnPtr(binder, fn_header) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::UnsafeBinder(unsafe_binder_inner) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Dynamic(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Closure(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::CoroutineClosure(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Coroutine(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::CoroutineWitness(_, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Never => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Tuple(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Alias(alias_ty_kind, alias_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Param(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Bound(bound_var_index_kind, _) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Placeholder(_) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Infer(infer_ty) => todo!(), + rustc_middle::infer::canonical::ir::TyKind::Error(_) => todo!(), + */ + _ => tcx + .layout_of(PseudoCanonicalInput { + typing_env: TypingEnv::fully_monomorphized(), + value: ty, + }) + .unwrap() + .size + .bytes(), + } +} From bb5620a2f7b67b95ff3d8003377c8e91c51d2961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Mon, 27 Oct 2025 12:30:19 +0100 Subject: [PATCH 03/14] Set maptypes using offload metadata --- .../src/builder/gpu_offload.rs | 21 +++---- compiler/rustc_middle/src/ty/offload_meta.rs | 56 ++++++++++++++++++- .../gpu_offload/offload_intrinsic.rs | 2 +- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index b5a15673d1833..69518358b5b63 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ 
b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -270,19 +270,17 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( let types = cx.func_params_types(cx.get_type_of_global(kernel)); // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. - let num_ptr_types = types - .iter() - .filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer)) - .count(); - - let ptr_sizes = types + let ptr_meta = types .iter() .zip(metadata) .filter_map(|(&x, meta)| match cx.type_kind(x) { - rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta.payload_size), + rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), _ => None, }) - .collect::>(); + .collect::>(); + + let ptr_sizes = ptr_meta.iter().map(|m| m.payload_size).collect::>(); + let ptr_transfer = ptr_meta.iter().map(|m| m.mode as u64 | 0x20).collect::>(); // We do not know their size anymore at this level, so hardcode a placeholder. // A follow-up pr will track these from the frontend, where we still have Rust types. @@ -294,11 +292,8 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten // will be 2. For now, everything is 3, until we have our frontend set up. // 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later). - let memtransfer_types = add_priv_unnamed_arr( - &cx, - &format!(".offload_maptypes.{symbol}"), - &vec![1 + 2 + 32; num_ptr_types], - ); + let memtransfer_types = + add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer); // Next: For each function, generate these three entries. 
A weak constant, // the llvm.rodata entry name, and the llvm_offload_entries value diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index e7159888a643d..7c1b42b8cc08b 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -6,10 +6,13 @@ pub struct OffloadMetadata { pub mode: TransferKind, } +// TODO(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` flag only when needed +#[repr(u64)] +#[derive(Debug, Copy, Clone)] pub enum TransferKind { FromGpu = 1, ToGpu = 2, - Both = 3, + Both = 1 + 2, } impl OffloadMetadata { @@ -18,7 +21,10 @@ impl OffloadMetadata { } pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { - OffloadMetadata { payload_size: get_payload_size(tcx, ty), mode: TransferKind::Both } + OffloadMetadata { + payload_size: get_payload_size(tcx, ty), + mode: TransferKind::from_ty(tcx, ty), + } } } @@ -68,3 +74,49 @@ fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { .bytes(), } } + +impl TransferKind { + pub fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { + // TODO(Sa4dUs): this logic is probs not fully correct, but it works for now + match ty.kind() { + rustc_type_ir::TyKind::Bool + | rustc_type_ir::TyKind::Char + | rustc_type_ir::TyKind::Int(_) + | rustc_type_ir::TyKind::Uint(_) + | rustc_type_ir::TyKind::Float(_) => TransferKind::ToGpu, + + rustc_type_ir::TyKind::Adt(_, _) + | rustc_type_ir::TyKind::Tuple(_) + | rustc_type_ir::TyKind::Array(_, _) => TransferKind::ToGpu, + + rustc_type_ir::TyKind::RawPtr(_, rustc_ast::Mutability::Not) + | rustc_type_ir::TyKind::Ref(_, _, rustc_ast::Mutability::Not) => TransferKind::ToGpu, + + rustc_type_ir::TyKind::RawPtr(_, rustc_ast::Mutability::Mut) + | rustc_type_ir::TyKind::Ref(_, _, rustc_ast::Mutability::Mut) => TransferKind::Both, + + rustc_type_ir::TyKind::Slice(_) + | rustc_type_ir::TyKind::Str + | rustc_type_ir::TyKind::Dynamic(_, _) => TransferKind::Both, + + 
rustc_type_ir::TyKind::FnDef(_, _) + | rustc_type_ir::TyKind::FnPtr(_, _) + | rustc_type_ir::TyKind::Closure(_, _) + | rustc_type_ir::TyKind::CoroutineClosure(_, _) + | rustc_type_ir::TyKind::Coroutine(_, _) + | rustc_type_ir::TyKind::CoroutineWitness(_, _) => TransferKind::ToGpu, + + rustc_type_ir::TyKind::Alias(_, _) + | rustc_type_ir::TyKind::Param(_) + | rustc_type_ir::TyKind::Bound(_, _) + | rustc_type_ir::TyKind::Placeholder(_) + | rustc_type_ir::TyKind::Infer(_) + | rustc_type_ir::TyKind::Error(_) => TransferKind::ToGpu, + + rustc_type_ir::TyKind::Never => TransferKind::ToGpu, + rustc_type_ir::TyKind::Foreign(_) => TransferKind::Both, + rustc_type_ir::TyKind::Pat(_, _) => TransferKind::Both, + rustc_type_ir::TyKind::UnsafeBinder(_) => TransferKind::Both, + } + } +} diff --git a/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs b/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs index 739186abc4f45..c3df15e3be6bd 100644 --- a/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs +++ b/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs @@ -1,4 +1,4 @@ -//@ compile-flags: -Zoffload=Enable -Zunstable-options -C opt-level=0 -Clto=fat +//@ compile-flags: -Zoffload=Enable -Zunstable-options -C opt-level=3 -Clto=fat //@ no-prefer-dynamic //@ needs-enzyme From 81c4bb2302b90e1730515f9f8eeddf629f4b0c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Mon, 27 Oct 2025 20:32:46 +0100 Subject: [PATCH 04/14] Pass frontend info to `gen_call_handling` --- compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs | 3 ++- compiler/rustc_codegen_llvm/src/intrinsic.rs | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 69518358b5b63..cbabc2c27106c 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -371,6 +371,7 @@ pub(crate) fn 
gen_call_handling<'ll>( memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], llfn: &'ll Value, + metadata: Vec, ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } @@ -441,7 +442,7 @@ pub(crate) fn gen_call_handling<'ll>( // As mentioned above, we don't use Rust type information yet. So for now we will just // assume that we have 1024 bytes, 256 f32 values. // FIXME(offload): write an offload frontend and handle arbitrary types. - builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT); + builder.store(cx.get_const_i64(metadata[i].payload_size), gep3, Align::EIGHT); } // For now we have a very simplistic indexing scheme into our diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 5074916d1c394..f56550f92627c 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1289,6 +1289,7 @@ fn codegen_offload<'ll, 'tcx>( &[memtransfer_type], &[region_id], llfn, + metadata, ); } From 89fd33548aa5a00e24fb7114ba593416b99faff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Tue, 4 Nov 2025 18:58:24 +0100 Subject: [PATCH 05/14] Mark globals as used + some minor fixes --- .../src/builder/gpu_offload.rs | 62 +++++++++++++++---- compiler/rustc_codegen_llvm/src/intrinsic.rs | 14 ++--- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 1 + tests/codegen-llvm/gpu_offload/gpu_host.rs | 26 +++++--- .../gpu_offload/offload_intrinsic.rs | 37 ----------- 5 files changed, 72 insertions(+), 68 deletions(-) delete mode 100644 tests/codegen-llvm/gpu_offload/offload_intrinsic.rs diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index cbabc2c27106c..de7245bafec83 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ 
-14,7 +14,7 @@ use crate::{LlvmCodegenBackend, SimpleCx, attributes}; pub(crate) fn handle_gpu_code<'ll>( _cgcx: &CodegenContext, - _cx: &'ll SimpleCx<'_>, + cx: &'ll SimpleCx<'_>, ) { /* // The offload memory transfer type for each kernel @@ -259,15 +259,14 @@ pub(crate) fn add_global<'ll>( // This function returns a memtransfer value which encodes how arguments to this kernel shall be // mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be // concatenated into the list of region_ids. -pub(crate) fn gen_define_handling<'ll, 'tcx>( +pub(crate) fn gen_define_handling<'ll>( cx: &SimpleCx<'ll>, - tcx: TyCtxt<'tcx>, - kernel: &'ll llvm::Value, + llfn: &'ll llvm::Value, offload_entry_ty: &'ll llvm::Type, - metadata: Vec, + metadata: &Vec, symbol: &str, ) -> (&'ll llvm::Value, &'ll llvm::Value) { - let types = cx.func_params_types(cx.get_type_of_global(kernel)); + let types = cx.func_params_types(cx.get_type_of_global(llfn)); // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. let ptr_meta = types @@ -277,7 +276,7 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), _ => None, }) - .collect::>(); + .collect::>(); let ptr_sizes = ptr_meta.iter().map(|m| m.payload_size).collect::>(); let ptr_transfer = ptr_meta.iter().map(|m| m.mode as u64 | 0x20).collect::>(); @@ -286,7 +285,7 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( // A follow-up pr will track these from the frontend, where we still have Rust types. // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes. // I decided that 1024 bytes is a great placeholder value for now. 
- add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); + let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2), // or both to and from the gpu (=3). Other values shouldn't affect us for now. // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten @@ -326,6 +325,8 @@ pub(crate) fn gen_define_handling<'ll, 'tcx>( llvm::set_alignment(llglobal, Align::EIGHT); let c_section_name = CString::new("llvm_offload_entries").unwrap(); llvm::set_section(llglobal, &c_section_name); + + add_to_llvm_used(cx, &[offload_sizes, memtransfer_types, region_id, llglobal]); (memtransfer_types, region_id) } @@ -367,11 +368,10 @@ fn declare_offload_fn<'ll>( pub(crate) fn gen_call_handling<'ll>( cx: &SimpleCx<'ll>, bb: &BasicBlock, - kernel: &'ll llvm::Value, memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], llfn: &'ll Value, - metadata: Vec, + metadata: &Vec, ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } @@ -386,7 +386,7 @@ pub(crate) fn gen_call_handling<'ll>( let mut builder = SBuilder::build(cx, bb); - let types = cx.func_params_types(cx.get_type_of_global(kernel)); + let types = cx.func_params_types(cx.get_type_of_global(llfn)); let num_args = types.len() as u64; // Step 0) @@ -442,7 +442,7 @@ pub(crate) fn gen_call_handling<'ll>( // As mentioned above, we don't use Rust type information yet. So for now we will just // assume that we have 1024 bytes, 256 f32 values. // FIXME(offload): write an offload frontend and handle arbitrary types. 
- builder.store(cx.get_const_i64(metadata[i].payload_size), gep3, Align::EIGHT); + builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT); } // For now we have a very simplistic indexing scheme into our @@ -517,3 +517,41 @@ pub(crate) fn gen_call_handling<'ll>( drop(builder); } + +// TODO(Sa4dUs): check if there's a better way of doing this, also move to a proper location +fn add_to_llvm_used<'ll>(cx: &'ll SimpleCx<'_>, globals: &[&'ll Value]) { + let ptr_ty = cx.type_ptr(); + let arr_ty = cx.type_array(ptr_ty, globals.len() as u64); + let arr_val = cx.const_array(ptr_ty, globals); + + let name = CString::new("llvm.used").unwrap(); + + let used_global_opt = unsafe { llvm::LLVMGetNamedGlobal(cx.llmod, name.as_ptr()) }; + + if used_global_opt.is_none() { + let new_global = unsafe { llvm::LLVMAddGlobal(cx.llmod, arr_ty, name.as_ptr()) }; + unsafe { llvm::LLVMSetLinkage(new_global, llvm::Linkage::AppendingLinkage) }; + unsafe { + llvm::LLVMSetSection(new_global, CString::new("llvm.metadata").unwrap().as_ptr()) + }; + unsafe { llvm::LLVMSetInitializer(new_global, arr_val) }; + llvm::LLVMSetGlobalConstant(new_global, llvm::TRUE); + return; + } + + let used_global = used_global_opt.expect("expected @llvm.used"); + let mut combined: Vec<&'ll Value> = Vec::new(); + + if let Some(existing_init) = llvm::LLVMGetInitializer(used_global) { + let num_elems = unsafe { llvm::LLVMGetNumOperands(existing_init) }; + for i in 0..num_elems { + if let Some(elem) = unsafe { llvm::LLVMGetOperand(existing_init, i) } { + combined.push(elem); + } + } + } + + combined.extend_from_slice(globals); + let new_arr = cx.const_array(ptr_ty, &combined); + unsafe { llvm::LLVMSetInitializer(used_global, new_arr) }; +} diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index f56550f92627c..76fc3e97093bb 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ 
-1256,9 +1256,6 @@ fn codegen_offload<'ll, 'tcx>( }; let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target.clone(), LOCAL_CRATE); - let Some(kernel) = cx.get_function(&target_symbol) else { - bug!("could not find target function") - }; let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); @@ -1267,29 +1264,26 @@ fn codegen_offload<'ll, 'tcx>( let inputs = sig.inputs(); let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::>(); + let llfn = bx.llfn(); // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory let (memtransfer_type, region_id) = crate::builder::gpu_offload::gen_define_handling( cx, - tcx, - kernel, + llfn, offload_entry_ty, - metadata, + &metadata, &target_symbol, ); - let llfn = bx.llfn(); - // TODO(Sa4dUs): this is just to a void lifetime's issues let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; crate::builder::gpu_offload::gen_call_handling( cx, bb, - kernel, &[memtransfer_type], &[region_id], llfn, - metadata, + &metadata, ); } diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index ca64d96c2a33c..cf94678d140d0 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1167,6 +1167,7 @@ unsafe extern "C" { pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>; pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>; pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value); + pub(crate) fn LLVMGetNumOperands(Val: &Value) -> c_uint; // Operations on call sites pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint); diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index fac4054d1b7ff..69eea6a6a8cea 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -11,12 +11,13 @@ // when 
inside of a function called main. This, too, is a temporary workaround for not having a // frontend. +#![feature(core_intrinsics)] #![no_main] #[unsafe(no_mangle)] fn main() { let mut x = [3.0; 256]; - kernel_1(&mut x); + kernel(&mut x); core::hint::black_box(&x); } @@ -25,13 +26,14 @@ fn main() { // CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } -// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024] -// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 35] -// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0 -// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1 -// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 -// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 -// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +// CHECK: @.offload_sizes._kernel = private unnamed_addr constant [1 x i64] [i64 1024] +// CHECK: @.offload_maptypes._kernel = private unnamed_addr constant [1 x i64] [i64 35] +// CHECK: @._kernel.region_id = weak unnamed_addr constant i8 0 +// CHECK: @.offloading.entry_name._kernel = internal unnamed_addr constant [8 x i8] c"_kernel\00", section ".llvm.rodata.offloading", align 1 +// CHECK: @.offloading.entry._kernel = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel.region_id, ptr @.offloading.entry_name._kernel, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 + +// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] 
c";unknown;unknown;0;0;;\00", align 1 +// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 // CHECK: Function Attrs: // CHECK-NEXT: define{{( dso_local)?}} void @main() @@ -99,7 +101,13 @@ fn main() { #[unsafe(no_mangle)] #[inline(never)] -pub fn kernel_1(x: &mut [f32; 256]) { +pub fn kernel(x: &mut [f32; 256]) { + core::intrinsics::offload(_kernel) +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn _kernel(x: &mut [f32; 256]) { for i in 0..256 { x[i] = 21.0; } diff --git a/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs b/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs deleted file mode 100644 index c3df15e3be6bd..0000000000000 --- a/tests/codegen-llvm/gpu_offload/offload_intrinsic.rs +++ /dev/null @@ -1,37 +0,0 @@ -//@ compile-flags: -Zoffload=Enable -Zunstable-options -C opt-level=3 -Clto=fat -//@ no-prefer-dynamic -//@ needs-enzyme - -// This test is verifying that we generate __tgt_target_data_*_mapper before and after a call to the -// kernel_1. Better documentation to what each global or variable means is available in the gpu -// offlaod code, or the LLVM offload documentation. This code does not launch any GPU kernels yet, -// and will be rewritten once a proper offload frontend has landed. -// -// We currently only handle memory transfer for specific calls to functions named `kernel_{num}`, -// when inside of a function called main. This, too, is a temporary workaround for not having a -// frontend. 
- -// CHECK: ; -#![feature(core_intrinsics)] -#![no_main] - -#[unsafe(no_mangle)] -fn main() { - let mut x = [3.0; 256]; - kernel(&mut x); - core::hint::black_box(&x); -} - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn kernel(x: &mut [f32; 256]) { - core::intrinsics::offload(_kernel) -} - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn _kernel(x: &mut [f32; 256]) { - for i in 0..256 { - x[i] = 21.0; - } -} From 29a3eacc291a711e25596f2a32a31d76c3bcabeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Thu, 6 Nov 2025 12:34:55 +0100 Subject: [PATCH 06/14] Get types from fn_sig --- compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs | 5 ++--- compiler/rustc_codegen_llvm/src/intrinsic.rs | 7 +++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index de7245bafec83..fab867ba8b53b 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -261,12 +261,11 @@ pub(crate) fn add_global<'ll>( // concatenated into the list of region_ids. pub(crate) fn gen_define_handling<'ll>( cx: &SimpleCx<'ll>, - llfn: &'ll llvm::Value, offload_entry_ty: &'ll llvm::Type, metadata: &Vec, + types: &Vec<&Type>, symbol: &str, ) -> (&'ll llvm::Value, &'ll llvm::Value) { - let types = cx.func_params_types(cx.get_type_of_global(llfn)); // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. 
let ptr_meta = types @@ -371,6 +370,7 @@ pub(crate) fn gen_call_handling<'ll>( memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], llfn: &'ll Value, + types: &Vec<&Type>, metadata: &Vec, ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); @@ -386,7 +386,6 @@ pub(crate) fn gen_call_handling<'ll>( let mut builder = SBuilder::build(cx, bb); - let types = cx.func_params_types(cx.get_type_of_global(llfn)); let num_args = types.len() as u64; // Step 0) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 76fc3e97093bb..c642019ec9826 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -22,7 +22,7 @@ use rustc_target::callconv::PassMode; use rustc_target::spec::Os; use tracing::debug; -use crate::abi::FnAbiLlvmExt; +use crate::abi::{FnAbiLlvmExt, LlvmType}; use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; use crate::builder::gpu_offload::TgtOffloadEntry; @@ -1266,12 +1266,14 @@ fn codegen_offload<'ll, 'tcx>( let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::>(); let llfn = bx.llfn(); + let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); + // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory let (memtransfer_type, region_id) = crate::builder::gpu_offload::gen_define_handling( cx, - llfn, offload_entry_ty, &metadata, + &types, &target_symbol, ); @@ -1283,6 +1285,7 @@ fn codegen_offload<'ll, 'tcx>( &[memtransfer_type], &[region_id], llfn, + &types, &metadata, ); } From 75fdf41d6f5889637a10cc276360601d82aca310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Thu, 6 Nov 2025 20:30:44 +0100 Subject: [PATCH 07/14] Don't depend on outer fn and some cleanup --- .../src/builder/gpu_offload.rs | 30 +++--- 
compiler/rustc_codegen_llvm/src/intrinsic.rs | 11 +-- .../rustc_hir_analysis/src/check/intrinsic.rs | 2 +- compiler/rustc_middle/src/ty/offload_meta.rs | 96 +++++++------------ library/core/src/intrinsics/mod.rs | 2 +- tests/codegen-llvm/gpu_offload/gpu_host.rs | 2 +- 6 files changed, 53 insertions(+), 90 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index fab867ba8b53b..151be8fcb4757 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -262,23 +262,19 @@ pub(crate) fn add_global<'ll>( pub(crate) fn gen_define_handling<'ll>( cx: &SimpleCx<'ll>, offload_entry_ty: &'ll llvm::Type, - metadata: &Vec, - types: &Vec<&Type>, + metadata: &[OffloadMetadata], + types: &[&Type], symbol: &str, ) -> (&'ll llvm::Value, &'ll llvm::Value) { // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. - let ptr_meta = types - .iter() - .zip(metadata) - .filter_map(|(&x, meta)| match cx.type_kind(x) { - rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), - _ => None, - }) - .collect::>(); - - let ptr_sizes = ptr_meta.iter().map(|m| m.payload_size).collect::>(); - let ptr_transfer = ptr_meta.iter().map(|m| m.mode as u64 | 0x20).collect::>(); + let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) { + rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), + _ => None, + }); + + let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) = + ptr_meta.map(|m| (m.payload_size, m.mode as u64 | 0x20)).unzip(); // We do not know their size anymore at this level, so hardcode a placeholder. // A follow-up pr will track these from the frontend, where we still have Rust types. 
@@ -369,9 +365,9 @@ pub(crate) fn gen_call_handling<'ll>( bb: &BasicBlock, memtransfer_types: &[&'ll llvm::Value], region_ids: &[&'ll llvm::Value], - llfn: &'ll Value, - types: &Vec<&Type>, - metadata: &Vec, + args: &[&'ll Value], + types: &[&Type], + metadata: &[OffloadMetadata], ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } @@ -413,7 +409,7 @@ pub(crate) fn gen_call_handling<'ll>( let mut geps = vec![]; let i32_0 = cx.get_const_i32(0); for index in 0..num_args { - let v = unsafe { llvm::LLVMGetParam(llfn, index as u32) }; + let v = args[index as usize]; let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]); vals.push(v); geps.push(gep); diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index c642019ec9826..52a5becfffef6 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -199,7 +199,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } sym::offload => { - codegen_offload(self, tcx, instance, args, result); + codegen_offload(self, tcx, instance, args); return Ok(()); } sym::is_val_statically_known => { @@ -1231,8 +1231,7 @@ fn codegen_offload<'ll, 'tcx>( bx: &mut Builder<'_, 'll, 'tcx>, tcx: TyCtxt<'tcx>, instance: ty::Instance<'tcx>, - _args: &[OperandRef<'tcx, &'ll Value>], - _result: PlaceRef<'tcx, &'ll Value>, + args: &[OperandRef<'tcx, &'ll Value>], ) { let cx = bx.cx; let fn_args = instance.args; @@ -1255,7 +1254,8 @@ fn codegen_offload<'ll, 'tcx>( } }; - let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target.clone(), LOCAL_CRATE); + let args = get_args_from_tuple(bx, args[1], fn_target); + let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE); let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); @@ -1264,7 +1264,6 @@ fn codegen_offload<'ll, 'tcx>( let inputs = sig.inputs(); 
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::>(); - let llfn = bx.llfn(); let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); @@ -1284,7 +1283,7 @@ fn codegen_offload<'ll, 'tcx>( bb, &[memtransfer_type], &[region_id], - llfn, + &args, &types, &metadata, ); diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index f72dc0a466ccd..2996bd3a65188 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -312,7 +312,7 @@ pub(crate) fn check_intrinsic_type( let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity(); (0, 0, vec![type_id, type_id], tcx.types.bool) } - sym::offload => (2, 0, vec![param(0)], param(1)), + sym::offload => (3, 0, vec![param(0), param(1)], param(2)), sym::offset => (2, 0, vec![param(0), param(1)], param(0)), sym::arith_offset => ( 1, diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index 7c1b42b8cc08b..11a0ca2741bb4 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -31,39 +31,7 @@ impl OffloadMetadata { // TODO(Sa4dUs): WIP, rn we just have a naive logic for references fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { match ty.kind() { - /* - rustc_middle::infer::canonical::ir::TyKind::Bool => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Char => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Int(int_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Uint(uint_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Float(float_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Adt(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Foreign(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Str => todo!(), - 
rustc_middle::infer::canonical::ir::TyKind::Array(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Pat(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Slice(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::RawPtr(_, mutability) => todo!(), - */ - ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), - /* - rustc_middle::infer::canonical::ir::TyKind::FnDef(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::FnPtr(binder, fn_header) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::UnsafeBinder(unsafe_binder_inner) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Dynamic(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Closure(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::CoroutineClosure(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Coroutine(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::CoroutineWitness(_, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Never => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Tuple(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Alias(alias_ty_kind, alias_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Param(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Bound(bound_var_index_kind, _) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Placeholder(_) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Infer(infer_ty) => todo!(), - rustc_middle::infer::canonical::ir::TyKind::Error(_) => todo!(), - */ + ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), _ => tcx .layout_of(PseudoCanonicalInput { typing_env: TypingEnv::fully_monomorphized(), @@ -79,44 +47,44 @@ impl TransferKind { pub fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { // TODO(Sa4dUs): this logic is probs not fully correct, but it works for now match ty.kind() { - rustc_type_ir::TyKind::Bool - | 
rustc_type_ir::TyKind::Char - | rustc_type_ir::TyKind::Int(_) - | rustc_type_ir::TyKind::Uint(_) - | rustc_type_ir::TyKind::Float(_) => TransferKind::ToGpu, + ty::Bool + | ty::Char + | ty::Int(_) + | ty::Uint(_) + | ty::Float(_) => TransferKind::ToGpu, - rustc_type_ir::TyKind::Adt(_, _) - | rustc_type_ir::TyKind::Tuple(_) - | rustc_type_ir::TyKind::Array(_, _) => TransferKind::ToGpu, + ty::Adt(_, _) + | ty::Tuple(_) + | ty::Array(_, _) => TransferKind::ToGpu, - rustc_type_ir::TyKind::RawPtr(_, rustc_ast::Mutability::Not) - | rustc_type_ir::TyKind::Ref(_, _, rustc_ast::Mutability::Not) => TransferKind::ToGpu, + ty::RawPtr(_, rustc_ast::Mutability::Not) + | ty::Ref(_, _, rustc_ast::Mutability::Not) => TransferKind::ToGpu, - rustc_type_ir::TyKind::RawPtr(_, rustc_ast::Mutability::Mut) - | rustc_type_ir::TyKind::Ref(_, _, rustc_ast::Mutability::Mut) => TransferKind::Both, + ty::RawPtr(_, rustc_ast::Mutability::Mut) + | ty::Ref(_, _, rustc_ast::Mutability::Mut) => TransferKind::Both, - rustc_type_ir::TyKind::Slice(_) - | rustc_type_ir::TyKind::Str - | rustc_type_ir::TyKind::Dynamic(_, _) => TransferKind::Both, + ty::Slice(_) + | ty::Str + | ty::Dynamic(_, _) => TransferKind::Both, - rustc_type_ir::TyKind::FnDef(_, _) - | rustc_type_ir::TyKind::FnPtr(_, _) - | rustc_type_ir::TyKind::Closure(_, _) - | rustc_type_ir::TyKind::CoroutineClosure(_, _) - | rustc_type_ir::TyKind::Coroutine(_, _) - | rustc_type_ir::TyKind::CoroutineWitness(_, _) => TransferKind::ToGpu, + ty::FnDef(_, _) + | ty::FnPtr(_, _) + | ty::Closure(_, _) + | ty::CoroutineClosure(_, _) + | ty::Coroutine(_, _) + | ty::CoroutineWitness(_, _) => TransferKind::ToGpu, - rustc_type_ir::TyKind::Alias(_, _) - | rustc_type_ir::TyKind::Param(_) - | rustc_type_ir::TyKind::Bound(_, _) - | rustc_type_ir::TyKind::Placeholder(_) - | rustc_type_ir::TyKind::Infer(_) - | rustc_type_ir::TyKind::Error(_) => TransferKind::ToGpu, + ty::Alias(_, _) + | ty::Param(_) + | ty::Bound(_, _) + | ty::Placeholder(_) + | ty::Infer(_) + | 
ty::Error(_) => TransferKind::ToGpu, - rustc_type_ir::TyKind::Never => TransferKind::ToGpu, - rustc_type_ir::TyKind::Foreign(_) => TransferKind::Both, - rustc_type_ir::TyKind::Pat(_, _) => TransferKind::Both, - rustc_type_ir::TyKind::UnsafeBinder(_) => TransferKind::Both, + ty::Never => TransferKind::ToGpu, + ty::Foreign(_) => TransferKind::Both, + ty::Pat(_, _) => TransferKind::Both, + ty::UnsafeBinder(_) => TransferKind::Both, } } } diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 94f2cf322f3ee..97bf21c88e203 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3306,7 +3306,7 @@ pub const fn autodiff(f: F, df: G, args: T) -> #[rustc_nounwind] #[rustc_intrinsic] -pub const fn offload(f: F) -> R; +pub const fn offload(f: F, args: T) -> R; /// Inform Miri that a given pointer definitely has a certain alignment. #[cfg(miri)] diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index 69eea6a6a8cea..8a469f42906bf 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -102,7 +102,7 @@ fn main() { #[unsafe(no_mangle)] #[inline(never)] pub fn kernel(x: &mut [f32; 256]) { - core::intrinsics::offload(_kernel) + core::intrinsics::offload(_kernel, (x,)) } #[unsafe(no_mangle)] From 3dcb78a24ee1157843a9fbc36ad695ffecbdcbcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Tue, 11 Nov 2025 19:04:11 +0100 Subject: [PATCH 08/14] Add string attr to apply extra ptr arg to offload kernels --- compiler/rustc_codegen_llvm/src/attributes.rs | 12 ++++++++++++ compiler/rustc_codegen_llvm/src/back/write.rs | 11 ++++++----- compiler/rustc_codegen_llvm/src/context.rs | 10 ++++++++++ compiler/rustc_codegen_llvm/src/llvm/mod.rs | 8 ++++++++ compiler/rustc_codegen_ssa/src/codegen_attrs.rs | 3 +++ compiler/rustc_feature/src/builtin_attrs.rs | 5 +++++ 
compiler/rustc_middle/src/middle/codegen_fn_attrs.rs | 2 ++ compiler/rustc_span/src/symbol.rs | 1 + 8 files changed, 47 insertions(+), 5 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs index 89878d1e7e20b..a25ce9e5a90ac 100644 --- a/compiler/rustc_codegen_llvm/src/attributes.rs +++ b/compiler/rustc_codegen_llvm/src/attributes.rs @@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[ } } +pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool { + llvm::HasStringAttribute(llfn, name) +} + +pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) { + llvm::RemoveStringAttrFromFn(llfn, name); +} + /// Get LLVM attribute for the provided inline heuristic. pub(crate) fn inline_attr<'ll, 'tcx>( cx: &SimpleCx<'ll>, @@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>( to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins")); } + if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) { + to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel")) + } + if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) { to_add.push(AttributeKind::Cold.create_attr(cx.llcx)); } diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index fde7dd6ef7a85..4db4283adb404 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -43,7 +43,7 @@ use crate::errors::{ use crate::llvm::diagnostic::OptimizationDiagnosticKind::*; use crate::llvm::{self, DiagnosticInfo}; use crate::type_::llvm_type_ptr; -use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util}; +use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util}; pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! 
{ match llvm::last_error() { @@ -706,11 +706,12 @@ pub(crate) unsafe fn llvm_optimize( SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size); // For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is // introducing a proper offload intrinsic to solve this limitation. - for num in 0..9 { - let name = format!("kernel_{num}"); - if let Some(kernel) = cx.get_function(&name) { - handle_offload(&cx, kernel); + for func in cx.get_functions() { + let offload_kernel = "offload-kernel"; + if attributes::has_string_attr(func, offload_kernel) { + handle_offload(&cx, func); } + attributes::remove_string_attr_from_llfn(func, offload_kernel); } } diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs index b60c8a7d37193..6caf60e3cc41e 100644 --- a/compiler/rustc_codegen_llvm/src/context.rs +++ b/compiler/rustc_codegen_llvm/src/context.rs @@ -791,6 +791,16 @@ impl<'ll, CX: Borrow>> GenericCx<'ll, CX> { llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len()) } } + + pub(crate) fn get_functions(&self) -> Vec<&'ll Value> { + let mut functions = vec![]; + let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) }; + while let Some(f) = func { + functions.push(f); + func = unsafe { llvm::LLVMGetNextFunction(f) } + } + functions + } } impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs index 4c58a92106d5c..55a4b415a4e27 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs @@ -43,6 +43,14 @@ pub(crate) fn AddFunctionAttributes<'ll>( } } +pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool { + unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) } +} + +pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) { + 
unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) } +} + pub(crate) fn AddCallSiteAttributes<'ll>( callsite: &'ll Value, idx: AttributePlace, diff --git a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs index fd3d7d2a3ded0..0ab0cb0ef88a5 100644 --- a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs +++ b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs @@ -334,6 +334,9 @@ fn process_builtin_attrs( codegen_fn_attrs.patchable_function_entry = parse_patchable_function_entry(tcx, attr); } + sym::rustc_offload_kernel => { + codegen_fn_attrs.flags |= CodegenFnAttrFlags::OFFLOAD_KERNEL + } _ => {} } } diff --git a/compiler/rustc_feature/src/builtin_attrs.rs b/compiler/rustc_feature/src/builtin_attrs.rs index 4d50b9683fc57..0e48d943eab2e 100644 --- a/compiler/rustc_feature/src/builtin_attrs.rs +++ b/compiler/rustc_feature/src/builtin_attrs.rs @@ -1100,6 +1100,11 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[ rustc_autodiff, Normal, template!(Word, List: &[r#""...""#]), DuplicatesOk, EncodeCrossCrate::Yes, + ), + rustc_attr!( + rustc_offload_kernel, Normal, + template!(Word), DuplicatesOk, + EncodeCrossCrate::Yes, ), // Traces that are left when `cfg` and `cfg_attr` attributes are expanded. // The attributes are not gated, to avoid stability errors, but they cannot be used in stable diff --git a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs index 5a28d56d4e549..9630cfc94b433 100644 --- a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs +++ b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs @@ -190,6 +190,8 @@ bitflags::bitflags! { const NO_BUILTINS = 1 << 15; /// Marks foreign items, to make `contains_extern_indicator` cheaper. const FOREIGN_ITEM = 1 << 16; + /// `#[rustc_offload_kernel]`: indicates that this is an offload kernel, an extra ptr arg will be added. 
+ const OFFLOAD_KERNEL = 1 << 17; } } rustc_data_structures::external_bitflags_debug! { CodegenFnAttrFlags } diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 128384821610e..21c8d5b784ee2 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1966,6 +1966,7 @@ symbols! { rustc_objc_class, rustc_objc_selector, rustc_object_lifetime_default, + rustc_offload_kernel, rustc_on_unimplemented, rustc_outlives, rustc_paren_sugar, From 00294243854d7ee14933686c268bfc2fb37463cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Thu, 13 Nov 2025 16:39:59 +0100 Subject: [PATCH 09/14] Prevent globals from being optimized without relying on llvm.used --- .../src/builder/gpu_offload.rs | 81 +++++-------- compiler/rustc_codegen_llvm/src/intrinsic.rs | 21 ++-- tests/codegen-llvm/gpu_offload/gpu_host.rs | 110 +++++++++--------- 3 files changed, 93 insertions(+), 119 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 151be8fcb4757..afa6da96849cd 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -170,7 +170,7 @@ impl KernelArgsTy { fn new<'ll>( cx: &'ll SimpleCx<'_>, num_args: u64, - memtransfer_types: &[&'ll Value], + memtransfer_types: &'ll Value, geps: [&'ll Value; 3], ) -> [(Align, &'ll Value); 13] { let four = Align::from_bytes(4).expect("4 Byte alignment should work"); @@ -184,7 +184,7 @@ impl KernelArgsTy { (eight, geps[0]), (eight, geps[1]), (eight, geps[2]), - (eight, memtransfer_types[0]), + (eight, memtransfer_types), // The next two are debug infos. 
FIXME(offload): set them (eight, cx.const_null(cx.type_ptr())), // dbg (eight, cx.const_null(cx.type_ptr())), // dbg @@ -265,7 +265,7 @@ pub(crate) fn gen_define_handling<'ll>( metadata: &[OffloadMetadata], types: &[&Type], symbol: &str, -) -> (&'ll llvm::Value, &'ll llvm::Value) { +) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value) { // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) { @@ -313,16 +313,15 @@ pub(crate) fn gen_define_handling<'ll>( let initializer = crate::common::named_struct(offload_entry_ty, &elems); let c_name = CString::new(name).unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, WeakAnyLinkage); - llvm::set_initializer(llglobal, initializer); - llvm::set_alignment(llglobal, Align::EIGHT); + let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name); + llvm::set_global_constant(offload_entry, true); + llvm::set_linkage(offload_entry, WeakAnyLinkage); + llvm::set_initializer(offload_entry, initializer); + llvm::set_alignment(offload_entry, Align::EIGHT); let c_section_name = CString::new("llvm_offload_entries").unwrap(); - llvm::set_section(llglobal, &c_section_name); + llvm::set_section(offload_entry, &c_section_name); - add_to_llvm_used(cx, &[offload_sizes, memtransfer_types, region_id, llglobal]); - (memtransfer_types, region_id) + (offload_sizes, memtransfer_types, region_id, offload_entry) } fn declare_offload_fn<'ll>( @@ -363,8 +362,10 @@ fn declare_offload_fn<'ll>( pub(crate) fn gen_call_handling<'ll>( cx: &SimpleCx<'ll>, bb: &BasicBlock, - memtransfer_types: &[&'ll llvm::Value], - region_ids: &[&'ll llvm::Value], + offload_sizes: &'ll llvm::Value, + offload_entry: &'ll llvm::Value, + memtransfer_types: &'ll llvm::Value, + region_id: &'ll 
llvm::Value, args: &[&'ll Value], types: &[&Type], metadata: &[OffloadMetadata], @@ -382,6 +383,18 @@ pub(crate) fn gen_call_handling<'ll>( let mut builder = SBuilder::build(cx, bb); + for val in [offload_sizes, offload_entry] { + unsafe { + let dummy = llvm::LLVMBuildLoad2( + &builder.llbuilder, + llvm::LLVMTypeOf(val), + val, + b"dummy\0".as_ptr() as *const _, + ); + llvm::LLVMSetVolatile(dummy, llvm::TRUE); + } + } + let num_args = types.len() as u64; // Step 0) @@ -479,7 +492,7 @@ pub(crate) fn gen_call_handling<'ll>( // Step 2) let s_ident_t = generate_at_one(&cx); - let o = memtransfer_types[0]; + let o = memtransfer_types; let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4); generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t); let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps); @@ -498,7 +511,7 @@ pub(crate) fn gen_call_handling<'ll>( // FIXME(offload): Don't hardcode the numbers of threads in the future. cx.get_const_i32(2097152), cx.get_const_i32(256), - region_ids[0], + region_id, a5, ]; builder.call(tgt_target_kernel_ty, tgt_decl, &args, None); @@ -512,41 +525,3 @@ pub(crate) fn gen_call_handling<'ll>( drop(builder); } - -// TODO(Sa4dUs): check if there's a better way of doing this, also move to a proper location -fn add_to_llvm_used<'ll>(cx: &'ll SimpleCx<'_>, globals: &[&'ll Value]) { - let ptr_ty = cx.type_ptr(); - let arr_ty = cx.type_array(ptr_ty, globals.len() as u64); - let arr_val = cx.const_array(ptr_ty, globals); - - let name = CString::new("llvm.used").unwrap(); - - let used_global_opt = unsafe { llvm::LLVMGetNamedGlobal(cx.llmod, name.as_ptr()) }; - - if used_global_opt.is_none() { - let new_global = unsafe { llvm::LLVMAddGlobal(cx.llmod, arr_ty, name.as_ptr()) }; - unsafe { llvm::LLVMSetLinkage(new_global, llvm::Linkage::AppendingLinkage) }; - unsafe { - llvm::LLVMSetSection(new_global, CString::new("llvm.metadata").unwrap().as_ptr()) - }; - unsafe { 
llvm::LLVMSetInitializer(new_global, arr_val) }; - llvm::LLVMSetGlobalConstant(new_global, llvm::TRUE); - return; - } - - let used_global = used_global_opt.expect("expected @llvm.used"); - let mut combined: Vec<&'ll Value> = Vec::new(); - - if let Some(existing_init) = llvm::LLVMGetInitializer(used_global) { - let num_elems = unsafe { llvm::LLVMGetNumOperands(existing_init) }; - for i in 0..num_elems { - if let Some(elem) = unsafe { llvm::LLVMGetOperand(existing_init, i) } { - combined.push(elem); - } - } - } - - combined.extend_from_slice(globals); - let new_arr = cx.const_array(ptr_ty, &combined); - unsafe { llvm::LLVMSetInitializer(used_global, new_arr) }; -} diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 52a5becfffef6..fa8568d59049a 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1268,21 +1268,24 @@ fn codegen_offload<'ll, 'tcx>( let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory - let (memtransfer_type, region_id) = crate::builder::gpu_offload::gen_define_handling( - cx, - offload_entry_ty, - &metadata, - &types, - &target_symbol, - ); + let (offload_sizes, memtransfer_types, region_id, offload_entry) = + crate::builder::gpu_offload::gen_define_handling( + cx, + offload_entry_ty, + &metadata, + &types, + &target_symbol, + ); // TODO(Sa4dUs): this is just to a void lifetime's issues let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; crate::builder::gpu_offload::gen_call_handling( cx, bb, - &[memtransfer_type], - &[region_id], + offload_sizes, + offload_entry, + memtransfer_types, + region_id, &args, &types, &metadata, diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index 8a469f42906bf..c53968f558acc 100644 --- 
a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -17,7 +17,7 @@ #[unsafe(no_mangle)] fn main() { let mut x = [3.0; 256]; - kernel(&mut x); + kernel_1(&mut x); core::hint::black_box(&x); } @@ -26,11 +26,11 @@ fn main() { // CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } -// CHECK: @.offload_sizes._kernel = private unnamed_addr constant [1 x i64] [i64 1024] -// CHECK: @.offload_maptypes._kernel = private unnamed_addr constant [1 x i64] [i64 35] -// CHECK: @._kernel.region_id = weak unnamed_addr constant i8 0 -// CHECK: @.offloading.entry_name._kernel = internal unnamed_addr constant [8 x i8] c"_kernel\00", section ".llvm.rodata.offloading", align 1 -// CHECK: @.offloading.entry._kernel = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel.region_id, ptr @.offloading.entry_name._kernel, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024] +// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35] +// CHECK: @._kernel_1.region_id = weak unnamed_addr constant i8 0 +// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1 +// CHECK: @.offloading.entry._kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 // CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 // CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 @@ -40,60 +40,56 @@ fn main() { // 
CHECK-NEXT: start: // CHECK-NEXT: %0 = alloca [8 x i8], align 8 // CHECK-NEXT: %x = alloca [1024 x i8], align 16 +// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0) +// CHECK-NEXT: store ptr %x, ptr %0, align 8 +// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x) +// CHECK-NEXT: ret void +// CHECK-NEXT: } + +// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x) +// CHECK-NEXT: start: +// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8 +// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8 // CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8 // CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 -// CHECK: call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false) -// CHECK-NEXT: %1 = getelementptr inbounds float, ptr %x, i32 0 -// CHECK-NEXT: call void @__tgt_register_lib(ptr %EmptyDesc) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false) +// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc) // CHECK-NEXT: call void @__tgt_init_all_rtls() -// CHECK-NEXT: %2 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: store ptr %x, ptr %2, align 8 -// CHECK-NEXT: %3 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: store ptr %1, ptr %3, align 8 -// CHECK-NEXT: %4 = getelementptr 
inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: store i64 1024, ptr %4, align 8 -// CHECK-NEXT: %5 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: %6 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: %7 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %5, ptr %6, ptr %7, ptr @.offload_maptypes.1, ptr null, ptr null) -// CHECK-NEXT: %8 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0 -// CHECK-NEXT: store i32 3, ptr %8, align 4 -// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1 -// CHECK-NEXT: store i32 1, ptr %9, align 4 -// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2 -// CHECK-NEXT: store ptr %5, ptr %10, align 8 -// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3 -// CHECK-NEXT: store ptr %6, ptr %11, align 8 -// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4 -// CHECK-NEXT: store ptr %7, ptr %12, align 8 -// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5 -// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %13, align 8 -// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6 -// CHECK-NEXT: store ptr null, ptr %14, align 8 -// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7 -// CHECK-NEXT: store ptr null, ptr %15, align 8 -// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8 -// CHECK-NEXT: store i64 0, ptr %16, align 8 -// CHECK-NEXT: %17 = getelementptr inbounds 
%struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9 -// CHECK-NEXT: store i64 0, ptr %17, align 8 -// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10 -// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %18, align 4 -// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11 -// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %19, align 4 -// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12 -// CHECK-NEXT: store i32 0, ptr %20, align 4 -// CHECK-NEXT: %21 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) -// CHECK-NEXT: %22 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: %24 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %22, ptr %23, ptr %24, ptr @.offload_maptypes.1, ptr null, ptr null) -// CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc) -// CHECK: store ptr %x, ptr %0, align 8 -// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) -// CHECK: ret void +// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8 +// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8 +// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8 +// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null) +// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8 +// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4 +// CHECK-NEXT: store i32 1, ptr %0, align 
4 +// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8 +// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8 +// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16 +// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8 +// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24 +// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8 +// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32 +// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8 +// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40 +// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false) +// CHECK-NEXT: store <4 x i32> , ptr %6, align 8 +// CHECK-NEXT: %.fca.1.gep2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88 +// CHECK-NEXT: store i32 0, ptr %.fca.1.gep2, align 8 +// CHECK-NEXT: %.fca.2.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92 +// CHECK-NEXT: store i32 0, ptr %.fca.2.gep3, align 4 +// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96 +// CHECK-NEXT: store i32 0, ptr %7, align 8 +// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args) +// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null) +// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc) +// CHECK-NEXT: ret void // CHECK-NEXT: } // CHECK: Function Attrs: nounwind @@ -101,13 +97,13 @@ fn main() { #[unsafe(no_mangle)] #[inline(never)] -pub fn kernel(x: &mut [f32; 256]) { - 
core::intrinsics::offload(_kernel, (x,)) +pub fn kernel_1(x: &mut [f32; 256]) { + core::intrinsics::offload(_kernel_1, (x,)) } #[unsafe(no_mangle)] #[inline(never)] -pub fn _kernel(x: &mut [f32; 256]) { +pub fn _kernel_1(x: &mut [f32; 256]) { for i in 0..256 { x[i] = 21.0; } From 68a7a9fc9a1fad23ddf40a9e4ea9acd1e7a8a8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Thu, 13 Nov 2025 20:05:50 +0100 Subject: [PATCH 10/14] Add mapping bitflags and general cleanup --- compiler/rustc_codegen_llvm/src/back/lto.rs | 10 +- .../src/builder/gpu_offload.rs | 43 +------ compiler/rustc_codegen_llvm/src/intrinsic.rs | 6 +- compiler/rustc_codegen_llvm/src/lib.rs | 2 - compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 6 - compiler/rustc_middle/src/ty/offload_meta.rs | 108 +++++++++++------- tests/codegen-llvm/gpu_offload/gpu_host.rs | 12 +- 7 files changed, 80 insertions(+), 107 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs index b820b992105fd..482e954138553 100644 --- a/compiler/rustc_codegen_llvm/src/back/lto.rs +++ b/compiler/rustc_codegen_llvm/src/back/lto.rs @@ -26,7 +26,7 @@ use crate::back::write::{ }; use crate::errors::{LlvmError, LtoBitcodeFromRlib}; use crate::llvm::{self, build_string}; -use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; +use crate::{LlvmCodegenBackend, ModuleLlvm}; /// We keep track of the computed LTO cache keys from the previous /// session to determine which CGUs we can reuse. @@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager( // We then run the llvm_optimize function a second time, to optimize the code which we generated // in the enzyme differentiation pass. 
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable); - let enable_gpu = config.offload.contains(&config::Offload::Enable); let stage = if thin { write::AutodiffStage::PreAD } else { @@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager( write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage); } - // Here we only handle the GPU host (=cpu) code. - if enable_gpu && !thin && !cgcx.target_is_like_gpu { - let cx = - SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size); - crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx); - } - if cfg!(feature = "llvm_enzyme") && enable_ad && !thin { let opt_stage = llvm::OptStage::FatLTO; let stage = write::AutodiffStage::PostAD; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index afa6da96849cd..c3b743cba2c3a 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -2,40 +2,13 @@ use std::ffi::CString; use llvm::Linkage::*; use rustc_abi::Align; -use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; use rustc_middle::ty::offload_meta::OffloadMetadata; -use rustc_middle::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; use crate::builder::SBuilder; use crate::llvm::AttributePlace::Function; use crate::llvm::{self, BasicBlock, Linkage, Type, Value}; -use crate::{LlvmCodegenBackend, SimpleCx, attributes}; - -pub(crate) fn handle_gpu_code<'ll>( - _cgcx: &CodegenContext, - cx: &'ll SimpleCx<'_>, -) { - /* - // The offload memory transfer type for each kernel - let mut memtransfer_types = vec![]; - let mut region_ids = vec![]; - let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); - // This is a temporary hack, we only search for kernel_0 to kernel_9 functions. 
- // There is a draft PR in progress which will introduce a proper offload intrinsic to remove - // this limitation. - for num in 0..9 { - let kernel = cx.get_function(&format!("kernel_{num}")); - if let Some(kernel) = kernel { - let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num); - memtransfer_types.push(o); - region_ids.push(k); - } - } - - gen_call_handling(&cx, &memtransfer_types, ®ion_ids); - */ -} +use crate::{SimpleCx, attributes}; // ; Function Attrs: nounwind // declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2 @@ -273,13 +246,10 @@ pub(crate) fn gen_define_handling<'ll>( _ => None, }); + // FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) = - ptr_meta.map(|m| (m.payload_size, m.mode as u64 | 0x20)).unzip(); + ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip(); - // We do not know their size anymore at this level, so hardcode a placeholder. - // A follow-up pr will track these from the frontend, where we still have Rust types. - // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes. - // I decided that 1024 bytes is a great placeholder value for now. let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2), // or both to and from the gpu (=3). Other values shouldn't affect us for now. @@ -305,7 +275,6 @@ pub(crate) fn gen_define_handling<'ll>( llvm::set_alignment(llglobal, Align::ONE); llvm::set_section(llglobal, c".llvm.rodata.offloading"); - // Not actively used yet, for calling real kernels let name = format!(".offloading.entry.{symbol}"); // See the __tgt_offload_entry documentation above. @@ -340,8 +309,7 @@ fn declare_offload_fn<'ll>( } // For each kernel *call*, we now use some of our previous declared globals to move data to and from -// the gpu. 
We don't have a proper frontend yet, so we assume that every call to a kernel function -// from main is intended to run on the GPU. For now, we only handle the data transfer part of it. +// the gpu. For now, we only handle the data transfer part of it. // If two consecutive kernels use the same memory, we still move it to the host and back to the gpu. // Since in our frontend users (by default) don't have to specify data transfer, this is something // we should optimize in the future! We also assume that everything should be copied back and forth, @@ -383,6 +351,7 @@ pub(crate) fn gen_call_handling<'ll>( let mut builder = SBuilder::build(cx, bb); + // prevent these globals from being optimized away for val in [offload_sizes, offload_entry] { unsafe { let dummy = llvm::LLVMBuildLoad2( @@ -447,8 +416,6 @@ pub(crate) fn gen_call_handling<'ll>( let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]); builder.store(geps[i as usize], gep2, Align::EIGHT); let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]); - // As mentioned above, we don't use Rust type information yet. So for now we will just - // assume that we have 1024 bytes, 256 f32 values. // FIXME(offload): write an offload frontend and handle arbitrary types. 
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT); } diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index fa8568d59049a..23a226e127499 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -22,7 +22,7 @@ use rustc_target::callconv::PassMode; use rustc_target::spec::Os; use tracing::debug; -use crate::abi::{FnAbiLlvmExt, LlvmType}; +use crate::abi::FnAbiLlvmExt; use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; use crate::builder::gpu_offload::TgtOffloadEntry; @@ -199,6 +199,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } sym::offload => { + // FIXME(Sa4dUs): emit error when offload is not enabled codegen_offload(self, tcx, instance, args); return Ok(()); } @@ -1259,7 +1260,6 @@ fn codegen_offload<'ll, 'tcx>( let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); - // Build TypeTree (or something similar) let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder(); let inputs = sig.inputs(); @@ -1267,7 +1267,6 @@ fn codegen_offload<'ll, 'tcx>( let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); - // TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory let (offload_sizes, memtransfer_types, region_id, offload_entry) = crate::builder::gpu_offload::gen_define_handling( cx, @@ -1277,7 +1276,6 @@ fn codegen_offload<'ll, 'tcx>( &target_symbol, ); - // TODO(Sa4dUs): this is just to a void lifetime's issues let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; crate::builder::gpu_offload::gen_call_handling( cx, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index 9406d5e0ca8c3..1b65a133d58c1 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ 
b/compiler/rustc_codegen_llvm/src/lib.rs @@ -4,8 +4,6 @@ //! //! This API is completely unstable and subject to change. -// TODO(Sa4dUs): remove this once we have a great version, just to ignore unused LLVM wrappers -#![allow(unused)] // tidy-alphabetical-start #![cfg_attr(bootstrap, feature(slice_as_array))] #![feature(assert_matches)] diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index cf94678d140d0..dfac3022eeffd 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1160,14 +1160,9 @@ unsafe extern "C" { ) -> &'a BasicBlock; // Operations on instructions - pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock; - pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>; pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>; pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock; pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>; - pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>; - pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value); - pub(crate) fn LLVMGetNumOperands(Val: &Value) -> c_uint; // Operations on call sites pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint); @@ -2453,7 +2448,6 @@ unsafe extern "C" { pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine); - pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value); pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock); pub(crate) fn LLVMRustSetModulePICLevel(M: &Module); diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index 11a0ca2741bb4..06f376d4a7d9d 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -1,34 +1,64 @@ +use bitflags::bitflags; + use crate::ty::{self, 
PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; -// TODO(Sa4dUs): it doesn't feel correct for me to place this on `rustc_ast::expand`, will look for a proper location pub struct OffloadMetadata { pub payload_size: u64, - pub mode: TransferKind, + pub mode: MappingFlags, } -// TODO(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` flag only when needed -#[repr(u64)] -#[derive(Debug, Copy, Clone)] -pub enum TransferKind { - FromGpu = 1, - ToGpu = 2, - Both = 1 + 2, +bitflags! { + /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP. + #[derive(Debug, Copy, Clone)] + #[repr(transparent)] + pub struct MappingFlags: u64 { + /// No flags. + const NONE = 0x0; + /// Allocate memory on the device and move data from host to device. + const TO = 0x01; + /// Allocate memory on the device and move data from device to host. + const FROM = 0x02; + /// Always perform the requested mapping action, even if already mapped. + const ALWAYS = 0x04; + /// Delete the element from the device environment, ignoring ref count. + const DELETE = 0x08; + /// The element being mapped is a pointer-pointee pair. + const PTR_AND_OBJ = 0x10; + /// The base address should be passed to the target kernel as argument. + const TARGET_PARAM = 0x20; + /// The runtime must return the device pointer. + const RETURN_PARAM = 0x40; + /// The reference being passed is a pointer to private data. + const PRIVATE = 0x80; + /// Pass the element by value. + const LITERAL = 0x100; + /// Implicit map (generated by compiler, not explicit in code). + const IMPLICIT = 0x200; + /// Hint to allocate memory close to the target device. + const CLOSE = 0x400; + /// Reserved (0x800 in OpenMP for XLC compatibility). + const RESERVED = 0x800; + /// Require that the data is already allocated on the device. + const PRESENT = 0x1000; + /// Increment/decrement a separate ref counter (OpenACC compatibility). + const OMPX_HOLD = 0x2000; + /// Used for non-contiguous list items in target update. 
+ const NON_CONTIG = 0x100000000000; + /// 16 MSBs indicate membership in a struct. + const MEMBER_OF = 0xffff000000000000; + } } impl OffloadMetadata { - pub fn new(payload_size: u64, mode: TransferKind) -> Self { - OffloadMetadata { payload_size, mode } - } - pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { OffloadMetadata { payload_size: get_payload_size(tcx, ty), - mode: TransferKind::from_ty(tcx, ty), + mode: MappingFlags::from_ty(tcx, ty), } } } -// TODO(Sa4dUs): WIP, rn we just have a naive logic for references +// FIXME(Sa4dUs): implement a solid logic to determine the payload size fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { match ty.kind() { ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), @@ -43,48 +73,42 @@ fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { } } -impl TransferKind { - pub fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { - // TODO(Sa4dUs): this logic is probs not fully correct, but it works for now +impl MappingFlags { + fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { + use rustc_ast::Mutability::*; + match ty.kind() { ty::Bool | ty::Char | ty::Int(_) | ty::Uint(_) - | ty::Float(_) => TransferKind::ToGpu, - - ty::Adt(_, _) + | ty::Float(_) + | ty::Adt(_, _) | ty::Tuple(_) - | ty::Array(_, _) => TransferKind::ToGpu, - - ty::RawPtr(_, rustc_ast::Mutability::Not) - | ty::Ref(_, _, rustc_ast::Mutability::Not) => TransferKind::ToGpu, - - ty::RawPtr(_, rustc_ast::Mutability::Mut) - | ty::Ref(_, _, rustc_ast::Mutability::Mut) => TransferKind::Both, - - ty::Slice(_) - | ty::Str - | ty::Dynamic(_, _) => TransferKind::Both, - - ty::FnDef(_, _) + | ty::Array(_, _) + | ty::FnDef(_, _) | ty::FnPtr(_, _) | ty::Closure(_, _) | ty::CoroutineClosure(_, _) | ty::Coroutine(_, _) - | ty::CoroutineWitness(_, _) => TransferKind::ToGpu, - - ty::Alias(_, _) + | ty::CoroutineWitness(_, _) + | ty::Never + | ty::Alias(_, _) | ty::Param(_) | ty::Bound(_, _) | 
ty::Placeholder(_) | ty::Infer(_) - | ty::Error(_) => TransferKind::ToGpu, + | ty::Error(_) => MappingFlags::TO, + + ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO, + + ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM, + + ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM, - ty::Never => TransferKind::ToGpu, - ty::Foreign(_) => TransferKind::Both, - ty::Pat(_, _) => TransferKind::Both, - ty::UnsafeBinder(_) => TransferKind::Both, + ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => { + MappingFlags::TO | MappingFlags::FROM + } } } } diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index c53968f558acc..cf303dbb15d81 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -28,9 +28,9 @@ fn main() { // CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024] // CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35] -// CHECK: @._kernel_1.region_id = weak unnamed_addr constant i8 0 +// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0 // CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1 -// CHECK: @.offloading.entry._kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 // CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 // CHECK: @anon.{{.*}}.1 = private 
unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 @@ -80,10 +80,10 @@ fn main() { // CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72 // CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false) // CHECK-NEXT: store <4 x i32> , ptr %6, align 8 -// CHECK-NEXT: %.fca.1.gep2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88 -// CHECK-NEXT: store i32 0, ptr %.fca.1.gep2, align 8 -// CHECK-NEXT: %.fca.2.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92 -// CHECK-NEXT: store i32 0, ptr %.fca.2.gep3, align 4 +// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88 +// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8 +// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92 +// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4 // CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96 // CHECK-NEXT: store i32 0, ptr %7, align 8 // CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args) From 9c9aac045fc8663defef4e13a30b58253f96a685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Fri, 14 Nov 2025 23:25:48 +0100 Subject: [PATCH 11/14] Iterate over params and minor fixes --- .../src/builder/gpu_offload.rs | 3 +-- compiler/rustc_codegen_llvm/src/intrinsic.rs | 2 +- compiler/rustc_middle/src/ty/offload_meta.rs | 25 ++++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index c3b743cba2c3a..3b484449f7e30 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -390,8 +390,7 @@ pub(crate) fn 
gen_call_handling<'ll>( let mut vals = vec![]; let mut geps = vec![]; let i32_0 = cx.get_const_i32(0); - for index in 0..num_args { - let v = args[index as usize]; + for &v in args { let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]); vals.push(v); geps.push(gep); diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 23a226e127499..66d419415c791 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -199,7 +199,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } sym::offload => { - // FIXME(Sa4dUs): emit error when offload is not enabled + // TODO(Sa4dUs): emit error when offload is not enabled codegen_offload(self, tcx, instance, args); return Ok(()); } diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index 06f376d4a7d9d..b7261b07e1bb6 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -86,19 +86,8 @@ impl MappingFlags { | ty::Adt(_, _) | ty::Tuple(_) | ty::Array(_, _) - | ty::FnDef(_, _) - | ty::FnPtr(_, _) - | ty::Closure(_, _) - | ty::CoroutineClosure(_, _) - | ty::Coroutine(_, _) - | ty::CoroutineWitness(_, _) - | ty::Never | ty::Alias(_, _) - | ty::Param(_) - | ty::Bound(_, _) - | ty::Placeholder(_) - | ty::Infer(_) - | ty::Error(_) => MappingFlags::TO, + | ty::Param(_) => MappingFlags::TO, ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO, @@ -109,6 +98,18 @@ impl MappingFlags { ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => { MappingFlags::TO | MappingFlags::FROM } + + ty::FnDef(_, _) + | ty::FnPtr(_, _) + | ty::Closure(_, _) + | ty::CoroutineClosure(_, _) + | ty::Coroutine(_, _) + | ty::CoroutineWitness(_, _) + | ty::Never + | ty::Bound(_, _) + | ty::Placeholder(_) + | ty::Infer(_) + | ty::Error(_) => MappingFlags::TO, /* TODO(Sa4dUs): emit error */ } } } 
From d43655b8d716b563780555abdc48797a20b78942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Sat, 15 Nov 2025 10:36:56 +0100 Subject: [PATCH 12/14] Emit errors when invalid config --- compiler/rustc_codegen_llvm/messages.ftl | 3 +++ compiler/rustc_codegen_llvm/src/errors.rs | 8 +++++++ compiler/rustc_codegen_llvm/src/intrinsic.rs | 19 ++++++++++++++-- compiler/rustc_middle/src/ty/offload_meta.rs | 8 +++++-- tests/ui/offload/check_config.fail.stderr | 6 +++++ tests/ui/offload/check_config.rs | 23 ++++++++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 tests/ui/offload/check_config.fail.stderr create mode 100644 tests/ui/offload/check_config.rs diff --git a/compiler/rustc_codegen_llvm/messages.ftl b/compiler/rustc_codegen_llvm/messages.ftl index c9d28160d66f7..0e7b00d0bcb70 100644 --- a/compiler/rustc_codegen_llvm/messages.ftl +++ b/compiler/rustc_codegen_llvm/messages.ftl @@ -18,6 +18,9 @@ codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for codegen_llvm_mismatch_data_layout = data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}` +codegen_llvm_offload_without_enable = using the offload feature requires -Z offload=Enable +codegen_llvm_offload_without_fat_lto = using the offload feature requires -C lto=fat + codegen_llvm_parse_bitcode = failed to parse bitcode for LTO module codegen_llvm_parse_bitcode_with_llvm_err = failed to parse bitcode for LTO module: {$llvm_err} diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs index 629afee8a6677..dd9fde0b08c6f 100644 --- a/compiler/rustc_codegen_llvm/src/errors.rs +++ b/compiler/rustc_codegen_llvm/src/errors.rs @@ -40,6 +40,14 @@ pub(crate) struct AutoDiffWithoutLto; #[diag(codegen_llvm_autodiff_without_enable)] pub(crate) struct AutoDiffWithoutEnable; +#[derive(Diagnostic)]
+#[diag(codegen_llvm_offload_without_enable)] +pub(crate) struct OffloadWithoutEnable; + +#[derive(Diagnostic)] +#[diag(codegen_llvm_offload_without_fat_lto)] +pub(crate) struct OffloadWithoutFatLTO; + #[derive(Diagnostic)] #[diag(codegen_llvm_lto_bitcode_from_rlib)] pub(crate) struct LtoBitcodeFromRlib { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 66d419415c791..eb3f0affc9e29 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -27,7 +27,9 @@ use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; use crate::builder::gpu_offload::TgtOffloadEntry; use crate::context::CodegenCx; -use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto}; +use crate::errors::{ + AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO, +}; use crate::llvm::{self, Metadata, Type, Value}; use crate::type_of::LayoutLlvmExt; use crate::va_arg::emit_va_arg; @@ -199,7 +201,20 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } sym::offload => { - // TODO(Sa4dUs): emit error when offload is not enabled + if !tcx + .sess + .opts + .unstable_opts + .offload + .contains(&rustc_session::config::Offload::Enable) + { + let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable); + } + + if tcx.sess.lto() != rustc_session::config::Lto::Fat { + let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO); + } + codegen_offload(self, tcx, instance, args); return Ok(()); } diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index b7261b07e1bb6..04a7cd2c75f28 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -74,7 +74,7 @@ fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 { } impl MappingFlags { - fn from_ty<'tcx>(_tcx: 
TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { + fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self { use rustc_ast::Mutability::*; match ty.kind() { @@ -109,7 +109,11 @@ impl MappingFlags { | ty::Bound(_, _) | ty::Placeholder(_) | ty::Infer(_) - | ty::Error(_) => MappingFlags::TO, /* TODO(Sa4dUs): emit error */ + | ty::Error(_) => { + tcx.dcx() + .span_err(rustc_span::DUMMY_SP, format!("type `{ty:?}` cannot be offloaded")); + MappingFlags::empty() + } } } } diff --git a/tests/ui/offload/check_config.fail.stderr b/tests/ui/offload/check_config.fail.stderr new file mode 100644 index 0000000000000..a9162ed926cb0 --- /dev/null +++ b/tests/ui/offload/check_config.fail.stderr @@ -0,0 +1,6 @@ +error: using the offload feature requires -Z offload=Enable + +error: using the offload feature requires -C lto=fat + +error: aborting due to 2 previous errors + diff --git a/tests/ui/offload/check_config.rs b/tests/ui/offload/check_config.rs new file mode 100644 index 0000000000000..667c6d9788bae --- /dev/null +++ b/tests/ui/offload/check_config.rs @@ -0,0 +1,23 @@ +//@ revisions: pass fail +//@ no-prefer-dynamic +//@ needs-enzyme +//@[pass] build-pass +//@[fail] build-fail +//@[pass] compile-flags: -Zunstable-options -Zoffload=Enable -Clto=fat --emit=metadata +//@[fail] compile-flags: -Clto=thin + +//[fail]~? ERROR: using the offload feature requires -Z offload=Enable +//[fail]~? 
ERROR: using the offload feature requires -C lto=fat + +#![feature(core_intrinsics)] + +fn main() { + let mut x = [3.0; 256]; + kernel_1(&mut x); +} + +fn kernel_1(x: &mut [f32; 256]) { + core::intrinsics::offload(_kernel_1, (x,)) +} + +fn _kernel_1(x: &mut [f32; 256]) {} From a4c3e6c5b67aa6b0b1e2e325bd864a73149819d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Sun, 16 Nov 2025 12:29:35 +0100 Subject: [PATCH 13/14] Add intrinsic doc comment --- library/core/src/intrinsics/mod.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 97bf21c88e203..b386cffd061b4 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3304,6 +3304,31 @@ pub const fn copysignf128(x: f128, y: f128) -> f128; #[rustc_intrinsic] pub const fn autodiff(f: F, df: G, args: T) -> R; +/// Generates the LLVM body of a wrapper function to offload a kernel `f`. +/// +/// Type Parameters: +/// - `F`: The kernel to offload. Must be a function item. +/// - `T`: A tuple of arguments passed to `f`. +/// - `R`: The return type of the kernel. 
+/// +/// Example usage (pseudocode): +/// +/// ```rust,ignore (pseudocode) +/// fn kernel(x: *mut [f64; 128]) { +/// core::intrinsics::offload(kernel_1, (x,)) +/// } +/// +/// #[cfg(target_os = "linux")] +/// extern "C" { +/// pub fn kernel_1(array_b: *mut [f64; 128]); +/// } +/// +/// #[cfg(not(target_os = "linux"))] +/// #[rustc_offload_kernel] +/// extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) { +/// unsafe { (*x)[0] = 21.0 }; +/// } +/// ``` #[rustc_nounwind] #[rustc_intrinsic] pub const fn offload(f: F, args: T) -> R; From 0cee58b40085f4e8b0e9a4ec7c950b39ed00447d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Mon, 17 Nov 2025 18:40:25 +0100 Subject: [PATCH 14/14] Update rustc-dev-guide --- library/core/src/intrinsics/mod.rs | 3 +++ src/doc/rustc-dev-guide/src/offload/usage.md | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index b386cffd061b4..5587a8d16fc62 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3329,6 +3329,9 @@ pub const fn autodiff(f: F, df: G, args: T) -> /// unsafe { (*x)[0] = 21.0 }; /// } /// ``` +/// +/// For reference, see the Clang documentation on offloading: +/// . #[rustc_nounwind] #[rustc_intrinsic] pub const fn offload(f: F, args: T) -> R; diff --git a/src/doc/rustc-dev-guide/src/offload/usage.md b/src/doc/rustc-dev-guide/src/offload/usage.md index 9d5839334b1a9..a5b36b1223c37 100644 --- a/src/doc/rustc-dev-guide/src/offload/usage.md +++ b/src/doc/rustc-dev-guide/src/offload/usage.md @@ -5,6 +5,8 @@ We currently work on launching the following Rust kernel on the GPU. 
To follow a ```rust #![feature(abi_gpu_kernel)] +#![feature(rustc_attrs)] +#![feature(core_intrinsics)] #![no_std] #[cfg(target_os = "linux")] @@ -12,6 +14,7 @@ extern crate libc; #[cfg(target_os = "linux")] use libc::c_char; +#[cfg(target_os = "linux")] use core::mem; #[panic_handler] @@ -38,7 +41,7 @@ fn main() { } unsafe { - kernel_1(array_c); + kernel(array_c); } core::hint::black_box(&array_c); unsafe { @@ -52,6 +55,11 @@ fn main() { } } +#[inline(never)] +unsafe fn kernel(x: *mut [f64; 256]) { + core::intrinsics::offload(kernel_1, (x,)) +} + #[cfg(target_os = "linux")] unsafe extern "C" { pub fn kernel_1(array_b: *mut [f64; 256]); @@ -60,6 +68,7 @@ unsafe extern "C" { #[cfg(not(target_os = "linux"))] #[unsafe(no_mangle)] #[inline(never)] +#[rustc_offload_kernel] pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) { unsafe { (*x)[0] = 21.0 }; } @@ -76,7 +85,7 @@ rustc +offload --edition 2024 src/lib.rs -g --crate-type cdylib -C opt-level=3 - Now we generate the device code. Replace the target-cpu with the right code for your gpu. ``` -RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core +RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core ``` Now find the `.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).