diff --git a/compiler/rustc_codegen_llvm/messages.ftl b/compiler/rustc_codegen_llvm/messages.ftl index c9d28160d66f7..0e7b00d0bcb70 100644 --- a/compiler/rustc_codegen_llvm/messages.ftl +++ b/compiler/rustc_codegen_llvm/messages.ftl @@ -18,6 +18,9 @@ codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for codegen_llvm_mismatch_data_layout = data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}` +codegen_llvm_offload_without_enable = using the offload feature requires -Z offload=Enable +codegen_llvm_offload_without_fat_lto = using the offload feature requires -C lto=fat + codegen_llvm_parse_bitcode = failed to parse bitcode for LTO module codegen_llvm_parse_bitcode_with_llvm_err = failed to parse bitcode for LTO module: {$llvm_err} diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs index 89878d1e7e20b..a25ce9e5a90ac 100644 --- a/compiler/rustc_codegen_llvm/src/attributes.rs +++ b/compiler/rustc_codegen_llvm/src/attributes.rs @@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[ } } +pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool { + llvm::HasStringAttribute(llfn, name) +} + +pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) { + llvm::RemoveStringAttrFromFn(llfn, name); +} + /// Get LLVM attribute for the provided inline heuristic. pub(crate) fn inline_attr<'ll, 'tcx>( cx: &SimpleCx<'ll>, @@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>( to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins")); } + if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) { + to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel")) + } + if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) { to_add.push(AttributeKind::Cold.create_attr(cx.llcx)); } diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs index b820b992105fd..482e954138553 100644 --- a/compiler/rustc_codegen_llvm/src/back/lto.rs +++ b/compiler/rustc_codegen_llvm/src/back/lto.rs @@ -26,7 +26,7 @@ use crate::back::write::{ }; use crate::errors::{LlvmError, LtoBitcodeFromRlib}; use crate::llvm::{self, build_string}; -use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; +use crate::{LlvmCodegenBackend, ModuleLlvm}; /// We keep track of the computed LTO cache keys from the previous /// session to determine which CGUs we can reuse. @@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager( // We then run the llvm_optimize function a second time, to optimize the code which we generated // in the enzyme differentiation pass. let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable); - let enable_gpu = config.offload.contains(&config::Offload::Enable); let stage = if thin { write::AutodiffStage::PreAD } else { @@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager( write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage); } - // Here we only handle the GPU host (=cpu) code. 
-    if enable_gpu && !thin && !cgcx.target_is_like_gpu {
-        let cx =
-            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
-        crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
-    }
-
     if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
         let opt_stage = llvm::OptStage::FatLTO;
         let stage = write::AutodiffStage::PostAD;
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index fde7dd6ef7a85..4db4283adb404 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -43,7 +43,7 @@ use crate::errors::{
 use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
 use crate::llvm::{self, DiagnosticInfo};
 use crate::type_::llvm_type_ptr;
-use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util};
+use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util};

 pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! {
     match llvm::last_error() {
@@ -706,11 +706,12 @@ pub(crate) unsafe fn llvm_optimize(
             SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
         // For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
         // introducing a proper offload intrinsic to solve this limitation.
-        for num in 0..9 {
-            let name = format!("kernel_{num}");
-            if let Some(kernel) = cx.get_function(&name) {
-                handle_offload(&cx, kernel);
+        for func in cx.get_functions() {
+            let offload_kernel = "offload-kernel";
+            if attributes::has_string_attr(func, offload_kernel) {
+                handle_offload(&cx, func);
             }
+            attributes::remove_string_attr_from_llfn(func, offload_kernel);
         }
     }
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index 5c2f8f700627e..3b484449f7e30 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -2,37 +2,13 @@ use std::ffi::CString;

 use llvm::Linkage::*;
 use rustc_abi::Align;
-use rustc_codegen_ssa::back::write::CodegenContext;
 use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
+use rustc_middle::ty::offload_meta::OffloadMetadata;

 use crate::builder::SBuilder;
-use crate::common::AsCCharPtr;
 use crate::llvm::AttributePlace::Function;
-use crate::llvm::{self, Linkage, Type, Value};
-use crate::{LlvmCodegenBackend, SimpleCx, attributes};
-
-pub(crate) fn handle_gpu_code<'ll>(
-    _cgcx: &CodegenContext<LlvmCodegenBackend>,
-    cx: &'ll SimpleCx<'_>,
-) {
-    // The offload memory transfer type for each kernel
-    let mut memtransfer_types = vec![];
-    let mut region_ids = vec![];
-    let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
-    // This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
-    // There is a draft PR in progress which will introduce a proper offload intrinsic to remove
-    // this limitation.
- for num in 0..9 { - let kernel = cx.get_function(&format!("kernel_{num}")); - if let Some(kernel) = kernel { - let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num); - memtransfer_types.push(o); - region_ids.push(k); - } - } - - gen_call_handling(&cx, &memtransfer_types, ®ion_ids); -} +use crate::llvm::{self, BasicBlock, Linkage, Type, Value}; +use crate::{SimpleCx, attributes}; // ; Function Attrs: nounwind // declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2 @@ -79,7 +55,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { at_one } -struct TgtOffloadEntry { +pub(crate) struct TgtOffloadEntry { // uint64_t Reserved; // uint16_t Version; // uint16_t Kind; @@ -167,7 +143,7 @@ impl KernelArgsTy { fn new<'ll>( cx: &'ll SimpleCx<'_>, num_args: u64, - memtransfer_types: &[&'ll Value], + memtransfer_types: &'ll Value, geps: [&'ll Value; 3], ) -> [(Align, &'ll Value); 13] { let four = Align::from_bytes(4).expect("4 Byte alignment should work"); @@ -181,7 +157,7 @@ impl KernelArgsTy { (eight, geps[0]), (eight, geps[1]), (eight, geps[2]), - (eight, memtransfer_types[0]), + (eight, memtransfer_types), // The next two are debug infos. FIXME(offload): set them (eight, cx.const_null(cx.type_ptr())), // dbg (eight, cx.const_null(cx.type_ptr())), // dbg @@ -256,68 +232,68 @@ pub(crate) fn add_global<'ll>( // This function returns a memtransfer value which encodes how arguments to this kernel shall be // mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be // concatenated into the list of region_ids. -fn gen_define_handling<'ll>( - cx: &'ll SimpleCx<'_>, - kernel: &'ll llvm::Value, +pub(crate) fn gen_define_handling<'ll>( + cx: &SimpleCx<'ll>, offload_entry_ty: &'ll llvm::Type, - num: i64, -) -> (&'ll llvm::Value, &'ll llvm::Value) { - let types = cx.func_params_types(cx.get_type_of_global(kernel)); + metadata: &[OffloadMetadata], + types: &[&Type], + symbol: &str, +) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value) { // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. - let num_ptr_types = types - .iter() - .filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer)) - .count(); - - // We do not know their size anymore at this level, so hardcode a placeholder. - // A follow-up pr will track these from the frontend, where we still have Rust types. - // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes. - // I decided that 1024 bytes is a great placeholder value for now. - add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]); + let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) { + rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta), + _ => None, + }); + + // FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary + let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) = + ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip(); + + let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes); // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2), // or both to and from the gpu (=3). Other values shouldn't affect us for now. // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten // will be 2. For now, everything is 3, until we have our frontend set up. 
// 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later). - let memtransfer_types = add_priv_unnamed_arr( - &cx, - &format!(".offload_maptypes.{num}"), - &vec![1 + 2 + 32; num_ptr_types], - ); + let memtransfer_types = + add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer); + // Next: For each function, generate these three entries. A weak constant, // the llvm.rodata entry name, and the llvm_offload_entries value - let name = format!(".kernel_{num}.region_id"); + let name = format!(".{symbol}.region_id"); let initializer = cx.get_const_i8(0); let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage); - let c_entry_name = CString::new(format!("kernel_{num}")).unwrap(); + let c_entry_name = CString::new(symbol).unwrap(); let c_val = c_entry_name.as_bytes_with_nul(); - let offload_entry_name = format!(".offloading.entry_name.{num}"); + let offload_entry_name = format!(".offloading.entry_name.{symbol}"); let initializer = crate::common::bytes_in_context(cx.llcx, c_val); let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage); llvm::set_alignment(llglobal, Align::ONE); llvm::set_section(llglobal, c".llvm.rodata.offloading"); - let name = format!(".offloading.entry.kernel_{num}"); + + let name = format!(".offloading.entry.{symbol}"); // See the __tgt_offload_entry documentation above. let elems = TgtOffloadEntry::new(&cx, region_id, llglobal); let initializer = crate::common::named_struct(offload_entry_ty, &elems); let c_name = CString::new(name).unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, WeakAnyLinkage); - llvm::set_initializer(llglobal, initializer); - llvm::set_alignment(llglobal, Align::EIGHT); + let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name); + llvm::set_global_constant(offload_entry, true); + llvm::set_linkage(offload_entry, WeakAnyLinkage); + llvm::set_initializer(offload_entry, initializer); + llvm::set_alignment(offload_entry, Align::EIGHT); let c_section_name = CString::new("llvm_offload_entries").unwrap(); - llvm::set_section(llglobal, &c_section_name); - (memtransfer_types, region_id) + llvm::set_section(offload_entry, &c_section_name); + + (offload_sizes, memtransfer_types, region_id, offload_entry) } -pub(crate) fn declare_offload_fn<'ll>( +fn declare_offload_fn<'ll>( cx: &'ll SimpleCx<'_>, name: &str, ty: &'ll llvm::Type, @@ -333,8 +309,7 @@ pub(crate) fn declare_offload_fn<'ll>( } // For each kernel *call*, we now use some of our previous declared globals to move data to and from -// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function -// from main is intended to run on the GPU. For now, we only handle the data transfer part of it. +// the gpu. For now, we only handle the data transfer part of it. // If two consecutive kernels use the same memory, we still move it to the host and back to the gpu. // Since in our frontend users (by default) don't have to specify data transfer, this is something // we should optimize in the future! We also assume that everything should be copied back and forth, @@ -352,10 +327,16 @@ pub(crate) fn declare_offload_fn<'ll>( // 4. set insert point after kernel call. // 5. generate all the GEPS and stores, to be used in 6) // 6. 
generate __tgt_target_data_end calls to move data from the GPU -fn gen_call_handling<'ll>( - cx: &'ll SimpleCx<'_>, - memtransfer_types: &[&'ll llvm::Value], - region_ids: &[&'ll llvm::Value], +pub(crate) fn gen_call_handling<'ll>( + cx: &SimpleCx<'ll>, + bb: &BasicBlock, + offload_sizes: &'ll llvm::Value, + offload_entry: &'ll llvm::Value, + memtransfer_types: &'ll llvm::Value, + region_id: &'ll llvm::Value, + args: &[&'ll Value], + types: &[&Type], + metadata: &[OffloadMetadata], ) { let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } @@ -368,27 +349,26 @@ fn gen_call_handling<'ll>( let tgt_kernel_decl = KernelArgsTy::new_decl(&cx); let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx); - let main_fn = cx.get_function("main"); - let Some(main_fn) = main_fn else { return }; - let kernel_name = "kernel_1"; - let call = unsafe { - llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len()) - }; - let Some(kernel_call) = call else { - return; - }; - let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) }; - let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() }; - let mut builder = SBuilder::build(cx, kernel_call_bb); - - let types = cx.func_params_types(cx.get_type_of_global(called)); + let mut builder = SBuilder::build(cx, bb); + + // prevent these globals from being optimized away + for val in [offload_sizes, offload_entry] { + unsafe { + let dummy = llvm::LLVMBuildLoad2( + &builder.llbuilder, + llvm::LLVMTypeOf(val), + val, + b"dummy\0".as_ptr() as *const _, + ); + llvm::LLVMSetVolatile(dummy, llvm::TRUE); + } + } + let num_args = types.len() as u64; // Step 0) // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // %6 = alloca %struct.__tgt_bin_desc, align 8 - unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) }; - let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc"); let ty = cx.type_array(cx.type_ptr(), num_args); @@ -404,15 +384,13 @@ fn gen_call_handling<'ll>( let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); // Step 1) - unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) }; builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT); // Now we allocate once per function param, a copy to be passed to one of our maps. let mut vals = vec![]; let mut geps = vec![]; let i32_0 = cx.get_const_i32(0); - for index in 0..types.len() { - let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() }; + for &v in args { let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]); vals.push(v); geps.push(gep); @@ -437,10 +415,8 @@ fn gen_call_handling<'ll>( let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]); builder.store(geps[i as usize], gep2, Align::EIGHT); let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]); - // As mentioned above, we don't use Rust type information yet. So for now we will just - // assume that we have 1024 bytes, 256 f32 values. // FIXME(offload): write an offload frontend and handle arbitrary types. 
-        builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
+        builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
     }

     // For now we have a very simplistic indexing scheme into our
@@ -482,7 +458,7 @@ fn gen_call_handling<'ll>(

     // Step 2)
     let s_ident_t = generate_at_one(&cx);
-    let o = memtransfer_types[0];
+    let o = memtransfer_types;
     let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
     generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
     let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);
@@ -501,16 +477,11 @@ fn gen_call_handling<'ll>(
         // FIXME(offload): Don't hardcode the numbers of threads in the future.
         cx.get_const_i32(2097152),
         cx.get_const_i32(256),
-        region_ids[0],
+        region_id,
         a5,
     ];
-    let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
+    builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
     // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
-    unsafe {
-        let next = llvm::LLVMGetNextInstruction(offload_success).unwrap();
-        llvm::LLVMRustPositionAfter(builder.llbuilder, next);
-        llvm::LLVMInstructionEraseFromParent(next);
-    }

     // Step 4)
     let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
@@ -519,8 +490,4 @@ fn gen_call_handling<'ll>(
     builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);

     drop(builder);
-    // FIXME(offload) The issue is that we right now add a call to the gpu version of the function,
-    // and then delete the call to the CPU version. In the future, we should use an intrinsic which
-    // directly resolves to a call to the GPU version.
-    unsafe { llvm::LLVMDeleteFunction(called) };
 }
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index b60c8a7d37193..6caf60e3cc41e 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -791,6 +791,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
         }
     }
+
+    pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
+        let mut functions = vec![];
+        let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
+        while let Some(f) = func {
+            functions.push(f);
+            func = unsafe { llvm::LLVMGetNextFunction(f) }
+        }
+        functions
+    }
 }

 impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs
index 629afee8a6677..dd9fde0b08c6f 100644
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@@ -40,6 +40,14 @@ pub(crate) struct AutoDiffWithoutLto;
 #[diag(codegen_llvm_autodiff_without_enable)]
 pub(crate) struct AutoDiffWithoutEnable;

+#[derive(Diagnostic)]
+#[diag(codegen_llvm_offload_without_enable)]
+pub(crate) struct OffloadWithoutEnable;
+
+#[derive(Diagnostic)]
+#[diag(codegen_llvm_offload_without_fat_lto)]
+pub(crate) struct OffloadWithoutFatLTO;
+
 #[derive(Diagnostic)]
 #[diag(codegen_llvm_lto_bitcode_from_rlib)]
 pub(crate) struct LtoBitcodeFromRlib {
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 84fc6ebbc3172..eb3f0affc9e29 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -13,6 +13,7 @@ use rustc_hir::def_id::LOCAL_CRATE;
 use rustc_hir::{self as hir};
 use rustc_middle::mir::BinOp;
 use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf};
+use rustc_middle::ty::offload_meta::OffloadMetadata;
 use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv};
 use rustc_middle::{bug, span_bug};
 use rustc_span::{Span, Symbol, sym};
@@ -24,8 +25,11 @@ use tracing::debug;

 use crate::abi::FnAbiLlvmExt;
 use crate::builder::Builder;
 use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
+use crate::builder::gpu_offload::TgtOffloadEntry;
 use crate::context::CodegenCx;
-use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto};
+use crate::errors::{
+    AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO,
+};
 use crate::llvm::{self, Metadata, Type, Value};
 use crate::type_of::LayoutLlvmExt;
 use crate::va_arg::emit_va_arg;
@@ -196,6 +200,24 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 codegen_autodiff(self, tcx, instance, args, result);
                 return Ok(());
             }
+            sym::offload => {
+                if !tcx
+                    .sess
+                    .opts
+                    .unstable_opts
+                    .offload
+                    .contains(&rustc_session::config::Offload::Enable)
+                {
+                    let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
+                }
+
+                if tcx.sess.lto() != rustc_session::config::Lto::Fat {
+                    let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
+                }
+
+                codegen_offload(self, tcx, instance, args);
+                return Ok(());
+            }
             sym::is_val_statically_known => {
                 if let OperandValue::Immediate(imm) = args[0].val {
                     self.call_intrinsic(
@@ -1221,6 +1243,68 @@ fn codegen_autodiff<'ll, 'tcx>(
     );
 }

+fn codegen_offload<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    tcx: TyCtxt<'tcx>,
+    instance: ty::Instance<'tcx>,
+    args: &[OperandRef<'tcx, &'ll Value>],
+) {
+    let cx = bx.cx;
+    let fn_args = instance.args;
+
+    let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() {
+        ty::FnDef(def_id, params) => (def_id, params),
+        _ => bug!("invalid offload intrinsic arg"),
+    };
+
+    let fn_target = match Instance::try_resolve(tcx, cx.typing_env(), *target_id, target_args) {
+        Ok(Some(instance)) => instance,
+        Ok(None) => bug!(
+            "could not resolve ({:?}, {:?}) to a specific offload instance",
+            target_id,
+            target_args
+        ),
+        Err(_) => {
+            // An error has already been emitted
+            return;
+        }
+    };
+
+    let args = get_args_from_tuple(bx, args[1], fn_target);
+    let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
+
+    let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
+
+    let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder();
+    let inputs = sig.inputs();
+
+    let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
+
+    let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
+
+    let (offload_sizes, memtransfer_types, region_id, offload_entry) =
+        crate::builder::gpu_offload::gen_define_handling(
+            cx,
+            offload_entry_ty,
+            &metadata,
+            &types,
+            &target_symbol,
+        );
+
+    let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) };
+    crate::builder::gpu_offload::gen_call_handling(
+        cx,
+        bb,
+        offload_sizes,
+        offload_entry,
+        memtransfer_types,
+        region_id,
+        &args,
+        &types,
+        &metadata,
+    );
+}
+
 fn get_args_from_tuple<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     tuple_op: OperandRef<'tcx, &'ll Value>,
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index ca64d96c2a33c..dfac3022eeffd 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ 
b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1160,13 +1160,9 @@ unsafe extern "C" { ) -> &'a BasicBlock; // Operations on instructions - pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock; - pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>; pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>; pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock; pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>; - pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>; - pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value); // Operations on call sites pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint); @@ -2452,7 +2448,6 @@ unsafe extern "C" { pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine); - pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value); pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock); pub(crate) fn LLVMRustSetModulePICLevel(M: &Module); diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs index 4c58a92106d5c..55a4b415a4e27 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs @@ -43,6 +43,14 @@ pub(crate) fn AddFunctionAttributes<'ll>( } } +pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool { + unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) } +} + +pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) { + unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) } +} + pub(crate) fn AddCallSiteAttributes<'ll>( callsite: &'ll Value, idx: AttributePlace, diff --git a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs index fd3d7d2a3ded0..0ab0cb0ef88a5 100644 --- a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs +++ b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs @@ -334,6 +334,9 @@ fn process_builtin_attrs( codegen_fn_attrs.patchable_function_entry = parse_patchable_function_entry(tcx, attr); } + sym::rustc_offload_kernel => { + codegen_fn_attrs.flags |= CodegenFnAttrFlags::OFFLOAD_KERNEL + } _ => {} } } diff --git a/compiler/rustc_feature/src/builtin_attrs.rs b/compiler/rustc_feature/src/builtin_attrs.rs index 4d50b9683fc57..0e48d943eab2e 100644 --- a/compiler/rustc_feature/src/builtin_attrs.rs +++ b/compiler/rustc_feature/src/builtin_attrs.rs @@ -1100,6 +1100,11 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[ rustc_autodiff, Normal, template!(Word, List: &[r#""...""#]), DuplicatesOk, EncodeCrossCrate::Yes, + ), + rustc_attr!( + rustc_offload_kernel, Normal, + template!(Word), DuplicatesOk, + EncodeCrossCrate::Yes, ), // Traces that are left when `cfg` and `cfg_attr` attributes are expanded. 
// The attributes are not gated, to avoid stability errors, but they cannot be used in stable diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index d87a154b0f1bb..2996bd3a65188 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -163,6 +163,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::minnumf128 | sym::mul_with_overflow | sym::needs_drop + | sym::offload | sym::overflow_checks | sym::powf16 | sym::powf32 @@ -311,6 +312,7 @@ pub(crate) fn check_intrinsic_type( let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity(); (0, 0, vec![type_id, type_id], tcx.types.bool) } + sym::offload => (3, 0, vec![param(0), param(1)], param(2)), sym::offset => (2, 0, vec![param(0), param(1)], param(0)), sym::arith_offset => ( 1, diff --git a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs index 5a28d56d4e549..9630cfc94b433 100644 --- a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs +++ b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs @@ -190,6 +190,8 @@ bitflags::bitflags! { const NO_BUILTINS = 1 << 15; /// Marks foreign items, to make `contains_extern_indicator` cheaper. const FOREIGN_ITEM = 1 << 16; + /// `#[rustc_offload_kernel]`: indicates that this is an offload kernel, an extra ptr arg will be added. + const OFFLOAD_KERNEL = 1 << 17; } } rustc_data_structures::external_bitflags_debug! { CodegenFnAttrFlags } diff --git a/compiler/rustc_middle/src/ty/mod.rs b/compiler/rustc_middle/src/ty/mod.rs index 5eb8f1713a138..a6891c26d653e 100644 --- a/compiler/rustc_middle/src/ty/mod.rs +++ b/compiler/rustc_middle/src/ty/mod.rs @@ -129,6 +129,7 @@ pub mod fast_reject; pub mod inhabitedness; pub mod layout; pub mod normalize_erasing_regions; +pub mod offload_meta; pub mod pattern; pub mod print; pub mod relate; diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs new file mode 100644 index 0000000000000..04a7cd2c75f28 --- /dev/null +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -0,0 +1,119 @@ +use bitflags::bitflags; + +use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; + +pub struct OffloadMetadata { + pub payload_size: u64, + pub mode: MappingFlags, +} + +bitflags! { + /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP. + #[derive(Debug, Copy, Clone)] + #[repr(transparent)] + pub struct MappingFlags: u64 { + /// No flags. + const NONE = 0x0; + /// Allocate memory on the device and move data from host to device. + const TO = 0x01; + /// Allocate memory on the device and move data from device to host. + const FROM = 0x02; + /// Always perform the requested mapping action, even if already mapped. + const ALWAYS = 0x04; + /// Delete the element from the device environment, ignoring ref count. + const DELETE = 0x08; + /// The element being mapped is a pointer-pointee pair. + const PTR_AND_OBJ = 0x10; + /// The base address should be passed to the target kernel as argument. + const TARGET_PARAM = 0x20; + /// The runtime must return the device pointer. + const RETURN_PARAM = 0x40; + /// The reference being passed is a pointer to private data. + const PRIVATE = 0x80; + /// Pass the element by value. + const LITERAL = 0x100; + /// Implicit map (generated by compiler, not explicit in code). 
+        const IMPLICIT = 0x200;
+        /// Hint to allocate memory close to the target device.
+        const CLOSE = 0x400;
+        /// Reserved (0x800 in OpenMP for XLC compatibility).
+        const RESERVED = 0x800;
+        /// Require that the data is already allocated on the device.
+        const PRESENT = 0x1000;
+        /// Increment/decrement a separate ref counter (OpenACC compatibility).
+        const OMPX_HOLD = 0x2000;
+        /// Used for non-contiguous list items in target update.
+        const NON_CONTIG = 0x100000000000;
+        /// 16 MSBs indicate membership in a struct.
+        const MEMBER_OF = 0xffff000000000000;
+    }
+}
+
+impl OffloadMetadata {
+    pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
+        OffloadMetadata {
+            payload_size: get_payload_size(tcx, ty),
+            mode: MappingFlags::from_ty(tcx, ty),
+        }
+    }
+}
+
+// FIXME(Sa4dUs): implement a solid logic to determine the payload size
+fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
+    match ty.kind() {
+        ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
+        _ => tcx
+            .layout_of(PseudoCanonicalInput {
+                typing_env: TypingEnv::fully_monomorphized(),
+                value: ty,
+            })
+            .unwrap()
+            .size
+            .bytes(),
+    }
+}
+
+impl MappingFlags {
+    fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
+        use rustc_ast::Mutability::*;
+
+        match ty.kind() {
+            ty::Bool
+            | ty::Char
+            | ty::Int(_)
+            | ty::Uint(_)
+            | ty::Float(_)
+            | ty::Adt(_, _)
+            | ty::Tuple(_)
+            | ty::Array(_, _)
+            | ty::Alias(_, _)
+            | ty::Param(_) => MappingFlags::TO,
+
+            ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO,
+
+            ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM,
+
+            ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM,
+
+            ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => {
+                MappingFlags::TO | MappingFlags::FROM
+            }
+
+            ty::FnDef(_, _)
+            | ty::FnPtr(_, _)
+            | ty::Closure(_, _)
+            | ty::CoroutineClosure(_, _)
+            | ty::Coroutine(_, _)
+            | ty::CoroutineWitness(_, _)
+            | ty::Never
+            | ty::Bound(_, _)
+            | ty::Placeholder(_)
+            | ty::Infer(_)
+            | ty::Error(_) => {
+                tcx.dcx()
+                    .span_err(rustc_span::DUMMY_SP, format!("type `{ty:?}` cannot be offloaded"));
+                MappingFlags::empty()
+            }
+        }
+    }
+}
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 8ab8181833064..21c8d5b784ee2 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -1583,6 +1583,7 @@ symbols! {
         object_safe_for_dispatch,
         of,
         off,
+        offload,
         offset,
         offset_of,
         offset_of_enum,
@@ -1965,6 +1966,7 @@ symbols! {
         rustc_objc_class,
         rustc_objc_selector,
         rustc_object_lifetime_default,
+        rustc_offload_kernel,
         rustc_on_unimplemented,
         rustc_outlives,
         rustc_paren_sugar,
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index c987d80be8b42..5587a8d16fc62 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -3304,6 +3304,38 @@ pub const fn copysignf128(x: f128, y: f128) -> f128;
 #[rustc_intrinsic]
 pub const fn autodiff<F, G, T, R>(f: F, df: G, args: T) -> R;

+/// Generates the LLVM body of a wrapper function to offload a kernel `f`.
+///
+/// Type Parameters:
+/// - `F`: The kernel to offload. Must be a function item.
+/// - `T`: A tuple of arguments passed to `f`.
+/// - `R`: The return type of the kernel.
+///
+/// Example usage (pseudocode):
+///
+/// ```rust,ignore (pseudocode)
+/// fn kernel(x: *mut [f64; 128]) {
+///     core::intrinsics::offload(kernel_1, (x,))
+/// }
+///
+/// #[cfg(target_os = "linux")]
+/// extern "C" {
+///     pub fn kernel_1(array_b: *mut [f64; 128]);
+/// }
+///
+/// #[cfg(not(target_os = "linux"))]
+/// #[rustc_offload_kernel]
+/// extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
+///     unsafe { (*x)[0] = 21.0 };
+/// }
+/// ```
+///
+/// For reference, see the Clang documentation on offloading:
+/// <https://clang.llvm.org/docs/OffloadingDesign.html>.
+#[rustc_nounwind]
+#[rustc_intrinsic]
+pub const fn offload<F, T, R>(f: F, args: T) -> R;
+
 /// Inform Miri that a given pointer definitely has a certain alignment.
 #[cfg(miri)]
 #[rustc_allow_const_fn_unstable(const_eval_select)]
diff --git a/src/doc/rustc-dev-guide/src/offload/usage.md b/src/doc/rustc-dev-guide/src/offload/usage.md
index 9d5839334b1a9..a5b36b1223c37 100644
--- a/src/doc/rustc-dev-guide/src/offload/usage.md
+++ b/src/doc/rustc-dev-guide/src/offload/usage.md
@@ -5,6 +5,8 @@ We currently work on launching the following Rust kernel on the GPU. To follow a
 ```rust
 #![feature(abi_gpu_kernel)]
+#![feature(rustc_attrs)]
+#![feature(core_intrinsics)]
 #![no_std]

 #[cfg(target_os = "linux")]
@@ -12,6 +14,7 @@ extern crate libc;
 #[cfg(target_os = "linux")]
 use libc::c_char;

+#[cfg(target_os = "linux")]
 use core::mem;

 #[panic_handler]
@@ -38,7 +41,7 @@ fn main() {
     }

     unsafe {
-        kernel_1(array_c);
+        kernel(array_c);
     }
     core::hint::black_box(&array_c);
     unsafe {
@@ -52,6 +55,11 @@ fn main() {
     }
 }

+#[inline(never)]
+unsafe fn kernel(x: *mut [f64; 256]) {
+    core::intrinsics::offload(kernel_1, (x,))
+}
+
 #[cfg(target_os = "linux")]
 unsafe extern "C" {
     pub fn kernel_1(array_b: *mut [f64; 256]);
@@ -60,6 +68,7 @@ unsafe extern "C" {
 #[cfg(not(target_os = "linux"))]
 #[unsafe(no_mangle)]
 #[inline(never)]
+#[rustc_offload_kernel]
 pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
     unsafe { (*x)[0] = 21.0 };
 }
@@ -76,7 +85,7 @@ rustc +offload --edition 2024 src/lib.rs -g --crate-type cdylib -C opt-level=3 -
 Now we generate the device code. Replace the target-cpu with the right code for your gpu.
 ```
-RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
+RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
 ```

 Now find the `.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs
index fac4054d1b7ff..cf303dbb15d81 100644
--- a/tests/codegen-llvm/gpu_offload/gpu_host.rs
+++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs
@@ -11,6 +11,7 @@
 // when inside of a function called main. This, too, is a temporary workaround for not having a
 // frontend.

+#![feature(core_intrinsics)] #![no_main] #[unsafe(no_mangle)] @@ -25,73 +26,70 @@ fn main() { // CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } -// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024] -// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 35] -// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0 -// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1 -// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 -// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 -// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024] +// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35] +// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0 +// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1 +// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 + +// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 // CHECK: Function Attrs: // CHECK-NEXT: define{{( dso_local)?}} void @main() // CHECK-NEXT: start: // CHECK-NEXT: %0 = alloca [8 x i8], align 8 // CHECK-NEXT: %x = alloca [1024 x i8], align 16 +// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0) +// CHECK-NEXT: store ptr %x, ptr %0, align 8 +// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x) +// CHECK-NEXT: ret void +// CHECK-NEXT: } + +// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x) +// CHECK-NEXT: start: +// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8 +// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8 // CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8 // CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8 // CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 -// CHECK: call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false) -// CHECK-NEXT: %1 = getelementptr inbounds float, ptr %x, i32 0 -// CHECK-NEXT: call void @__tgt_register_lib(ptr %EmptyDesc) +// CHECK-NEXT: call void 
@llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false) +// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc) // CHECK-NEXT: call void @__tgt_init_all_rtls() -// CHECK-NEXT: %2 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: store ptr %x, ptr %2, align 8 -// CHECK-NEXT: %3 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: store ptr %1, ptr %3, align 8 -// CHECK-NEXT: %4 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: store i64 1024, ptr %4, align 8 -// CHECK-NEXT: %5 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: %6 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: %7 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %5, ptr %6, ptr %7, ptr @.offload_maptypes.1, ptr null, ptr null) -// CHECK-NEXT: %8 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0 -// CHECK-NEXT: store i32 3, ptr %8, align 4 -// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1 -// CHECK-NEXT: store i32 1, ptr %9, align 4 -// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2 -// CHECK-NEXT: store ptr %5, ptr %10, align 8 -// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3 -// CHECK-NEXT: store ptr %6, ptr %11, align 8 -// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4 -// CHECK-NEXT: store ptr %7, ptr %12, align 8 -// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5 -// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %13, align 8 -// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6 -// CHECK-NEXT: store ptr null, ptr %14, align 8 -// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7 -// CHECK-NEXT: store ptr null, ptr %15, align 8 -// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8 -// CHECK-NEXT: store i64 0, ptr %16, align 8 -// CHECK-NEXT: %17 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9 -// CHECK-NEXT: store i64 0, ptr %17, align 8 -// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10 -// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %18, align 4 -// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11 -// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %19, align 4 -// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12 -// CHECK-NEXT: store i32 0, ptr %20, align 4 -// CHECK-NEXT: %21 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) -// CHECK-NEXT: %22 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 -// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 -// CHECK-NEXT: %24 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 -// 
CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %22, ptr %23, ptr %24, ptr @.offload_maptypes.1, ptr null, ptr null)
-// CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc)
-// CHECK: store ptr %x, ptr %0, align 8
-// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
-// CHECK: ret void
+// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
+// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
+// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
+// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
+// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
+// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
+// CHECK-NEXT: store i32 1, ptr %0, align 4
+// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
+// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8
+// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
+// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8
+// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
+// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8
+// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
+// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8
+// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
+// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
+// CHECK-NEXT: store <4 x i32> <i32 2097152, i32 0, i32 0, i32 256>, ptr %6, align 8
+// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
+// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8
+// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
+// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4
+// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
+// CHECK-NEXT: store i32 0, ptr %7, align 8
+// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)
+// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc)
+// CHECK-NEXT: ret void
 // CHECK-NEXT: }

 // CHECK: Function Attrs: nounwind
@@ -100,6 +98,12 @@ fn main() {
 #[unsafe(no_mangle)]
 #[inline(never)]
 pub fn kernel_1(x: &mut [f32; 256]) {
+    core::intrinsics::offload(_kernel_1, (x,))
+}
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn _kernel_1(x: &mut [f32; 256]) {
     for i in 0..256 {
         x[i] = 21.0;
     }
diff --git a/tests/ui/offload/check_config.fail.stderr b/tests/ui/offload/check_config.fail.stderr
new file mode 100644
index 0000000000000..a9162ed926cb0
--- /dev/null
+++ b/tests/ui/offload/check_config.fail.stderr
@@ -0,0 +1,6 @@
+error: using the offload feature requires -Z offload=Enable
+
+error: using the offload feature requires -C lto=fat
+
+error: aborting due to 2 previous errors
+
diff --git a/tests/ui/offload/check_config.rs 
b/tests/ui/offload/check_config.rs new file mode 100644 index 0000000000000..667c6d9788bae --- /dev/null +++ b/tests/ui/offload/check_config.rs @@ -0,0 +1,23 @@ +//@ revisions: pass fail +//@ no-prefer-dynamic +//@ needs-enzyme +//@[pass] build-pass +//@[fail] build-fail +//@[pass] compile-flags: -Zunstable-options -Zoffload=Enable -Clto=fat --emit=metadata +//@[fail] compile-flags: -Clto=thin + +//[fail]~? ERROR: using the offload feature requires -Z offload=Enable +//[fail]~? ERROR: using the offload feature requires -C lto=fat + +#![feature(core_intrinsics)] + +fn main() { + let mut x = [3.0; 256]; + kernel_1(&mut x); +} + +fn kernel_1(x: &mut [f32; 256]) { + core::intrinsics::offload(_kernel_1, (x,)) +} + +fn _kernel_1(x: &mut [f32; 256]) {}
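
As a cross-check of the expectations in `gpu_host.rs` above: the constants `1024` and `35` follow directly from `OffloadMetadata`. Below is a minimal standalone sketch (plain Rust, not rustc-internal code; the flag constants mirror `MappingFlags` from `offload_meta.rs`) that reproduces both values for the `&mut [f32; 256]` kernel argument:

```rust
// Flag values as defined in MappingFlags (compiler/rustc_middle/src/ty/offload_meta.rs).
const TO: u64 = 0x1; // copy host -> device
const FROM: u64 = 0x2; // copy device -> host
const TARGET_PARAM: u64 = 0x20; // base address is passed to the kernel as an argument

fn main() {
    // A mutable reference is mapped both ways (TO | FROM), and the payload
    // size is the size of the pointee, per get_payload_size.
    let payload_size = (256 * core::mem::size_of::<f32>()) as u64;
    // gen_define_handling ORs in TARGET_PARAM (the `m.mode.bits() | 0x20` above).
    let maptype = (TO | FROM) | TARGET_PARAM;

    assert_eq!(payload_size, 1024); // @.offload_sizes._kernel_1 = [i64 1024]
    assert_eq!(maptype, 35); // @.offload_maptypes._kernel_1 = [i64 35]
}
```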