Skip to content

Commit 3540edb

Browse files
committed
Add mapping bitflags and general cleanup
1 parent 10ef5bd commit 3540edb

File tree

7 files changed: +80 additions, −107 deletions

compiler/rustc_codegen_llvm/src/back/lto.rs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use crate::back::write::{
2626
};
2727
use crate::errors::{LlvmError, LtoBitcodeFromRlib};
2828
use crate::llvm::{self, build_string};
29-
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx};
29+
use crate::{LlvmCodegenBackend, ModuleLlvm};
3030

3131
/// We keep track of the computed LTO cache keys from the previous
3232
/// session to determine which CGUs we can reuse.
@@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager(
601601
// We then run the llvm_optimize function a second time, to optimize the code which we generated
602602
// in the enzyme differentiation pass.
603603
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
604-
let enable_gpu = config.offload.contains(&config::Offload::Enable);
605604
let stage = if thin {
606605
write::AutodiffStage::PreAD
607606
} else {
@@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager(
616615
write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage);
617616
}
618617

619-
// Here we only handle the GPU host (=cpu) code.
620-
if enable_gpu && !thin && !cgcx.target_is_like_gpu {
621-
let cx =
622-
SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
623-
crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
624-
}
625-
626618
if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
627619
let opt_stage = llvm::OptStage::FatLTO;
628620
let stage = write::AutodiffStage::PostAD;

compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,13 @@ use std::ffi::CString;
22

33
use llvm::Linkage::*;
44
use rustc_abi::Align;
5-
use rustc_codegen_ssa::back::write::CodegenContext;
65
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
76
use rustc_middle::ty::offload_meta::OffloadMetadata;
8-
use rustc_middle::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
97

108
use crate::builder::SBuilder;
119
use crate::llvm::AttributePlace::Function;
1210
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
13-
use crate::{LlvmCodegenBackend, SimpleCx, attributes};
14-
15-
pub(crate) fn handle_gpu_code<'ll>(
16-
_cgcx: &CodegenContext<LlvmCodegenBackend>,
17-
cx: &'ll SimpleCx<'_>,
18-
) {
19-
/*
20-
// The offload memory transfer type for each kernel
21-
let mut memtransfer_types = vec![];
22-
let mut region_ids = vec![];
23-
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
24-
// This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
25-
// There is a draft PR in progress which will introduce a proper offload intrinsic to remove
26-
// this limitation.
27-
for num in 0..9 {
28-
let kernel = cx.get_function(&format!("kernel_{num}"));
29-
if let Some(kernel) = kernel {
30-
let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num);
31-
memtransfer_types.push(o);
32-
region_ids.push(k);
33-
}
34-
}
35-
36-
gen_call_handling(&cx, &memtransfer_types, &region_ids);
37-
*/
38-
}
11+
use crate::{SimpleCx, attributes};
3912

4013
// ; Function Attrs: nounwind
4114
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
@@ -273,13 +246,10 @@ pub(crate) fn gen_define_handling<'ll>(
273246
_ => None,
274247
});
275248

249+
// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
276250
let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
277-
ptr_meta.map(|m| (m.payload_size, m.mode as u64 | 0x20)).unzip();
251+
ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
278252

279-
// We do not know their size anymore at this level, so hardcode a placeholder.
280-
// A follow-up pr will track these from the frontend, where we still have Rust types.
281-
// Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
282-
// I decided that 1024 bytes is a great placeholder value for now.
283253
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
284254
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
285255
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
@@ -305,7 +275,6 @@ pub(crate) fn gen_define_handling<'ll>(
305275
llvm::set_alignment(llglobal, Align::ONE);
306276
llvm::set_section(llglobal, c".llvm.rodata.offloading");
307277

308-
// Not actively used yet, for calling real kernels
309278
let name = format!(".offloading.entry.{symbol}");
310279

311280
// See the __tgt_offload_entry documentation above.
@@ -340,8 +309,7 @@ fn declare_offload_fn<'ll>(
340309
}
341310

342311
// For each kernel *call*, we now use some of our previous declared globals to move data to and from
343-
// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
344-
// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
312+
// the gpu. For now, we only handle the data transfer part of it.
345313
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
346314
// Since in our frontend users (by default) don't have to specify data transfer, this is something
347315
// we should optimize in the future! We also assume that everything should be copied back and forth,
@@ -383,6 +351,7 @@ pub(crate) fn gen_call_handling<'ll>(
383351

384352
let mut builder = SBuilder::build(cx, bb);
385353

354+
// prevent these globals from being optimized away
386355
for val in [offload_sizes, offload_entry] {
387356
unsafe {
388357
let dummy = llvm::LLVMBuildLoad2(
@@ -447,8 +416,6 @@ pub(crate) fn gen_call_handling<'ll>(
447416
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
448417
builder.store(geps[i as usize], gep2, Align::EIGHT);
449418
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
450-
// As mentioned above, we don't use Rust type information yet. So for now we will just
451-
// assume that we have 1024 bytes, 256 f32 values.
452419
// FIXME(offload): write an offload frontend and handle arbitrary types.
453420
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
454421
}

compiler/rustc_codegen_llvm/src/intrinsic.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use rustc_symbol_mangling::{mangle_internal_symbol, symbol_name_for_instance_in_
2121
use rustc_target::callconv::PassMode;
2222
use tracing::debug;
2323

24-
use crate::abi::{FnAbiLlvmExt, LlvmType};
24+
use crate::abi::FnAbiLlvmExt;
2525
use crate::builder::Builder;
2626
use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
2727
use crate::builder::gpu_offload::TgtOffloadEntry;
@@ -198,6 +198,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
198198
return Ok(());
199199
}
200200
sym::offload => {
201+
// FIXME(Sa4dUs): emit error when offload is not enabled
201202
codegen_offload(self, tcx, instance, args);
202203
return Ok(());
203204
}
@@ -1265,15 +1266,13 @@ fn codegen_offload<'ll, 'tcx>(
12651266

12661267
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
12671268

1268-
// Build TypeTree (or something similar)
12691269
let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder();
12701270
let inputs = sig.inputs();
12711271

12721272
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
12731273

12741274
let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
12751275

1276-
// TODO(Sa4dUs): separate globals from call-independent headers and use typetrees to reserve the correct amount of memory
12771276
let (offload_sizes, memtransfer_types, region_id, offload_entry) =
12781277
crate::builder::gpu_offload::gen_define_handling(
12791278
cx,
@@ -1283,7 +1282,6 @@ fn codegen_offload<'ll, 'tcx>(
12831282
&target_symbol,
12841283
);
12851284

1286-
// TODO(Sa4dUs): this is just to a void lifetime's issues
12871285
let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) };
12881286
crate::builder::gpu_offload::gen_call_handling(
12891287
cx,

compiler/rustc_codegen_llvm/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
//!
55
//! This API is completely unstable and subject to change.
66
7-
// TODO(Sa4dUs): remove this once we have a great version, just to ignore unused LLVM wrappers
8-
#![allow(unused)]
97
// tidy-alphabetical-start
108
#![feature(assert_matches)]
119
#![feature(extern_types)]

compiler/rustc_codegen_llvm/src/llvm/ffi.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,14 +1157,9 @@ unsafe extern "C" {
11571157
) -> &'a BasicBlock;
11581158

11591159
// Operations on instructions
1160-
pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
1161-
pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
11621160
pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
11631161
pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
11641162
pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
1165-
pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>;
1166-
pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value);
1167-
pub(crate) fn LLVMGetNumOperands(Val: &Value) -> c_uint;
11681163

11691164
// Operations on call sites
11701165
pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint);
@@ -2450,7 +2445,6 @@ unsafe extern "C" {
24502445

24512446
pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine);
24522447

2453-
pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
24542448
pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
24552449

24562450
pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
Lines changed: 66 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,64 @@
1+
use bitflags::bitflags;
2+
13
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
24

3-
// TODO(Sa4dUs): it doesn't feel correct for me to place this on `rustc_ast::expand`, will look for a proper location
45
pub struct OffloadMetadata {
56
pub payload_size: u64,
6-
pub mode: TransferKind,
7+
pub mode: MappingFlags,
78
}
89

9-
// TODO(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` flag only when needed
10-
#[repr(u64)]
11-
#[derive(Debug, Copy, Clone)]
12-
pub enum TransferKind {
13-
FromGpu = 1,
14-
ToGpu = 2,
15-
Both = 1 + 2,
10+
bitflags! {
11+
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
12+
#[derive(Debug, Copy, Clone)]
13+
#[repr(transparent)]
14+
pub struct MappingFlags: u64 {
15+
/// No flags.
16+
const NONE = 0x0;
17+
/// Allocate memory on the device and move data from host to device.
18+
const TO = 0x01;
19+
/// Allocate memory on the device and move data from device to host.
20+
const FROM = 0x02;
21+
/// Always perform the requested mapping action, even if already mapped.
22+
const ALWAYS = 0x04;
23+
/// Delete the element from the device environment, ignoring ref count.
24+
const DELETE = 0x08;
25+
/// The element being mapped is a pointer-pointee pair.
26+
const PTR_AND_OBJ = 0x10;
27+
/// The base address should be passed to the target kernel as argument.
28+
const TARGET_PARAM = 0x20;
29+
/// The runtime must return the device pointer.
30+
const RETURN_PARAM = 0x40;
31+
/// The reference being passed is a pointer to private data.
32+
const PRIVATE = 0x80;
33+
/// Pass the element by value.
34+
const LITERAL = 0x100;
35+
/// Implicit map (generated by compiler, not explicit in code).
36+
const IMPLICIT = 0x200;
37+
/// Hint to allocate memory close to the target device.
38+
const CLOSE = 0x400;
39+
/// Reserved (0x800 in OpenMP for XLC compatibility).
40+
const RESERVED = 0x800;
41+
/// Require that the data is already allocated on the device.
42+
const PRESENT = 0x1000;
43+
/// Increment/decrement a separate ref counter (OpenACC compatibility).
44+
const OMPX_HOLD = 0x2000;
45+
/// Used for non-contiguous list items in target update.
46+
const NON_CONTIG = 0x100000000000;
47+
/// 16 MSBs indicate membership in a struct.
48+
const MEMBER_OF = 0xffff000000000000;
49+
}
1650
}
1751

1852
impl OffloadMetadata {
19-
pub fn new(payload_size: u64, mode: TransferKind) -> Self {
20-
OffloadMetadata { payload_size, mode }
21-
}
22-
2353
pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
2454
OffloadMetadata {
2555
payload_size: get_payload_size(tcx, ty),
26-
mode: TransferKind::from_ty(tcx, ty),
56+
mode: MappingFlags::from_ty(tcx, ty),
2757
}
2858
}
2959
}
3060

31-
// TODO(Sa4dUs): WIP, rn we just have a naive logic for references
61+
// FIXME(Sa4dUs): implement a solid logic to determine the payload size
3262
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
3363
match ty.kind() {
3464
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
@@ -43,48 +73,42 @@ fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
4373
}
4474
}
4575

46-
impl TransferKind {
47-
pub fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
48-
// TODO(Sa4dUs): this logic is probs not fully correct, but it works for now
76+
impl MappingFlags {
77+
fn from_ty<'tcx>(_tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
78+
use rustc_ast::Mutability::*;
79+
4980
match ty.kind() {
5081
ty::Bool
5182
| ty::Char
5283
| ty::Int(_)
5384
| ty::Uint(_)
54-
| ty::Float(_) => TransferKind::ToGpu,
55-
56-
ty::Adt(_, _)
85+
| ty::Float(_)
86+
| ty::Adt(_, _)
5787
| ty::Tuple(_)
58-
| ty::Array(_, _) => TransferKind::ToGpu,
59-
60-
ty::RawPtr(_, rustc_ast::Mutability::Not)
61-
| ty::Ref(_, _, rustc_ast::Mutability::Not) => TransferKind::ToGpu,
62-
63-
ty::RawPtr(_, rustc_ast::Mutability::Mut)
64-
| ty::Ref(_, _, rustc_ast::Mutability::Mut) => TransferKind::Both,
65-
66-
ty::Slice(_)
67-
| ty::Str
68-
| ty::Dynamic(_, _) => TransferKind::Both,
69-
70-
ty::FnDef(_, _)
88+
| ty::Array(_, _)
89+
| ty::FnDef(_, _)
7190
| ty::FnPtr(_, _)
7291
| ty::Closure(_, _)
7392
| ty::CoroutineClosure(_, _)
7493
| ty::Coroutine(_, _)
75-
| ty::CoroutineWitness(_, _) => TransferKind::ToGpu,
76-
77-
ty::Alias(_, _)
94+
| ty::CoroutineWitness(_, _)
95+
| ty::Never
96+
| ty::Alias(_, _)
7897
| ty::Param(_)
7998
| ty::Bound(_, _)
8099
| ty::Placeholder(_)
81100
| ty::Infer(_)
82-
| ty::Error(_) => TransferKind::ToGpu,
101+
| ty::Error(_) => MappingFlags::TO,
102+
103+
ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO,
104+
105+
ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM,
106+
107+
ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM,
83108

84-
ty::Never => TransferKind::ToGpu,
85-
ty::Foreign(_) => TransferKind::Both,
86-
ty::Pat(_, _) => TransferKind::Both,
87-
ty::UnsafeBinder(_) => TransferKind::Both,
109+
ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => {
110+
MappingFlags::TO | MappingFlags::FROM
111+
}
88112
}
89113
}
90114
}

tests/codegen-llvm/gpu_offload/gpu_host.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ fn main() {
2828

2929
// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024]
3030
// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]
31-
// CHECK: @._kernel_1.region_id = weak unnamed_addr constant i8 0
31+
// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0
3232
// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1
33-
// CHECK: @.offloading.entry._kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
33+
// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
3434

3535
// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
3636
// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8
@@ -80,10 +80,10 @@ fn main() {
8080
// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
8181
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
8282
// CHECK-NEXT: store <4 x i32> <i32 2097152, i32 0, i32 0, i32 256>, ptr %6, align 8
83-
// CHECK-NEXT: %.fca.1.gep2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
84-
// CHECK-NEXT: store i32 0, ptr %.fca.1.gep2, align 8
85-
// CHECK-NEXT: %.fca.2.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
86-
// CHECK-NEXT: store i32 0, ptr %.fca.2.gep3, align 4
83+
// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
84+
// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8
85+
// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
86+
// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4
8787
// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
8888
// CHECK-NEXT: store i32 0, ptr %7, align 8
8989
// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)

0 commit comments

Comments (0)