Skip to content

Commit 5f6098f

Browse files
committed
[Offload] Introduce the offload sanitizer (initially for traps)
This is the first commit for a new "OffloadSanitizer" that is designed to work well on GPUs. To keep the commit small, only traps are sanitized and we only report information about the encountering thread. It is also restricted to AMD GPUs for now, though that is not a conceptual requirement. The communication between the instrumented device code and the runtime is performed via host-initialized pinned memory. If an error is detected, one encountering thread will set up this sanitizer environment and a hardware trap is executed to end the kernel. The host trap handler can check the sanitizer environment to determine if the trap was issued by the sanitizer code or not. If so, we report the reason (for now only that a trap was encountered), the encountering thread id, and the PC.
1 parent 7f19686 commit 5f6098f

File tree

18 files changed

+470
-27
lines changed

18 files changed

+470
-27
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//===- Transforms/Instrumentation/OffloadSanitizer.h ------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Pass to instrument offload code in order to detect errors and communicate
10+
// them to the LLVM/Offload runtimes.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_OFFLOADSAN_H
15+
#define LLVM_TRANSFORMS_INSTRUMENTATION_OFFLOADSAN_H
16+
17+
#include "llvm/IR/PassManager.h"
18+
19+
namespace llvm {
20+
21+
/// Module pass that instruments offload code in order to detect errors and
/// communicate them to the LLVM/Offload runtimes.
class OffloadSanitizerPass : public PassInfoMixin<OffloadSanitizerPass> {
public:
  /// Instrument the module \p M. The returned set tells the pass manager which
  /// analyses are still valid afterwards.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

  /// Mark the pass as required so that pass instrumentation (e.g. opt-bisect)
  /// does not skip it and silently drop the sanitizer instrumentation. This
  /// matches what the other sanitizer passes do.
  static bool isRequired() { return true; }
};
25+
} // end namespace llvm
26+
27+
#endif // LLVM_TRANSFORMS_INSTRUMENTATION_OFFLOADSAN_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
#include "llvm/Transforms/IPO/ExpandVariadics.h"
6161
#include "llvm/Transforms/IPO/GlobalDCE.h"
6262
#include "llvm/Transforms/IPO/Internalize.h"
63+
#include "llvm/Transforms/Instrumentation/OffloadSanitizer.h"
6364
#include "llvm/Transforms/Scalar.h"
6465
#include "llvm/Transforms/Scalar/GVN.h"
6566
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
@@ -380,6 +381,11 @@ static cl::opt<bool> EnableHipStdPar(
380381
cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
381382
cl::Hidden);
382383

384+
static cl::opt<bool>
385+
EnableOffloadSanitizer("amdgpu-enable-offload-sanitizer",
386+
cl::desc("Enable the offload sanitizer"),
387+
cl::init(false), cl::Hidden);
388+
383389
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
384390
// Register the target
385391
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -744,6 +750,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
744750

745751
PB.registerFullLinkTimeOptimizationLastEPCallback(
746752
[this](ModulePassManager &PM, OptimizationLevel Level) {
753+
if (EnableOffloadSanitizer)
754+
PM.addPass(OffloadSanitizerPass());
755+
747756
// We want to support the -lto-partitions=N option as "best effort".
748757
// For that, we need to lower LDS earlier in the pipeline before the
749758
// module is partitioned for codegen.

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ add_llvm_target(AMDGPUCodeGen
185185
Core
186186
GlobalISel
187187
HipStdPar
188+
Instrumentation
188189
IPO
189190
IRPrinter
190191
MC

llvm/lib/Transforms/Instrumentation/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ add_llvm_component_library(LLVMInstrumentation
99
MemProfiler.cpp
1010
MemorySanitizer.cpp
1111
NumericalStabilitySanitizer.cpp
12+
OffloadSanitizer.cpp
1213
IndirectCallPromotion.cpp
1314
Instrumentation.cpp
1415
InstrOrderFile.cpp
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
//===-- OffloadSanitizer.cpp - Offload sanitizer --------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "llvm/Transforms/Instrumentation/OffloadSanitizer.h"
12+
13+
#include "llvm/ADT/SetVector.h"
14+
#include "llvm/ADT/SmallVector.h"
15+
#include "llvm/IR/DebugInfoMetadata.h"
16+
#include "llvm/IR/DerivedTypes.h"
17+
#include "llvm/IR/IRBuilder.h"
18+
#include "llvm/IR/InstIterator.h"
19+
#include "llvm/IR/Instructions.h"
20+
#include "llvm/IR/IntrinsicInst.h"
21+
#include "llvm/IR/Intrinsics.h"
22+
#include "llvm/IR/IntrinsicsAMDGPU.h"
23+
#include "llvm/IR/Module.h"
24+
#include "llvm/IR/Value.h"
25+
#include "llvm/Transforms/Utils/Cloning.h"
26+
#include "llvm/Transforms/Utils/ModuleUtils.h"
27+
28+
using namespace llvm;
29+
30+
#define DEBUG_TYPE "offload-sanitizer"
31+
32+
namespace {
33+
34+
class OffloadSanitizerImpl final {
35+
public:
36+
OffloadSanitizerImpl(Module &M, FunctionAnalysisManager &FAM)
37+
: M(M), FAM(FAM), Ctx(M.getContext()) {}
38+
39+
bool instrument();
40+
41+
private:
42+
bool shouldInstrumentFunction(Function &Fn);
43+
bool instrumentFunction(Function &Fn);
44+
bool instrumentTrapInstructions(SmallVectorImpl<IntrinsicInst *> &TrapCalls);
45+
46+
FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
47+
ArrayRef<Type *> ArgTys) {
48+
if (!FC) {
49+
auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false);
50+
FC = M.getOrInsertFunction(Name, NewAllocationFnTy);
51+
}
52+
return FC;
53+
}
54+
55+
/// void __offload_san_trap_info(Int64Ty);
56+
FunctionCallee TrapInfoFn;
57+
FunctionCallee getTrapInfoFn() {
58+
return getOrCreateFn(TrapInfoFn, "__offload_san_trap_info", VoidTy,
59+
{/*PC*/ Int64Ty});
60+
}
61+
62+
CallInst *createCall(IRBuilder<> &IRB, FunctionCallee Callee,
63+
ArrayRef<Value *> Args = std::nullopt,
64+
const Twine &Name = "") {
65+
Calls.push_back(IRB.CreateCall(Callee, Args, Name));
66+
return Calls.back();
67+
}
68+
SmallVector<CallInst *> Calls;
69+
70+
Value *getPC(IRBuilder<> &IRB) {
71+
return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr,
72+
"PC");
73+
}
74+
75+
Module &M;
76+
FunctionAnalysisManager &FAM;
77+
LLVMContext &Ctx;
78+
79+
Type *VoidTy = Type::getVoidTy(Ctx);
80+
Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx);
81+
PointerType *PtrTy = PointerType::getUnqual(Ctx);
82+
IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
83+
IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
84+
IntegerType *Int64Ty = Type::getInt64Ty(Ctx);
85+
86+
const DataLayout &DL = M.getDataLayout();
87+
};
88+
89+
} // end anonymous namespace
90+
91+
/// Decide whether \p Fn is instrumented. Declarations are skipped, as are
/// functions whose name contains "ompx" or "__kmpc" or starts with "rpc_", and
/// functions carrying the disable_sanitizer_instrumentation attribute.
bool OffloadSanitizerImpl::shouldInstrumentFunction(Function &Fn) {
  if (Fn.isDeclaration())
    return false;

  const StringRef Name = Fn.getName();
  const bool HasExcludedName = Name.contains("ompx") ||
                               Name.contains("__kmpc") ||
                               Name.starts_with("rpc_");
  if (HasExcludedName)
    return false;

  const bool OptedOut =
      Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation);
  return !OptedOut;
}
99+
100+
/// Insert a call to `__offload_san_trap_info(PC)` directly in front of every
/// trap intrinsic in \p TrapCalls.
///
/// \returns true if at least one call was inserted, i.e. the IR was modified.
/// The previous version always returned false, so the pass could report that
/// all analyses were preserved even though it had changed the module.
bool OffloadSanitizerImpl::instrumentTrapInstructions(
    SmallVectorImpl<IntrinsicInst *> &TrapCalls) {
  for (IntrinsicInst *II : TrapCalls) {
    // The builder inserts before the trap, so the info call runs first.
    IRBuilder<> IRB(II);
    createCall(IRB, getTrapInfoFn(), {getPC(IRB)});
  }
  // One call is emitted per collected trap; a non-empty list means a change.
  return !TrapCalls.empty();
}
109+
110+
/// Instrument a single function: gather all `llvm.trap` intrinsic calls of
/// \p Fn and hand them to instrumentTrapInstructions. Functions rejected by
/// shouldInstrumentFunction are left alone.
bool OffloadSanitizerImpl::instrumentFunction(Function &Fn) {
  if (!shouldInstrumentFunction(Fn))
    return false;

  // Collect first and instrument afterwards so that no instructions are
  // inserted while the function is being iterated.
  SmallVector<IntrinsicInst *> TrapCalls;
  for (Instruction &I : instructions(Fn)) {
    auto *II = dyn_cast<IntrinsicInst>(&I);
    if (II && II->getIntrinsicID() == Intrinsic::trap)
      TrapCalls.push_back(II);
  }

  return instrumentTrapInstructions(TrapCalls);
}
135+
136+
bool OffloadSanitizerImpl::instrument() {
137+
bool Changed = false;
138+
139+
for (Function &Fn : M)
140+
Changed |= instrumentFunction(Fn);
141+
142+
removeFromUsedLists(M, [&](Constant *C) {
143+
if (!C->getName().starts_with("__offload_san"))
144+
return false;
145+
return Changed = true;
146+
});
147+
148+
return Changed;
149+
}
150+
151+
/// New pass manager entry point: run the offload sanitizer on \p M. All
/// analyses are reported as preserved when the implementation signals no
/// change, otherwise none are.
PreservedAnalyses OffloadSanitizerPass::run(Module &M,
                                            ModuleAnalysisManager &AM) {
  auto &FAMProxy = AM.getResult<FunctionAnalysisManagerModuleProxy>(M);
  FunctionAnalysisManager &FAM = FAMProxy.getManager();

  OffloadSanitizerImpl Impl(M, FAM);
  const bool Changed = Impl.instrument();
  if (Changed) {
    LLVM_DEBUG(M.dump());
    return PreservedAnalyses::none();
  }
  return PreservedAnalyses::all();
}

offload/DeviceRTL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ set(src_files
9494
${source_directory}/Misc.cpp
9595
${source_directory}/Parallelism.cpp
9696
${source_directory}/Reduction.cpp
97+
${source_directory}/Sanitizer.cpp
9798
${source_directory}/State.cpp
9899
${source_directory}/Synchronization.cpp
99100
${source_directory}/Tasking.cpp

offload/DeviceRTL/include/Utils.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
2929

3030
uint64_t ballotSync(uint64_t Mask, int32_t Pred);
3131

32+
/// Terminate the execution of this warp.
33+
void terminateWarp();
34+
3235
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
3336
uint64_t pack(uint32_t LowBits, uint32_t HighBits);
3437

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//===------ Sanitizer.cpp - Offload sanitizer device runtime --------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "Mapping.h"
12+
#include "Shared/Environment.h"
13+
#include "Synchronization.h"
14+
#include "Types.h"
15+
#include "Utils.h"
16+
17+
using namespace ompx;
18+
19+
#define _SAN_ATTRS \
20+
[[clang::disable_sanitizer_instrumentation, gnu::used, gnu::retain]]
21+
#define _SAN_ENTRY_ATTRS [[gnu::flatten, gnu::always_inline]] _SAN_ATTRS
22+
23+
#pragma omp begin declare target device_type(nohost)
24+
25+
[[gnu::visibility("protected")]] _SAN_ATTRS SanitizerEnvironmentTy
26+
*__sanitizer_environment_ptr;
27+
28+
namespace {
29+
30+
/// Helper to lock the sanitizer environment. While we never unlock it, this
31+
/// allows us to have a no-op "side effect" in the spin-wait function below.
32+
_SAN_ATTRS bool
33+
getSanitizerEnvironmentLock(SanitizerEnvironmentTy &SE,
34+
SanitizerEnvironmentTy::ErrorCodeTy ErrorCode) {
35+
return atomic::cas(SE.getErrorCodeLocation(), SanitizerEnvironmentTy::NONE,
36+
ErrorCode, atomic::OrderingTy::seq_cst,
37+
atomic::OrderingTy::seq_cst);
38+
}
39+
40+
/// The spin-wait function should not be inlined, it's a catch all to give one
41+
/// thread time to setup the sanitizer environment.
42+
[[clang::noinline]] _SAN_ATTRS void spinWait(SanitizerEnvironmentTy &SE) {
43+
while (!atomic::load(&SE.IsInitialized, atomic::OrderingTy::aquire))
44+
;
45+
__builtin_trap();
46+
}
47+
48+
/// Record the location of the error in \p SE: the three-dimensional thread
/// and block ids of the calling thread as well as \p PC. Afterwards the
/// `IsInitialized` flag is set, which releases the threads in spinWait.
_SAN_ATTRS
void setLocation(SanitizerEnvironmentTy &SE, uint64_t PC) {
  for (int Dim = 0; Dim < 3; ++Dim) {
    const auto ThreadIdInDim = mapping::getThreadIdInBlock(Dim);
    const auto BlockIdInDim = mapping::getBlockIdInKernel(Dim);
    SE.ThreadId[Dim] = ThreadIdInDim;
    SE.BlockId[Dim] = BlockIdInDim;
  }
  SE.PC = PC;

  // Everything is written; flush the stores above before publishing the flag
  // that lets the waiting threads proceed to the trap.
  fence::system(atomic::OrderingTy::release);
  atomic::store(&SE.IsInitialized, 1, atomic::OrderingTy::release);
}
61+
62+
/// Report \p ErrorCode at \p PC through the global sanitizer environment and
/// end execution. Only a thread that obtains the environment lock writes the
/// location; every thread that is not terminated early ends up in spinWait.
_SAN_ATTRS
void raiseExecutionError(SanitizerEnvironmentTy::ErrorCodeTy ErrorCode,
                         uint64_t PC) {
  SanitizerEnvironmentTy &Env = *__sanitizer_environment_ptr;
  const bool OwnsEnv = getSanitizerEnvironmentLock(Env, ErrorCode);

  // If the ballot shows that no thread of this warp holds the lock, end the
  // execution of the warp gracefully.
  if (!utils::ballotSync(lanes::All, OwnsEnv))
    utils::terminateWarp();

  // The lock holder records the location information and thereby signals
  // that the actual trap can be executed now.
  if (OwnsEnv)
    setLocation(Env, PC);

  synchronize::warp(lanes::All);

  // All threads, the lock holder included, wait until the environment is
  // marked as initialized; spinWait then executes the trap.
  spinWait(Env);
}
85+
86+
} // namespace
87+
88+
extern "C" {
89+
90+
/// Runtime entry point called by instrumented code in front of a trap;
/// \p PC is the program counter passed by the instrumentation. Raises an
/// execution error with the TRAP error code.
_SAN_ENTRY_ATTRS void __offload_san_trap_info(uint64_t PC) {
  const SanitizerEnvironmentTy::ErrorCodeTy Code = SanitizerEnvironmentTy::TRAP;
  raiseExecutionError(Code, PC);
}
93+
}
94+
95+
#pragma omp end declare target

offload/DeviceRTL/src/Utils.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
3838
int32_t Width);
3939

4040
uint64_t ballotSync(uint64_t Mask, int32_t Pred);
41+
void terminateWarp();
4142

4243
/// AMDGCN Implementation
4344
///
@@ -63,6 +64,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
6364
return Mask & __builtin_amdgcn_ballot_w64(Pred);
6465
}
6566

67+
void terminateWarp() { __builtin_amdgcn_endpgm(); }
68+
6669
bool isSharedMemPtr(const void *Ptr) {
6770
return __builtin_amdgcn_is_shared(
6871
(const __attribute__((address_space(0))) void *)Ptr);
@@ -90,6 +93,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
9093
return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
9194
}
9295

96+
void terminateWarp() { __nvvm_exit(); }
97+
9398
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
9499

95100
#pragma omp end declare variant
@@ -126,6 +131,8 @@ uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
126131
return impl::ballotSync(Mask, Pred);
127132
}
128133

134+
void utils::terminateWarp() { return impl::terminateWarp(); }
135+
129136
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
130137

131138
extern "C" {

0 commit comments

Comments
 (0)