
Commit 60f9156

[CUDA][HIP] capture possible ODR-used var (llvm#136645) (llvm#3443)
2 parents 1ab8080 + 3a0e77d commit 60f9156
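For context, the problematic pattern is a __host__ __device__ lambda that passes a local constexpr variable to an overload set whose host and device candidates disagree about by-value versus by-reference passing; without recording the variable as potentially ODR-used, the host and device compilations can make different capture decisions and end up with different closure layouts. A minimal sketch of the pattern, condensed from the new test in this commit (it assumes the __device__/__host__/__global__ macros and <<< >>> support from the test's Inputs/cuda.h):

__device__ float fun(float x, float y) { return x; }       // device: by value
float fun(const float &x, const float &y) { return x; }    // host: by reference

template <typename F> __global__ void kernel(F f) { f(1); }

void test(const float *fl, const float *A, float *Vf) {
  constexpr float small = 1.0e-25f;
  // The device-side call copies `small`, so on its own it would not odr-use
  // it; the host-side call binds a reference, which does. This commit records
  // `small` so both sides capture it and the closure layout matches.
  auto lambda = [=] __device__ __host__ (unsigned int n) {
    Vf[0] = fun(small, fl[0]) * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}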

File tree

6 files changed (+212 / -3 lines)


clang/include/clang/Sema/ScopeInfo.h

Lines changed: 3 additions & 0 deletions
@@ -949,6 +949,9 @@ class LambdaScopeInfo final :
 
   SourceLocation PotentialThisCaptureLocation;
 
+  /// Variables that are potentially ODR-used in CUDA/HIP.
+  llvm::SmallPtrSet<VarDecl *, 4> CUDAPotentialODRUsedVars;
+
   LambdaScopeInfo(DiagnosticsEngine &Diag)
       : CapturingScopeInfo(Diag, ImpCap_None) {
     Kind = SK_Lambda;

clang/include/clang/Sema/SemaCUDA.h

Lines changed: 4 additions & 0 deletions
@@ -274,6 +274,10 @@ class SemaCUDA : public SemaBase {
   /// parameters specified via <<<>>>.
   std::string getConfigureFuncName() const;
 
+  /// Record variables that are potentially ODR-used in CUDA/HIP.
+  void recordPotentialODRUsedVariable(MultiExprArg Args,
+                                      OverloadCandidateSet &CandidateSet);
+
 private:
   unsigned ForceHostDeviceDepth = 0;

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 47 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/Overload.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/Template.h"
@@ -1081,3 +1082,49 @@ std::string SemaCUDA::getConfigureFuncName() const {
   // Legacy CUDA kernel configuration call
   return "cudaConfigureCall";
 }
+
+// Record any local constexpr variables that are passed one way on the host
+// and another on the device.
+void SemaCUDA::recordPotentialODRUsedVariable(
+    MultiExprArg Arguments, OverloadCandidateSet &Candidates) {
+  sema::LambdaScopeInfo *LambdaInfo = SemaRef.getCurLambda();
+  if (!LambdaInfo)
+    return;
+
+  for (unsigned I = 0; I < Arguments.size(); ++I) {
+    auto *DeclRef = dyn_cast<DeclRefExpr>(Arguments[I]);
+    if (!DeclRef)
+      continue;
+    auto *Variable = dyn_cast<VarDecl>(DeclRef->getDecl());
+    if (!Variable || !Variable->isLocalVarDecl() || !Variable->isConstexpr())
+      continue;
+
+    bool HostByValue = false, HostByRef = false;
+    bool DeviceByValue = false, DeviceByRef = false;
+
+    for (OverloadCandidate &Candidate : Candidates) {
+      FunctionDecl *Callee = Candidate.Function;
+      if (!Callee || I >= Callee->getNumParams())
+        continue;
+
+      CUDAFunctionTarget Target = IdentifyTarget(Callee);
+      if (Target == CUDAFunctionTarget::InvalidTarget ||
+          Target == CUDAFunctionTarget::Global)
+        continue;
+
+      bool CoversHost = (Target == CUDAFunctionTarget::Host ||
+                         Target == CUDAFunctionTarget::HostDevice);
+      bool CoversDevice = (Target == CUDAFunctionTarget::Device ||
+                           Target == CUDAFunctionTarget::HostDevice);
+
+      bool IsRef = Callee->getParamDecl(I)->getType()->isReferenceType();
+      HostByValue |= CoversHost && !IsRef;
+      HostByRef |= CoversHost && IsRef;
+      DeviceByValue |= CoversDevice && !IsRef;
+      DeviceByRef |= CoversDevice && IsRef;
+    }
+
+    if ((HostByValue && DeviceByRef) || (HostByRef && DeviceByValue))
+      LambdaInfo->CUDAPotentialODRUsedVars.insert(Variable);
+  }
+}
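Measured against the overload pairs in the new test, only the mixed shapes trip that final condition. A minimal, declaration-only sketch of the two outcomes, assuming the __device__ macro from the test's Inputs/cuda.h (the namespace names here are illustrative):

namespace Mixed {                               // like DevByVal/HostByVal in the test
float fun(const float &x, const float &y);      // host overload binds a reference
__device__ float fun(float x, float y);         // device overload takes a copy
// A local constexpr argument to fun(...) is recorded
// (HostByRef && DeviceByValue above, or the symmetric case).
}

namespace Agreeing {                            // like BothByVal/NeitherByVal in the test
float fun(float x, float y);                    // host overload takes a copy
__device__ float fun(float x, float y);         // device overload takes a copy
// Not recorded: both targets see the argument the same way, so the usual
// odr-use rules already make the same capture decision on host and device.
}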

clang/lib/Sema/SemaExpr.cpp

Lines changed: 21 additions & 3 deletions
@@ -19423,11 +19423,29 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E,
     return false;
   };
 
+  // Check whether this expression may be odr-used in CUDA/HIP.
+  auto MaybeCUDAODRUsed = [&]() -> bool {
+    if (!S.LangOpts.CUDA)
+      return false;
+    LambdaScopeInfo *LSI = S.getCurLambda();
+    if (!LSI)
+      return false;
+    auto *DRE = dyn_cast<DeclRefExpr>(E);
+    if (!DRE)
+      return false;
+    auto *VD = dyn_cast<VarDecl>(DRE->getDecl());
+    if (!VD)
+      return false;
+    return LSI->CUDAPotentialODRUsedVars.count(VD);
+  };
+
   // Mark that this expression does not constitute an odr-use.
   auto MarkNotOdrUsed = [&] {
-    S.MaybeODRUseExprs.remove(E);
-    if (LambdaScopeInfo *LSI = S.getCurLambda())
-      LSI->markVariableExprAsNonODRUsed(E);
+    if (!MaybeCUDAODRUsed()) {
+      S.MaybeODRUseExprs.remove(E);
+      if (LambdaScopeInfo *LSI = S.getCurLambda())
+        LSI->markVariableExprAsNonODRUsed(E);
+    }
   };
 
   // C++2a [basic.def.odr]p2:
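The net effect: a variable recorded in CUDAPotentialODRUsedVars keeps its potential odr-use inside the lambda, so the implicit [=] capture actually materializes it in the closure on both the host and device passes. Continuing the sketch from the commit summary above (the init-capture form is only an approximation of the resulting capture, not something users need to write):

auto lambda = [=] __device__ __host__ (unsigned int n) {
  // `small` stays captured by copy, roughly as if the capture list had been
  // written [=, small = small], so both compilation sides agree on the
  // closure's layout and read the value from the captured copy.
  Vf[0] = fun(small, fl[0]);
};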

clang/lib/Sema/SemaOverload.cpp

Lines changed: 2 additions & 0 deletions
@@ -14270,6 +14270,8 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn,
   // the UnresolvedLookupExpr was type-dependent.
   if (OverloadResult == OR_Success) {
     const FunctionDecl *FDecl = Best->Function;
+    if (LangOpts.CUDA)
+      CUDA().recordPotentialODRUsedVariable(Args, CandidateSet);
     if (FDecl && FDecl->isTemplateInstantiation() &&
         FDecl->getReturnType()->isUndeducedType()) {
       if (const auto *TP =
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple x86_64-linux-gnu \
// RUN:   | FileCheck -check-prefixes=CHECK,HOST %s
// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple amdgcn-amd-amdhsa -fcuda-is-device \
// RUN:   | FileCheck -check-prefixes=CHECK,DEV %s

#include "Inputs/cuda.h"

// CHECK: %class.anon = type { ptr, float, ptr, ptr }
// CHECK: %class.anon.0 = type { ptr, float, ptr, ptr }
// CHECK: %class.anon.1 = type { ptr, ptr, ptr }
// CHECK: %class.anon.2 = type { ptr, float, ptr, ptr }

// HOST: call void @_ZN8DevByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon)
// DEV: define amdgpu_kernel void @_ZN8DevByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon)

// Only the device function passes arguments by value.
namespace DevByVal {
__device__ float fun(float x, float y) {
  return x;
}

float fun(const float &x, const float &y) {
  return x;
}

template<typename F>
void __global__ kernel(F f)
{
  f(1);
}

void test(float const * fl, float const * A, float * Vf)
{
  float constexpr small(1.0e-25);

  auto lambda = [=] __device__ __host__ (unsigned int n) {
    float const value = fun(small, fl[0]);
    Vf[0] = value * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}
}

// HOST: call void @_ZN9HostByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.0)
// DEV: define amdgpu_kernel void @_ZN9HostByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.0)

// Only the host function passes arguments by value.
namespace HostByVal {
float fun(float x, float y) {
  return x;
}

__device__ float fun(const float &x, const float &y) {
  return x;
}

template<typename F>
void __global__ kernel(F f)
{
  f(1);
}

void test(float const * fl, float const * A, float * Vf)
{
  float constexpr small(1.0e-25);

  auto lambda = [=] __device__ __host__ (unsigned int n) {
    float const value = fun(small, fl[0]);
    Vf[0] = value * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}
}

// HOST: call void @_ZN9BothByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.1)
// DEV: define amdgpu_kernel void @_ZN9BothByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.1)

// Both the host and device functions pass arguments by value.
namespace BothByVal {
float fun(float x, float y) {
  return x;
}

__device__ float fun(float x, float y) {
  return x;
}

template<typename F>
void __global__ kernel(F f)
{
  f(1);
}

void test(float const * fl, float const * A, float * Vf)
{
  float constexpr small(1.0e-25);

  auto lambda = [=] __device__ __host__ (unsigned int n) {
    float const value = fun(small, fl[0]);
    Vf[0] = value * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}
}

// HOST: call void @_ZN12NeitherByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.2)
// DEV: define amdgpu_kernel void @_ZN12NeitherByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.2)

// Neither the host nor the device function passes arguments by value.
namespace NeitherByVal {
float fun(const float& x, const float& y) {
  return x;
}

__device__ float fun(const float& x, const float& y) {
  return x;
}

template<typename F>
void __global__ kernel(F f)
{
  f(1);
}

void test(float const * fl, float const * A, float * Vf)
{
  float constexpr small(1.0e-25);

  auto lambda = [=] __device__ __host__ (unsigned int n) {
    float const value = fun(small, fl[0]);
    Vf[0] = value * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}
}
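Taken together, the CHECK lines verify the capture behavior this commit is after: the closures for DevByVal, HostByVal, and NeitherByVal each carry a float slot for the captured `small` (%class.anon, %class.anon.0, %class.anon.2), while BothByVal's %class.anon.1 holds only the three pointer captures, since a constexpr passed by value on both sides is never odr-used and host and device already agree on the lambda's layout.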
