Skip to content

Commit 29f1039

Browse files
yxsamliutstellar
authored andcommitted
[CUDA][HIP] Externalize kernels with internal linkage
This patch is a continuation of https://reviews.llvm.org/D123353. Not only kernels in anonymous namespace, but also template kernels with template arguments in anonymous namespace need to be externalized. To be more generic, this patch checks the linkage of a kernel assuming the kernel does not have __global__ attribute. If the linkage is internal then clang will externalize it. This patch also fixes the postfix for externalized symbol since nvptx does not allow '.' in symbol name. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D124189 Fixes: #54560 (cherry picked from commit 04fb816)
1 parent e6de9ed commit 29f1039

File tree

6 files changed

+91
-30
lines changed

6 files changed

+91
-30
lines changed

clang/lib/AST/ASTContext.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12267,7 +12267,9 @@ bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
1226712267
// anonymous name space needs to be externalized to avoid duplicate symbols.
1226812268
return (IsStaticVar &&
1226912269
(D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar)) ||
12270-
(D->hasAttr<CUDAGlobalAttr>() && D->isInAnonymousNamespace());
12270+
(D->hasAttr<CUDAGlobalAttr>() &&
12271+
basicGVALinkageForFunction(*this, cast<FunctionDecl>(D)) ==
12272+
GVA_Internal);
1227112273
}
1227212274

1227312275
bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6647,6 +6647,12 @@ bool CodeGenModule::stopAutoInit() {
66476647

66486648
void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
66496649
const Decl *D) const {
6650-
OS << (isa<VarDecl>(D) ? "__static__" : ".anon.")
6651-
<< getContext().getCUIDHash();
6650+
StringRef Tag;
6651+
// ptxas does not allow '.' in symbol names. On the other hand, HIP prefers
6652+
// postfix beginning with '.' since the symbol name can be demangled.
6653+
if (LangOpts.HIP)
6654+
Tag = (isa<VarDecl>(D) ? ".static." : ".intern.");
6655+
else
6656+
Tag = (isa<VarDecl>(D) ? "__static__" : "__intern__");
6657+
OS << Tag << getContext().getCUIDHash();
66526658
}

clang/test/CodeGenCUDA/device-var-linkage.cu

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1-
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
1+
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
22
// RUN: -emit-llvm -o - -x hip %s \
33
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
4-
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
4+
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
55
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
66
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
7-
// RUN: %clang_cc1 -triple nvptx \
7+
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
88
// RUN: -emit-llvm -o - -x hip %s \
99
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
10-
// RUN: %clang_cc1 -triple nvptx \
10+
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
1111
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
1212
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
13+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
14+
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
15+
// RUN: | FileCheck -check-prefixes=CUDA %s
1316

1417
#include "Inputs/cuda.h"
1518

@@ -24,7 +27,9 @@ __constant__ int v2;
2427
// DEV-DAG: @v3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
2528
// NORDC-H-DAG: @v3 = internal externally_initialized global i32* null
2629
// RDC-H-DAG: @v3 = externally_initialized global i32* null
30+
#if __HIP__
2731
__managed__ int v3;
32+
#endif
2833

2934
// DEV-DAG: @ev1 = external addrspace(1) global i32
3035
// HOST-DAG: @ev1 = external global i32
@@ -34,25 +39,35 @@ extern __device__ int ev1;
3439
extern __constant__ int ev2;
3540
// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global i32 addrspace(1)*
3641
// HOST-DAG: @ev3 = external externally_initialized global i32*
42+
#if __HIP__
3743
extern __managed__ int ev3;
44+
#endif
3845

3946
// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
40-
// RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
47+
// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
4148
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
49+
// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
4250
static __device__ int sv1;
4351
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized global i32 0
44-
// RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
52+
// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
4553
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
54+
// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
4655
static __constant__ int sv2;
4756
// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
48-
// RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
57+
// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
4958
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null
59+
#if __HIP__
5060
static __managed__ int sv3;
61+
#endif
5162

5263
__device__ __host__ int work(int *x);
5364

5465
__device__ __host__ int fun1() {
55-
return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
66+
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
67+
#if __HIP__
68+
+ work(&ev3) + work(&sv3)
69+
#endif
70+
;
5671
}
5772

5873
// HOST: hipRegisterVar({{.*}}@v1

clang/test/CodeGenCUDA/kernel-in-anon-ns.cu

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,53 @@
66
// RUN: -aux-triple amdgcn-amd-amdhsa -std=c++11 -fgpu-rdc \
77
// RUN: -emit-llvm -o - -x hip %s > %t.host
88

9-
// RUN: cat %t.dev %t.host | FileCheck %s
9+
// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=HIP,COMMON %s
10+
11+
// RUN: echo "GPU binary" > %t.fatbin
12+
13+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
14+
// RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
15+
// RUN: -emit-llvm -o - %s > %t.dev
16+
17+
// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
18+
// RUN: -aux-triple nvptx -std=c++11 -fgpu-rdc -fcuda-include-gpubinary %t.fatbin \
19+
// RUN: -emit-llvm -o - %s > %t.host
20+
21+
// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=CUDA,COMMON %s
1022

1123
#include "Inputs/cuda.h"
1224

13-
// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]](
14-
// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00"
15-
// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]]
25+
// HIP-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]](
26+
// HIP-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT_\.intern\.b04fd23c98500190]](
27+
// HIP-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT_\.intern\.b04fd23c98500190]](
28+
29+
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
30+
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
31+
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
32+
33+
// COMMON-DAG: @[[STR1:.*]] = {{.*}} c"[[KERN1]]\00"
34+
// COMMON-DAG: @[[STR2:.*]] = {{.*}} c"[[KERN2]]\00"
35+
// COMMON-DAG: @[[STR3:.*]] = {{.*}} c"[[KERN3]]\00"
36+
37+
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR1]]
38+
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR2]]
39+
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR3]]
40+
41+
42+
template <typename T>
43+
__global__ void tempKern(T x) {}
1644

1745
namespace {
18-
__global__ void kernel() {
19-
}
46+
__global__ void kernel() {}
47+
struct X {};
48+
X x;
49+
auto lambda = [](){};
2050
}
2151

2252
void test() {
2353
kernel<<<1, 1>>>();
54+
55+
tempKern<<<1, 1>>>(x);
56+
57+
tempKern<<<1, 1>>>(lambda);
2458
}

clang/test/CodeGenCUDA/managed-var.cu

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
// REQUIRES: x86-registered-target, amdgpu-registered-target
2-
31
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
42
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
53
// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
@@ -52,15 +50,15 @@ extern __managed__ int ex;
5250

5351
// NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
5452
// NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
55-
// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
56-
// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
53+
// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
54+
// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
5755
// HOST-DAG: @_ZL2sx.managed = internal global i32 1
5856
// HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
5957
// NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
60-
// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
58+
// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
6159

62-
// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
63-
// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
60+
// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
61+
// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
6462
static __managed__ int sx = 1;
6563

6664
// DEV-DAG: @llvm.compiler.used

clang/test/CodeGenCUDA/static-device-var-rdc.cu

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@
4040
// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
4141
// RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
4242

43+
// Check postfix for CUDA.
44+
45+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
46+
// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
47+
// RUN: -check-prefixes=CUDA %s
4348

4449
#include "Inputs/cuda.h"
4550

@@ -55,11 +60,12 @@
5560
// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
5661

5762
// Test externalized static device variables
58-
// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
59-
// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
63+
// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
64+
// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
65+
// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
6066

61-
// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
62-
// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
67+
// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
68+
// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
6369

6470
static __device__ int x;
6571

@@ -73,8 +79,8 @@ static __device__ int x2;
7379
// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
7480

7581
// Test externalized static device variables
76-
// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
77-
// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
82+
// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
83+
// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
7884

7985
static __constant__ int y;
8086

0 commit comments

Comments
 (0)