Skip to content

Commit 27f9b65

Browse files
yxsamliu and github-actions[bot]
authored and committed
Automerge: HIP non-RDC: enable new offload driver on Windows via linker wrapper (#167918)
Use clang linker wrapper to device-link and embed the HIP fat binary directly. Match the CUDA non-RDC flow in the new driver by producing .hipfb like .fatbin. Previously, llvm-offload-binary was used to package the device IRs and embed them in the host object file; clang linker wrapper was then invoked on each host object file to extract the device IRs, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge it with the original host object via the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IRs with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to clang linker wrapper directly, where the device IRs are extracted and linked and the fat binary is generated; the fat binary is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize device linking for different GPUs by using the parallelization feature of clang linker wrapper. Fixes: SWDEV-565994
2 parents 44e37c3 + ea66d26 commit 27f9b65

File tree

10 files changed

+160
-82
lines changed

10 files changed

+160
-82
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12801280
return nullptr;
12811281
}
12821282
if (CGM.getLangOpts().OffloadViaLLVM ||
1283-
(CGM.getLangOpts().OffloadingNewDriver &&
1284-
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
1283+
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
12851284
createOffloadingEntries();
12861285
else
12871286
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4416,10 +4416,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
44164416
options::OPT_no_offload_new_driver,
44174417
C.isOffloadingHostKind(Action::OFK_Cuda));
44184418

4419-
bool HIPNoRDC =
4420-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4421-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4422-
44234419
// Builder to be used to build offloading actions.
44244420
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44254421
!UseNewOffloadingDriver
@@ -4553,7 +4549,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
45534549
// Check if this Linker Job should emit a static library.
45544550
if (ShouldEmitStaticLibrary(Args)) {
45554551
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4556-
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
4552+
} else if (UseNewOffloadingDriver ||
45574553
Args.hasArg(options::OPT_offload_link)) {
45584554
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45594555
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4890,20 +4886,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
48904886
<< "-fhip-emit-relocatable"
48914887
<< "--offload-device-only";
48924888

4893-
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4894-
// action for each host object to link, bundle and wrap device files in
4895-
// it.
4896-
if ((isa<AssembleJobAction>(HostAction) ||
4897-
(isa<BackendJobAction>(HostAction) &&
4898-
HostAction->getType() == types::TY_LTO_BC)) &&
4899-
HIPNoRDC && !offloadDeviceOnly()) {
4900-
ActionList AL{HostAction};
4901-
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4902-
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4903-
/*BoundArch=*/nullptr);
4904-
return HostAction;
4905-
}
4906-
49074889
// Don't build offloading actions if we do not have a compile action. If
49084890
// preprocessing only ignore embedding.
49094891
if (!(isa<CompileJobAction>(HostAction) ||
@@ -5068,6 +5050,21 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50685050
DDep.add(*FatbinAction,
50695051
*C.getOffloadToolChains<Action::OFK_HIP>().first->second, nullptr,
50705052
Action::OFK_HIP);
5053+
} else if (HIPNoRDC) {
5054+
// Package all the offloading actions into a single output that can be
5055+
// embedded in the host and linked.
5056+
Action *PackagerAction =
5057+
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
5058+
5059+
// For HIP non-RDC compilation, wrap the device binary with linker wrapper
5060+
// before bundling with host code. Do not bind a specific GPU arch here,
5061+
// as the packaged image may contain entries for multiple GPUs.
5062+
ActionList AL{PackagerAction};
5063+
PackagerAction =
5064+
C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
5065+
DDep.add(*PackagerAction,
5066+
*C.getOffloadToolChains<Action::OFK_HIP>().first->second,
5067+
/*BoundArch=*/nullptr, Action::OFK_HIP);
50715068
} else {
50725069
// Package all the offloading actions into a single output that can be
50735070
// embedded in the host and linked.
@@ -5197,6 +5194,14 @@ Action *Driver::ConstructPhaseAction(
51975194
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
51985195
}
51995196
case phases::Backend: {
5197+
// Skip a redundant Backend phase for HIP device code when using the new
5198+
// offload driver, where mid-end is done in linker wrapper.
5199+
if (TargetDeviceOffloadKind == Action::OFK_HIP &&
5200+
Args.hasFlag(options::OPT_offload_new_driver,
5201+
options::OPT_no_offload_new_driver, false) &&
5202+
!offloadDeviceOnly())
5203+
return Input;
5204+
52005205
if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
52015206
types::ID Output;
52025207
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
@@ -5216,7 +5221,8 @@ Action *Driver::ConstructPhaseAction(
52165221
if (Args.hasArg(options::OPT_emit_llvm) ||
52175222
TargetDeviceOffloadKind == Action::OFK_SYCL ||
52185223
(((Input->getOffloadingToolChain() &&
5219-
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
5224+
Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
5225+
TargetDeviceOffloadKind != Action::OFK_None) ||
52205226
TargetDeviceOffloadKind == Action::OFK_HIP) &&
52215227
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
52225228
false) ||

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
76367636
CmdArgs.push_back("-fcuda-include-gpubinary");
76377637
CmdArgs.push_back(CudaDeviceInput->getFilename());
76387638
} else if (!HostOffloadingInputs.empty()) {
7639-
if (IsCuda && !IsRDCMode) {
7639+
if ((IsCuda || IsHIP) && !IsRDCMode) {
76407640
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
76417641
CmdArgs.push_back("-fcuda-include-gpubinary");
76427642
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9093,7 +9093,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
90939093
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
90949094
const ToolChain &TC) {
90959095
// CMake hack to avoid printing verbose information for HIP non-RDC mode.
9096-
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
9096+
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
90979097
return false;
90989098
return (Set.contains(A->getOption().getID()) ||
90999099
(A->getOption().getGroup().isValid() &&
@@ -9175,7 +9175,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
91759175
// non-RDC mode compilation. This confuses default CMake implicit linker
91769176
// argument parsing when the language is set to HIP and the system linker is
91779177
// also `ld.lld`.
9178-
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
9178+
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
91799179
CmdArgs.push_back("--wrapper-verbose");
91809180
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
91819181
CmdArgs.push_back(
@@ -9247,14 +9247,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
92479247

92489248
// We use action type to differentiate two use cases of the linker wrapper.
92499249
// TY_Image for normal linker wrapper work.
9250-
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9251-
// object.
9252-
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9253-
if (JA.getType() == types::TY_Object) {
9250+
// TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
9251+
assert(JA.getType() == types::TY_HIP_FATBIN ||
9252+
JA.getType() == types::TY_Image);
9253+
if (JA.getType() == types::TY_HIP_FATBIN) {
9254+
CmdArgs.push_back("--emit-fatbin-only");
92549255
CmdArgs.append({"-o", Output.getFilename()});
92559256
for (auto Input : Inputs)
92569257
CmdArgs.push_back(Input.getFilename());
9257-
CmdArgs.push_back("-r");
92589258
} else
92599259
for (const char *LinkArg : LinkCommand->getArguments())
92609260
CmdArgs.push_back(LinkArg);

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,6 @@
101101
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
102102
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
103103
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
104-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
106-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
104+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]"
106+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o"

clang/test/Driver/hip-phases.hip

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -33,32 +33,33 @@
3333
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
3434
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
3535
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
36-
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3736
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
3837
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3938
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
4039
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
41-
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
40+
// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir
4241
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
43-
// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]])
42+
// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]])
43+
// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
44+
// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
4445
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
4546

4647
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
47-
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
48+
// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
49+
// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
50+
// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir
4851
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
4952
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
5053
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
51-
// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
52-
// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
53-
// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
54-
// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
55-
// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
54+
// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]])
55+
// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]])
56+
// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip)
57+
// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]])
58+
// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]])
5659
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
57-
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
58-
// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
60+
// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]])
5961
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
60-
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
61-
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
62+
// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]])
6263

6364
//
6465
// Test single gpu architecture up to the assemble phase.
@@ -613,7 +614,6 @@
613614
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803)
614615
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900)
615616
// MIXED-DAG: input, "{{.*}}empty.cpp", c++
616-
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip)
617617
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip
618618

619619
// MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip)
@@ -658,17 +658,15 @@
658658
// LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908)
659659
// LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908)
660660
// LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908)
661-
// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908)
662-
// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc
663-
// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a)
664-
// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a)
665-
// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a)
666-
// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a)
667-
// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc
668-
// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip)
669-
// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir
670-
// LTO-NEXT: 15: backend, {14}, assembler, (host-hip)
671-
// LTO-NEXT: 16: assembler, {15}, object, (host-hip)
661+
// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir
662+
// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a)
663+
// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a)
664+
// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a)
665+
// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir
666+
// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip)
667+
// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir
668+
// LTO-NEXT: 13: backend, {12}, assembler, (host-hip)
669+
// LTO-NEXT: 14: assembler, {13}, object, (host-hip)
672670

673671
//
674672
// Test the new driver when not bundling

clang/test/Driver/hip-spirv-translator-new-driver.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
// RUN: | FileCheck %s
77

88
// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
9-
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
9+
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}}

clang/test/Driver/hip-toolchain-no-rdc.hip

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -101,19 +101,19 @@
101101
// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
102102
// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
103103

104+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
105+
// NEW-SAME: "--emit-fatbin-only"
106+
// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
107+
104108
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
105109
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
106110
// CHECK-SAME: "-emit-obj"
107111
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
108112
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
109-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
110-
// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
111-
// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
113+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]"
114+
// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
112115
// CHECK-SAME: {{.*}} [[A_SRC]]
113116

114-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
115-
// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
116-
117117
//
118118
// Compile device code in b.hip to code object for gfx803.
119119
//
@@ -173,19 +173,19 @@
173173
// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
174174
// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
175175

176+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
177+
// NEW-SAME: "--emit-fatbin-only"
178+
// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
179+
176180
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
177181
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
178182
// CHECK-SAME: "-emit-obj"
179183
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
180184
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
181-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
182-
// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
183-
// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
185+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]"
186+
// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
184187
// CHECK-SAME: {{.*}} [[B_SRC]]
185188

186-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
187-
// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
188-
189189
//
190190
// Link host objects.
191191
//
@@ -219,5 +219,5 @@
219219
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
220220
// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
221221
// VERBOSE: clang-linker-wrapper
222-
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
223-
// VERBOSE-NOT: --wrapper-verbose
222+
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb
223+
// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// UNSUPPORTED: system-windows
2+
// REQUIRES: amdgpu-registered-target
3+
4+
// Test HIP non-RDC linker wrapper behavior with new offload driver.
5+
// The linker wrapper should output .hipfb files directly without using -r option.
6+
7+
// An externally visible variable so static libraries extract.
8+
__attribute__((visibility("protected"), used)) int x;
9+
10+
// Create device binaries and package them
11+
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
12+
// RUN: llvm-offload-binary -o %t.out \
13+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \
14+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200
15+
16+
// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC
17+
// The linker wrapper is called directly with the packaged device binary (not embedded in host object)
18+
// Note: When called directly (not through the driver), the linker wrapper processes architectures
19+
// from the packaged binary. The test verifies it can process at least one architecture correctly.
20+
// RUN: clang-linker-wrapper --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.hipfb 2>&1
21+
22+
// Verify the fat binary was created
23+
// RUN: test -f %t.hipfb
24+
25+
// List code objects in the fat binary
26+
// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST
27+
28+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100
29+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1200
30+
// HIP-FATBIN-LIST-DAG: host-x86_64-unknown-linux-gnu-
31+
32+
// Extract code objects for both architectures from the fat binary
33+
// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx1100,hip-amdgcn-amd-amdhsa--gfx1200 \
34+
// RUN: -output=%t.gfx1100.co -output=%t.gfx1200.co -input=%t.hipfb -unbundle
35+
36+
// Verify extracted code objects exist and are not empty
37+
// RUN: test -f %t.gfx1100.co
38+
// RUN: test -s %t.gfx1100.co
39+
// RUN: test -f %t.gfx1200.co
40+
// RUN: test -s %t.gfx1200.co

0 commit comments

Comments
 (0)