Skip to content

Commit ea66d26

Browse files
authored
HIP non-RDC: enable new offload driver on Windows via linker wrapper (#167918)
Use the clang linker wrapper to device-link and embed the HIP fat binary directly. Match the CUDA non-RDC flow in the new driver by producing a .hipfb, analogous to .fatbin. Previously, llvm-offload-binary was used to package the device IRs and embed them in the host object file; the clang linker wrapper was then run on each host object file to extract the device IRs, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge it with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IRs with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to the clang linker wrapper directly, where the device IRs are extracted and linked and the fat binary is generated, which is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize device linking for different GPUs by using the parallelization feature of the clang linker wrapper. Fixes: SWDEV-565994
1 parent 820daa5 commit ea66d26

File tree

10 files changed

+160
-82
lines changed

10 files changed

+160
-82
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12801280
return nullptr;
12811281
}
12821282
if (CGM.getLangOpts().OffloadViaLLVM ||
1283-
(CGM.getLangOpts().OffloadingNewDriver &&
1284-
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
1283+
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
12851284
createOffloadingEntries();
12861285
else
12871286
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4413,10 +4413,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
44134413
options::OPT_no_offload_new_driver,
44144414
C.isOffloadingHostKind(Action::OFK_Cuda));
44154415

4416-
bool HIPNoRDC =
4417-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4418-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4419-
44204416
// Builder to be used to build offloading actions.
44214417
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44224418
!UseNewOffloadingDriver
@@ -4550,7 +4546,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
45504546
// Check if this Linker Job should emit a static library.
45514547
if (ShouldEmitStaticLibrary(Args)) {
45524548
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4553-
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
4549+
} else if (UseNewOffloadingDriver ||
45544550
Args.hasArg(options::OPT_offload_link)) {
45554551
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45564552
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4887,20 +4883,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
48874883
<< "-fhip-emit-relocatable"
48884884
<< "--offload-device-only";
48894885

4890-
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4891-
// action for each host object to link, bundle and wrap device files in
4892-
// it.
4893-
if ((isa<AssembleJobAction>(HostAction) ||
4894-
(isa<BackendJobAction>(HostAction) &&
4895-
HostAction->getType() == types::TY_LTO_BC)) &&
4896-
HIPNoRDC && !offloadDeviceOnly()) {
4897-
ActionList AL{HostAction};
4898-
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4899-
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4900-
/*BoundArch=*/nullptr);
4901-
return HostAction;
4902-
}
4903-
49044886
// Don't build offloading actions if we do not have a compile action. If
49054887
// preprocessing only ignore embedding.
49064888
if (!(isa<CompileJobAction>(HostAction) ||
@@ -5065,6 +5047,21 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50655047
DDep.add(*FatbinAction,
50665048
*C.getOffloadToolChains<Action::OFK_HIP>().first->second, nullptr,
50675049
Action::OFK_HIP);
5050+
} else if (HIPNoRDC) {
5051+
// Package all the offloading actions into a single output that can be
5052+
// embedded in the host and linked.
5053+
Action *PackagerAction =
5054+
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
5055+
5056+
// For HIP non-RDC compilation, wrap the device binary with linker wrapper
5057+
// before bundling with host code. Do not bind a specific GPU arch here,
5058+
// as the packaged image may contain entries for multiple GPUs.
5059+
ActionList AL{PackagerAction};
5060+
PackagerAction =
5061+
C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
5062+
DDep.add(*PackagerAction,
5063+
*C.getOffloadToolChains<Action::OFK_HIP>().first->second,
5064+
/*BoundArch=*/nullptr, Action::OFK_HIP);
50685065
} else {
50695066
// Package all the offloading actions into a single output that can be
50705067
// embedded in the host and linked.
@@ -5194,6 +5191,14 @@ Action *Driver::ConstructPhaseAction(
51945191
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
51955192
}
51965193
case phases::Backend: {
5194+
// Skip a redundant Backend phase for HIP device code when using the new
5195+
// offload driver, where mid-end is done in linker wrapper.
5196+
if (TargetDeviceOffloadKind == Action::OFK_HIP &&
5197+
Args.hasFlag(options::OPT_offload_new_driver,
5198+
options::OPT_no_offload_new_driver, false) &&
5199+
!offloadDeviceOnly())
5200+
return Input;
5201+
51975202
if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
51985203
types::ID Output;
51995204
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
@@ -5213,7 +5218,8 @@ Action *Driver::ConstructPhaseAction(
52135218
if (Args.hasArg(options::OPT_emit_llvm) ||
52145219
TargetDeviceOffloadKind == Action::OFK_SYCL ||
52155220
(((Input->getOffloadingToolChain() &&
5216-
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
5221+
Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
5222+
TargetDeviceOffloadKind != Action::OFK_None) ||
52175223
TargetDeviceOffloadKind == Action::OFK_HIP) &&
52185224
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
52195225
false) ||

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
76367636
CmdArgs.push_back("-fcuda-include-gpubinary");
76377637
CmdArgs.push_back(CudaDeviceInput->getFilename());
76387638
} else if (!HostOffloadingInputs.empty()) {
7639-
if (IsCuda && !IsRDCMode) {
7639+
if ((IsCuda || IsHIP) && !IsRDCMode) {
76407640
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
76417641
CmdArgs.push_back("-fcuda-include-gpubinary");
76427642
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9093,7 +9093,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
90939093
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
90949094
const ToolChain &TC) {
90959095
// CMake hack to avoid printing verbose information for HIP non-RDC mode.
9096-
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
9096+
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
90979097
return false;
90989098
return (Set.contains(A->getOption().getID()) ||
90999099
(A->getOption().getGroup().isValid() &&
@@ -9175,7 +9175,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
91759175
// non-RDC mode compilation. This confuses default CMake implicit linker
91769176
// argument parsing when the language is set to HIP and the system linker is
91779177
// also `ld.lld`.
9178-
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
9178+
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
91799179
CmdArgs.push_back("--wrapper-verbose");
91809180
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
91819181
CmdArgs.push_back(
@@ -9247,14 +9247,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
92479247

92489248
// We use action type to differentiate two use cases of the linker wrapper.
92499249
// TY_Image for normal linker wrapper work.
9250-
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9251-
// object.
9252-
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9253-
if (JA.getType() == types::TY_Object) {
9250+
// TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
9251+
assert(JA.getType() == types::TY_HIP_FATBIN ||
9252+
JA.getType() == types::TY_Image);
9253+
if (JA.getType() == types::TY_HIP_FATBIN) {
9254+
CmdArgs.push_back("--emit-fatbin-only");
92549255
CmdArgs.append({"-o", Output.getFilename()});
92559256
for (auto Input : Inputs)
92569257
CmdArgs.push_back(Input.getFilename());
9257-
CmdArgs.push_back("-r");
92589258
} else
92599259
for (const char *LinkArg : LinkCommand->getArguments())
92609260
CmdArgs.push_back(LinkArg);

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,6 @@
101101
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
102102
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
103103
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
104-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
106-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
104+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]"
106+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o"

clang/test/Driver/hip-phases.hip

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -33,32 +33,33 @@
3333
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
3434
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
3535
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
36-
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3736
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
3837
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3938
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
4039
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
41-
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
40+
// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir
4241
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
43-
// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]])
42+
// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]])
43+
// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
44+
// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
4445
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
4546

4647
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
47-
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
48+
// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
49+
// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
50+
// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir
4851
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
4952
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
5053
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
51-
// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
52-
// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
53-
// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
54-
// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
55-
// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
54+
// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]])
55+
// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]])
56+
// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip)
57+
// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]])
58+
// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]])
5659
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
57-
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
58-
// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
60+
// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]])
5961
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
60-
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
61-
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
62+
// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]])
6263

6364
//
6465
// Test single gpu architecture up to the assemble phase.
@@ -613,7 +614,6 @@
613614
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803)
614615
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900)
615616
// MIXED-DAG: input, "{{.*}}empty.cpp", c++
616-
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip)
617617
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip
618618

619619
// MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip)
@@ -658,17 +658,15 @@
658658
// LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908)
659659
// LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908)
660660
// LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908)
661-
// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908)
662-
// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc
663-
// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a)
664-
// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a)
665-
// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a)
666-
// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a)
667-
// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc
668-
// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip)
669-
// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir
670-
// LTO-NEXT: 15: backend, {14}, assembler, (host-hip)
671-
// LTO-NEXT: 16: assembler, {15}, object, (host-hip)
661+
// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir
662+
// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a)
663+
// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a)
664+
// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a)
665+
// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir
666+
// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip)
667+
// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir
668+
// LTO-NEXT: 13: backend, {12}, assembler, (host-hip)
669+
// LTO-NEXT: 14: assembler, {13}, object, (host-hip)
672670

673671
//
674672
// Test the new driver when not bundling

clang/test/Driver/hip-spirv-translator-new-driver.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
// RUN: | FileCheck %s
77

88
// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
9-
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
9+
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}}

clang/test/Driver/hip-toolchain-no-rdc.hip

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -101,19 +101,19 @@
101101
// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
102102
// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
103103

104+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
105+
// NEW-SAME: "--emit-fatbin-only"
106+
// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
107+
104108
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
105109
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
106110
// CHECK-SAME: "-emit-obj"
107111
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
108112
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
109-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
110-
// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
111-
// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
113+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]"
114+
// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
112115
// CHECK-SAME: {{.*}} [[A_SRC]]
113116

114-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
115-
// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
116-
117117
//
118118
// Compile device code in b.hip to code object for gfx803.
119119
//
@@ -173,19 +173,19 @@
173173
// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
174174
// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
175175

176+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
177+
// NEW-SAME: "--emit-fatbin-only"
178+
// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
179+
176180
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
177181
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
178182
// CHECK-SAME: "-emit-obj"
179183
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
180184
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
181-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
182-
// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
183-
// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
185+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]"
186+
// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
184187
// CHECK-SAME: {{.*}} [[B_SRC]]
185188

186-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
187-
// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
188-
189189
//
190190
// Link host objects.
191191
//
@@ -219,5 +219,5 @@
219219
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
220220
// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
221221
// VERBOSE: clang-linker-wrapper
222-
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
223-
// VERBOSE-NOT: --wrapper-verbose
222+
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb
223+
// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// UNSUPPORTED: system-windows
2+
// REQUIRES: amdgpu-registered-target
3+
4+
// Test HIP non-RDC linker wrapper behavior with new offload driver.
5+
// The linker wrapper should output .hipfb files directly without using -r option.
6+
7+
// An externally visible variable so static libraries extract.
8+
__attribute__((visibility("protected"), used)) int x;
9+
10+
// Create device binaries and package them
11+
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
12+
// RUN: llvm-offload-binary -o %t.out \
13+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \
14+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200
15+
16+
// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC
17+
// The linker wrapper is called directly with the packaged device binary (not embedded in host object)
18+
// Note: When called directly (not through the driver), the linker wrapper processes architectures
19+
// from the packaged binary. The test verifies it can process at least one architecture correctly.
20+
// RUN: clang-linker-wrapper --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.hipfb 2>&1
21+
22+
// Verify the fat binary was created
23+
// RUN: test -f %t.hipfb
24+
25+
// List code objects in the fat binary
26+
// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST
27+
28+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100
29+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1200
30+
// HIP-FATBIN-LIST-DAG: host-x86_64-unknown-linux-gnu-
31+
32+
// Extract code objects for both architectures from the fat binary
33+
// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx1100,hip-amdgcn-amd-amdhsa--gfx1200 \
34+
// RUN: -output=%t.gfx1100.co -output=%t.gfx1200.co -input=%t.hipfb -unbundle
35+
36+
// Verify extracted code objects exist and are not empty
37+
// RUN: test -f %t.gfx1100.co
38+
// RUN: test -s %t.gfx1100.co
39+
// RUN: test -f %t.gfx1200.co
40+
// RUN: test -s %t.gfx1200.co

0 commit comments

Comments
 (0)