Skip to content

Commit 66bef6e

Browse files
committed
HIP non-RDC: enable new offload driver on Windows via linker wrapper
Use clang-linker-wrapper to device-link the HIP code and embed the resulting fat binary directly. This matches the CUDA non-RDC flow in the new driver by producing a .hipfb file, analogous to .fatbin. Previously, llvm-offload-binary was used to package the device IR files and embed them in the host object file; clang-linker-wrapper was then run on each host object file to extract the device IR, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge that with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IR with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package directly to clang-linker-wrapper, where the device IR is extracted and linked and the fat binary is generated; the fat binary is then embedded directly in the host object. Compared with the old offload driver, this approach can parallelize device linking across different GPUs by using the parallelization feature of clang-linker-wrapper.
1 parent 182c415 commit 66bef6e

File tree

9 files changed

+194
-98
lines changed

9 files changed

+194
-98
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12801280
return nullptr;
12811281
}
12821282
if (CGM.getLangOpts().OffloadViaLLVM ||
1283-
(CGM.getLangOpts().OffloadingNewDriver &&
1284-
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
1283+
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
12851284
createOffloadingEntries();
12861285
else
12871286
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4413,10 +4413,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
44134413
options::OPT_no_offload_new_driver,
44144414
C.isOffloadingHostKind(Action::OFK_Cuda));
44154415

4416-
bool HIPNoRDC =
4417-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4418-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4419-
44204416
// Builder to be used to build offloading actions.
44214417
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44224418
!UseNewOffloadingDriver
@@ -4550,7 +4546,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
45504546
// Check if this Linker Job should emit a static library.
45514547
if (ShouldEmitStaticLibrary(Args)) {
45524548
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4553-
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
4549+
} else if (UseNewOffloadingDriver ||
45544550
Args.hasArg(options::OPT_offload_link)) {
45554551
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45564552
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4887,20 +4883,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
48874883
<< "-fhip-emit-relocatable"
48884884
<< "--offload-device-only";
48894885

4890-
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4891-
// action for each host object to link, bundle and wrap device files in
4892-
// it.
4893-
if ((isa<AssembleJobAction>(HostAction) ||
4894-
(isa<BackendJobAction>(HostAction) &&
4895-
HostAction->getType() == types::TY_LTO_BC)) &&
4896-
HIPNoRDC && !offloadDeviceOnly()) {
4897-
ActionList AL{HostAction};
4898-
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4899-
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4900-
/*BoundArch=*/nullptr);
4901-
return HostAction;
4902-
}
4903-
49044886
// Don't build offloading actions if we do not have a compile action. If
49054887
// preprocessing only ignore embedding.
49064888
if (!(isa<CompileJobAction>(HostAction) ||
@@ -5057,14 +5039,31 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50575039
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
50585040
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
50595041
nullptr, Action::OFK_Cuda);
5060-
} else if (HIPNoRDC && offloadDeviceOnly()) {
5061-
// If we are in device-only non-RDC-mode we just emit the final HIP
5062-
// fatbinary for each translation unit, linking each input individually.
5063-
Action *FatbinAction =
5064-
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
5065-
DDep.add(*FatbinAction,
5066-
*C.getOffloadToolChains<Action::OFK_HIP>().first->second, nullptr,
5067-
Action::OFK_HIP);
5042+
} else if (HIPNoRDC) {
5043+
if (offloadDeviceOnly()) {
5044+
// If we are in device-only non-RDC-mode we just emit the final HIP
5045+
// fatbinary for each translation unit, linking each input individually.
5046+
Action *FatbinAction =
5047+
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
5048+
DDep.add(*FatbinAction,
5049+
*C.getOffloadToolChains<Action::OFK_HIP>().first->second,
5050+
nullptr, Action::OFK_HIP);
5051+
} else {
5052+
// Package all the offloading actions into a single output that can be
5053+
// embedded in the host and linked.
5054+
Action *PackagerAction = C.MakeAction<OffloadPackagerJobAction>(
5055+
OffloadActions, types::TY_Image);
5056+
5057+
// For HIP non-RDC compilation, wrap the device binary with linker wrapper
5058+
// before bundling with host code. Do not bind a specific GPU arch here,
5059+
// as the packaged image may contain entries for multiple GPUs.
5060+
ActionList AL{PackagerAction};
5061+
PackagerAction =
5062+
C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
5063+
DDep.add(*PackagerAction,
5064+
*C.getOffloadToolChains<Action::OFK_HIP>().first->second,
5065+
/*BoundArch=*/nullptr, Action::OFK_HIP);
5066+
}
50685067
} else {
50695068
// Package all the offloading actions into a single output that can be
50705069
// embedded in the host and linked.
@@ -5194,6 +5193,14 @@ Action *Driver::ConstructPhaseAction(
51945193
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
51955194
}
51965195
case phases::Backend: {
5196+
// Skip a redundant Backend phase for HIP device code when using the new
5197+
// offload driver, where mid-end is done in linker wrapper.
5198+
if (TargetDeviceOffloadKind == Action::OFK_HIP &&
5199+
Args.hasFlag(options::OPT_offload_new_driver,
5200+
options::OPT_no_offload_new_driver, false) &&
5201+
!offloadDeviceOnly())
5202+
return Input;
5203+
51975204
if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
51985205
types::ID Output;
51995206
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
@@ -5213,7 +5220,8 @@ Action *Driver::ConstructPhaseAction(
52135220
if (Args.hasArg(options::OPT_emit_llvm) ||
52145221
TargetDeviceOffloadKind == Action::OFK_SYCL ||
52155222
(((Input->getOffloadingToolChain() &&
5216-
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
5223+
Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
5224+
TargetDeviceOffloadKind != Action::OFK_None) ||
52175225
TargetDeviceOffloadKind == Action::OFK_HIP) &&
52185226
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
52195227
false) ||

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
76367636
CmdArgs.push_back("-fcuda-include-gpubinary");
76377637
CmdArgs.push_back(CudaDeviceInput->getFilename());
76387638
} else if (!HostOffloadingInputs.empty()) {
7639-
if (IsCuda && !IsRDCMode) {
7639+
if ((IsCuda || IsHIP) && !IsRDCMode) {
76407640
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
76417641
CmdArgs.push_back("-fcuda-include-gpubinary");
76427642
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9086,7 +9086,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
90869086
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
90879087
const ToolChain &TC) {
90889088
// CMake hack to avoid printing verbose informatoin for HIP non-RDC mode.
9089-
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
9089+
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
90909090
return false;
90919091
return (Set.contains(A->getOption().getID()) ||
90929092
(A->getOption().getGroup().isValid() &&
@@ -9168,7 +9168,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
91689168
// non-RDC mode compilation. This confuses default CMake implicit linker
91699169
// argument parsing when the language is set to HIP and the system linker is
91709170
// also `ld.lld`.
9171-
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
9171+
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
91729172
CmdArgs.push_back("--wrapper-verbose");
91739173
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
91749174
CmdArgs.push_back(
@@ -9240,14 +9240,13 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
92409240

92419241
// We use action type to differentiate two use cases of the linker wrapper.
92429242
// TY_Image for normal linker wrapper work.
9243-
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9244-
// object.
9245-
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9246-
if (JA.getType() == types::TY_Object) {
9243+
// TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
9244+
assert(JA.getType() == types::TY_HIP_FATBIN ||
9245+
JA.getType() == types::TY_Image);
9246+
if (JA.getType() == types::TY_HIP_FATBIN) {
92479247
CmdArgs.append({"-o", Output.getFilename()});
92489248
for (auto Input : Inputs)
92499249
CmdArgs.push_back(Input.getFilename());
9250-
CmdArgs.push_back("-r");
92519250
} else
92529251
for (const char *LinkArg : LinkCommand->getArguments())
92539252
CmdArgs.push_back(LinkArg);

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,6 @@
101101
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
102102
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
103103
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
104-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
106-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
104+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
105+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]"
106+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o"

clang/test/Driver/hip-phases.hip

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -33,32 +33,33 @@
3333
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
3434
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
3535
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
36-
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3736
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
3837
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
3938
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
4039
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
41-
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
40+
// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir
4241
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
43-
// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]])
42+
// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]])
43+
// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
44+
// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
4445
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
4546

4647
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
47-
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
48+
// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
49+
// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
50+
// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir
4851
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
4952
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
5053
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
51-
// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
52-
// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
53-
// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
54-
// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
55-
// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
54+
// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]])
55+
// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]])
56+
// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip)
57+
// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]])
58+
// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]])
5659
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
57-
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
58-
// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
60+
// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]])
5961
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
60-
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
61-
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
62+
// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]])
6263

6364
//
6465
// Test single gpu architecture up to the assemble phase.
@@ -613,7 +614,6 @@
613614
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803)
614615
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900)
615616
// MIXED-DAG: input, "{{.*}}empty.cpp", c++
616-
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip)
617617
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip
618618

619619
// MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip)
@@ -658,17 +658,15 @@
658658
// LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908)
659659
// LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908)
660660
// LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908)
661-
// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908)
662-
// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc
663-
// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a)
664-
// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a)
665-
// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a)
666-
// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a)
667-
// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc
668-
// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip)
669-
// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir
670-
// LTO-NEXT: 15: backend, {14}, assembler, (host-hip)
671-
// LTO-NEXT: 16: assembler, {15}, object, (host-hip)
661+
// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir
662+
// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a)
663+
// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a)
664+
// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a)
665+
// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir
666+
// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip)
667+
// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir
668+
// LTO-NEXT: 13: backend, {12}, assembler, (host-hip)
669+
// LTO-NEXT: 14: assembler, {13}, object, (host-hip)
672670

673671
//
674672
// Test the new driver when not bundling

clang/test/Driver/hip-spirv-translator-new-driver.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
// RUN: | FileCheck %s
77

88
// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
9-
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
9+
// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}}

clang/test/Driver/hip-toolchain-no-rdc.hip

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -101,19 +101,18 @@
101101
// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
102102
// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
103103

104+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
105+
// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
106+
104107
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
105108
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
106109
// CHECK-SAME: "-emit-obj"
107110
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
108111
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
109-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
110-
// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
111-
// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
112+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]"
113+
// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
112114
// CHECK-SAME: {{.*}} [[A_SRC]]
113115

114-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
115-
// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
116-
117116
//
118117
// Compile device code in b.hip to code object for gfx803.
119118
//
@@ -173,19 +172,18 @@
173172
// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
174173
// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
175174

175+
// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
176+
// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
177+
176178
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
177179
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
178180
// CHECK-SAME: "-emit-obj"
179181
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
180182
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
181-
// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
182-
// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
183-
// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
183+
// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]"
184+
// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
184185
// CHECK-SAME: {{.*}} [[B_SRC]]
185186

186-
// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
187-
// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
188-
189187
//
190188
// Link host objects.
191189
//
@@ -219,5 +217,5 @@
219217
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
220218
// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
221219
// VERBOSE: clang-linker-wrapper
222-
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
223-
// VERBOSE-NOT: --wrapper-verbose
220+
// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb
221+
// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// UNSUPPORTED: system-windows
2+
// REQUIRES: amdgpu-registered-target
3+
4+
// Test HIP non-RDC linker wrapper behavior with new offload driver.
5+
// The linker wrapper should output .hipfb files directly without using -r option.
6+
7+
// An externally visible variable so static libraries extract.
8+
__attribute__((visibility("protected"), used)) int x;
9+
10+
// Create device binaries and package them
11+
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
12+
// RUN: llvm-offload-binary -o %t.out \
13+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \
14+
// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200
15+
16+
// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC
17+
// The linker wrapper is called directly with the packaged device binary (not embedded in host object)
18+
// Note: When called directly (not through the driver), the linker wrapper processes architectures
19+
// from the packaged binary. The test verifies it can process at least one architecture correctly.
20+
// RUN: clang-linker-wrapper %t.out -o %t.hipfb 2>&1
21+
22+
// Verify the fat binary was created
23+
// RUN: test -f %t.hipfb
24+
25+
// List code objects in the fat binary
26+
// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST
27+
28+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100
29+
// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1200
30+
// HIP-FATBIN-LIST-DAG: host-x86_64-unknown-linux-gnu-
31+
32+
// Extract code objects for both architectures from the fat binary
33+
// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx1100,hip-amdgcn-amd-amdhsa--gfx1200 \
34+
// RUN: -output=%t.gfx1100.co -output=%t.gfx1200.co -input=%t.hipfb -unbundle
35+
36+
// Verify extracted code objects exist and are not empty
37+
// RUN: test -f %t.gfx1100.co
38+
// RUN: test -s %t.gfx1100.co
39+
// RUN: test -f %t.gfx1200.co
40+
// RUN: test -s %t.gfx1200.co

0 commit comments

Comments
 (0)