Skip to content

Commit 2f71c26

Browse files
[Driver][SYCL] Enable --offload-arch support for SYCL offloading.
1 parent 8337d01 commit 2f71c26

File tree

17 files changed

+928
-14
lines changed

17 files changed

+928
-14
lines changed

clang/include/clang/Basic/Cuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ enum class OffloadArch {
106106
GFX90a,
107107
GFX90c,
108108
GFX9_4_GENERIC,
109+
GFX940,
110+
GFX941,
109111
GFX942,
110112
GFX950,
111113
GFX10_1_GENERIC,

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -843,4 +843,14 @@ def warn_missing_include_dirs : Warning<
843843

844844
def err_drv_malformed_warning_suppression_mapping : Error<
845845
"failed to process suppression mapping file '%0': %1">;
846+
847+
def err_drv_sycl_offload_arch_missing_value : Error<
848+
"must pass in an explicit cpu or gpu architecture to '--offload-arch'">;
849+
850+
def err_drv_invalid_sycl_target : Error<"SYCL target is invalid: '%0'">;
851+
852+
def warn_drv_sycl_offload_target_duplicate : Warning<
853+
"SYCL offloading target '%0' is similar to target '%1' already specified; "
854+
"will be ignored">, InGroup<SyclTarget>;
855+
846856
}

clang/include/clang/Basic/DiagnosticGroups.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,3 +1628,7 @@ def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-stor
16281628

16291629
// A warning for options that enable a feature that is not yet complete
16301630
def ExperimentalOption : DiagGroup<"experimental-option">;
1631+
1632+
// SYCL Warnings
1633+
def SyclTarget : DiagGroup<"sycl-target">;
1634+

clang/include/clang/Driver/Driver.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,23 @@ class Driver {
846846
/// Compute the default -fmodule-cache-path.
847847
/// \return True if the system provides a default cache directory.
848848
static bool getDefaultModuleCachePath(SmallVectorImpl<char> &Result);
849+
850+
/// Vector of Macros that need to be added to the Host compilation in a
851+
/// SYCL based offloading scenario. These macros are gathered during
852+
/// construction of the device compilations.
853+
mutable std::vector<std::string> SYCLTargetMacroArgs;
854+
855+
/// addSYCLTargetMacroArg - Add the given macro to the vector of args to be
856+
/// added to the host compilation step.
857+
void addSYCLTargetMacroArg(const llvm::opt::ArgList &Args,
858+
StringRef Macro) const {
859+
SYCLTargetMacroArgs.push_back(Args.MakeArgString(Macro));
860+
}
861+
862+
/// getSYCLTargetMacroArgs - return the previously gathered macro target args.
863+
llvm::ArrayRef<std::string> getSYCLTargetMacroArgs() const {
864+
return SYCLTargetMacroArgs;
865+
}
849866
};
850867

851868
/// \return True if the last defined optimization level is -Ofast.

clang/lib/Basic/Cuda.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ static const OffloadArchToStringMap arch_names[] = {
124124
GFX(90a), // gfx90a
125125
GFX(90c), // gfx90c
126126
{OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"},
127+
GFX(940), // gfx940
128+
GFX(941), // gfx941
127129
GFX(942), // gfx942
128130
GFX(950), // gfx950
129131
{OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"},

clang/lib/Driver/Driver.cpp

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -833,10 +833,14 @@ Driver::OpenMPRuntimeKind Driver::getOpenMPRuntime(const ArgList &Args) const {
833833

834834
static llvm::Triple getSYCLDeviceTriple(StringRef TargetArch) {
835835
SmallVector<StringRef, 5> SYCLAlias = {"spir", "spir64", "spirv", "spirv32",
836-
"spirv64"};
836+
"spirv64", "spir64_x86_64",
837+
"spir64_gen", "nvptx64"};
837838
if (llvm::is_contained(SYCLAlias, TargetArch)) {
838839
llvm::Triple TargetTriple;
839840
TargetTriple.setArchName(TargetArch);
841+
// Return the full SYCL target triple string for NVidia GPU targets.
842+
if (TargetTriple.getArch() == llvm::Triple::nvptx64)
843+
return llvm::Triple("nvptx64-nvidia-cuda");
840844
TargetTriple.setVendor(llvm::Triple::UnknownVendor);
841845
TargetTriple.setOS(llvm::Triple::UnknownOS);
842846
return TargetTriple;
@@ -846,16 +850,25 @@ static llvm::Triple getSYCLDeviceTriple(StringRef TargetArch) {
846850

847851
static bool addSYCLDefaultTriple(Compilation &C,
848852
SmallVectorImpl<llvm::Triple> &SYCLTriples) {
853+
854+
llvm::Triple DefaultTriple = getSYCLDeviceTriple(
855+
C.getDefaultToolChain().getTriple().isArch32Bit() ? "spirv32"
856+
: "spirv64");
857+
for (const auto &SYCLTriple : SYCLTriples) {
858+
if (SYCLTriple == DefaultTriple)
859+
return false;
860+
// If we encounter a known non-spir* target, do not add the default triple.
861+
if (SYCLTriple.isNVPTX() || SYCLTriple.isAMDGCN())
862+
return false;
863+
if(SYCLTriple.isSPIRAOT())
864+
return false;
865+
}
849866
// Check current set of triples to see if the default has already been set.
850867
for (const auto &SYCLTriple : SYCLTriples) {
851868
if (SYCLTriple.getSubArch() == llvm::Triple::NoSubArch &&
852869
SYCLTriple.isSPIROrSPIRV())
853870
return false;
854871
}
855-
// Add the default triple as it was not found.
856-
llvm::Triple DefaultTriple = getSYCLDeviceTriple(
857-
C.getDefaultToolChain().getTriple().isArch32Bit() ? "spirv32"
858-
: "spirv64");
859872
SYCLTriples.insert(SYCLTriples.begin(), DefaultTriple);
860873
return true;
861874
}
@@ -1066,19 +1079,119 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
10661079
// -ffreestanding cannot be used with -fsycl
10671080
argSYCLIncompatible(options::OPT_ffreestanding);
10681081

1082+
// Map of SYCL target triple strings to their corresponding target archs.
1083+
// Example: spir64_x86_64 --> SKYLAKEAVX512
1084+
llvm::StringMap<llvm::DenseSet<StringRef>> DerivedArchs;
1085+
llvm::StringMap<StringRef> FoundNormalizedTriples;
10691086
llvm::SmallVector<llvm::Triple, 4> UniqueSYCLTriplesVec;
1070-
1087+
// StringSet to contain SYCL target triples.
1088+
llvm::StringSet<> SYCLTriples;
1089+
// If the user specified --offload-arch, deduce the offloading
1090+
// target triple(s) from the set of architecture(s).
1091+
// Create a toolchain for each valid triple.
1092+
// We do not support SYCL offloading if any of the inputs is a
1093+
// .cu (for CUDA type) or .hip (for HIP type) file.
10711094
if (IsSYCL) {
1072-
addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
1095+
if(C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && !IsHIP &&
1096+
!IsCuda) {
1097+
1098+
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
1099+
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
1100+
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
1101+
HostTC->getTriple());
1102+
1103+
// Attempt to deduce the offloading triple from the set of architectures.
1104+
// We need to temporarily create these toolchains so that we can access
1105+
// tools for inferring architectures.
1106+
llvm::DenseSet<StringRef> Archs;
1107+
if (NVPTXTriple) {
1108+
auto TempTC = std::make_unique<toolchains::CudaToolChain>(
1109+
*this, *NVPTXTriple, *HostTC, C.getInputArgs());
1110+
for (StringRef Arch :
1111+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1112+
Archs.insert(Arch);
1113+
}
1114+
if (AMDTriple) {
1115+
auto TempTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
1116+
*this, *AMDTriple, *HostTC, C.getInputArgs());
1117+
for (StringRef Arch :
1118+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1119+
Archs.insert(Arch);
1120+
}
1121+
if (!AMDTriple && !NVPTXTriple) {
1122+
for (StringRef Arch :
1123+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, nullptr, true))
1124+
Archs.insert(Arch);
1125+
}
1126+
for (StringRef Arch : Archs) {
1127+
if (NVPTXTriple && IsSYCLSupportedNVidiaGPUArch(StringToOffloadArch(
1128+
getProcessorFromTargetID(*NVPTXTriple, Arch)))) {
1129+
DerivedArchs[NVPTXTriple->getTriple()].insert(Arch);
1130+
} else if (AMDTriple &&
1131+
IsSYCLSupportedAMDGPUArch(StringToOffloadArch(
1132+
getProcessorFromTargetID(*AMDTriple, Arch)))) {
1133+
DerivedArchs[AMDTriple->getTriple()].insert(Arch);
1134+
} else if (IsSYCLSupportedIntelCPUArch(StringToOffloadArchSYCL(Arch))) {
1135+
DerivedArchs[getSYCLDeviceTriple("spir64_x86_64").getTriple()].insert(
1136+
Arch);
1137+
} else if (IsSYCLSupportedIntelGPUArch(StringToOffloadArchSYCL(Arch))) {
1138+
StringRef IntelGPUArch;
1139+
// For Intel Graphics AOT target, valid values for '--offload-arch'
1140+
// are mapped to valid device names accepted by OCLOC (the Intel GPU AOT
1141+
// compiler) via the '-device' option. The mapIntelGPUArchName
1142+
// function maps the accepted values for '--offload-arch' to enable SYCL
1143+
// offloading to Intel GPUs and the corresponding '-device' value passed
1144+
// to OCLOC.
1145+
IntelGPUArch = mapIntelGPUArchName(Arch).data();
1146+
DerivedArchs[getSYCLDeviceTriple("spir64_gen").getTriple()].insert(
1147+
IntelGPUArch);
1148+
} else {
1149+
Diag(clang::diag::err_drv_invalid_sycl_target) << Arch;
1150+
return;
1151+
}
1152+
}
1153+
// Emit an error if architecture value is not provided
1154+
// to --offload-arch.
1155+
if (Archs.empty()) {
1156+
Diag(clang::diag::err_drv_sycl_offload_arch_missing_value);
1157+
return;
1158+
}
1159+
1160+
for (const auto &TripleAndArchs : DerivedArchs)
1161+
SYCLTriples.insert(TripleAndArchs.first());
1162+
1163+
for (const auto &Val : SYCLTriples) {
1164+
llvm::Triple SYCLTargetTriple(getSYCLDeviceTriple(Val.getKey()));
1165+
std::string NormalizedName = SYCLTargetTriple.normalize();
1166+
1167+
// Make sure we don't have a duplicate triple.
1168+
auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
1169+
if (Duplicate != FoundNormalizedTriples.end()) {
1170+
Diag(clang::diag::warn_drv_sycl_offload_target_duplicate)
1171+
<< Val.getKey() << Duplicate->second;
1172+
continue;
1173+
}
1174+
1175+
// Store the current triple so that we can check for duplicates in the
1176+
// following iterations.
1177+
FoundNormalizedTriples[NormalizedName] = Val.getKey();
1178+
UniqueSYCLTriplesVec.push_back(SYCLTargetTriple);
1179+
}
1180+
1181+
addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
1182+
} else
1183+
addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
10731184

10741185
// We'll need to use the SYCL and host triples as the key into
1075-
// getOffloadingDeviceToolChain, because the device toolchains we're
1186+
// getOffloadToolChain, because the device toolchains we're
10761187
// going to create will depend on both.
10771188
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
10781189
for (const auto &TT : UniqueSYCLTriplesVec) {
10791190
auto SYCLTC = &getOffloadToolChain(C.getInputArgs(), Action::OFK_SYCL, TT,
10801191
HostTC->getTriple());
10811192
C.addOffloadDeviceToolChain(SYCLTC, Action::OFK_SYCL);
1193+
if (DerivedArchs.contains(TT.getTriple()))
1194+
KnownArchs[SYCLTC] = DerivedArchs[TT.getTriple()];
10821195
}
10831196
}
10841197

@@ -6596,7 +6709,7 @@ const ToolChain &Driver::getOffloadToolChain(
65966709
if (Kind == Action::OFK_HIP)
65976710
TC = std::make_unique<toolchains::HIPAMDToolChain>(*this, Target,
65986711
*HostTC, Args);
6599-
else if (Kind == Action::OFK_OpenMP)
6712+
else if ((Kind == Action::OFK_OpenMP) || (Kind == Action::OFK_SYCL))
66006713
TC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(*this, Target,
66016714
*HostTC, Args);
66026715
break;

clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,9 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
4646
Action::OffloadKind DeviceOffloadingKind) const {
4747
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
4848

49-
assert(DeviceOffloadingKind == Action::OFK_OpenMP &&
50-
"Only OpenMP offloading kinds are supported.");
49+
assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
50+
DeviceOffloadingKind == Action::OFK_SYCL) &&
51+
"Only OpenMP or SYCL offloading kinds are supported.");
5152

5253
if (!DriverArgs.hasFlag(options::OPT_offloadlib, options::OPT_no_offloadlib,
5354
true))

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5091,6 +5091,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
50915091
InputInfoList HostOffloadingInputs;
50925092
const InputInfo *CudaDeviceInput = nullptr;
50935093
const InputInfo *OpenMPDeviceInput = nullptr;
5094+
const InputInfo *SYCLDeviceInput = nullptr;
50945095
for (const InputInfo &I : Inputs) {
50955096
if (&I == &Input || I.getType() == types::TY_Nothing) {
50965097
// This is the primary input or contains nothing.
@@ -5108,13 +5109,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
51085109
CudaDeviceInput = &I;
51095110
} else if (IsOpenMPDevice && !OpenMPDeviceInput) {
51105111
OpenMPDeviceInput = &I;
5112+
} else if (IsSYCL && !SYCLDeviceInput) {
5113+
SYCLDeviceInput = &I;
51115114
} else {
51125115
llvm_unreachable("unexpectedly given multiple inputs");
51135116
}
51145117
}
51155118

51165119
const llvm::Triple *AuxTriple =
5117-
(IsCuda || IsHIP) ? TC.getAuxTriple() : nullptr;
5120+
(IsSYCL || IsCuda || IsHIP) ? TC.getAuxTriple() : nullptr;
51185121
bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment();
51195122
bool IsUEFI = RawTriple.isUEFI();
51205123
bool IsIAMCU = RawTriple.isOSIAMCU();
@@ -5208,6 +5211,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
52085211

52095212
if (IsSYCL) {
52105213
if (IsSYCLDevice) {
5214+
if (Triple.isNVPTX()) {
5215+
StringRef GPUArchName = JA.getOffloadingArch();
5216+
// TODO: Once default arch is moved to at least SM_53, empty arch should
5217+
// also result in the flag added.
5218+
if (!GPUArchName.empty() &&
5219+
StringToOffloadArch(GPUArchName) >= OffloadArch::SM_53)
5220+
CmdArgs.push_back("-fnative-half-type");
5221+
}
52115222
// Host triple is needed when doing SYCL device compilations.
52125223
llvm::Triple AuxT = C.getDefaultToolChain().getTriple();
52135224
std::string NormalizedTriple = AuxT.normalize();
@@ -5220,13 +5231,45 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
52205231
// Set O2 optimization level by default
52215232
if (!Args.getLastArg(options::OPT_O_Group))
52225233
CmdArgs.push_back("-O2");
5234+
// Add any predefined macros associated with intel_gpu* type targets
5235+
// passed in with -fsycl-targets
5236+
// TODO: Macros are populated during device compilations and saved for
5237+
// addition to the host compilation. There is no dependence connection
5238+
// between device and host where we should be able to use the offloading
5239+
// arch to add the macro to the host compile.
5240+
auto addTargetMacros = [&](const llvm::Triple &Triple) {
5241+
if (!Triple.isSPIR() && !Triple.isNVPTX() && !Triple.isAMDGCN())
5242+
return;
5243+
SmallString<64> Macro;
5244+
if ((Triple.isSPIR() &&
5245+
Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen) ||
5246+
Triple.isNVPTX() || Triple.isAMDGCN()) {
5247+
StringRef Device = JA.getOffloadingArch();
5248+
if (!Device.empty() &&
5249+
!clang::driver::getGenDeviceMacro(Device).empty()) {
5250+
Macro = "-D";
5251+
Macro += clang::driver::getGenDeviceMacro(Device);
5252+
}
5253+
} else if (Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64)
5254+
Macro = "-D__SYCL_TARGET_INTEL_X86_64__";
5255+
if (Macro.size()) {
5256+
CmdArgs.push_back(Args.MakeArgString(Macro));
5257+
D.addSYCLTargetMacroArg(Args, Macro);
5258+
}
5259+
};
5260+
addTargetMacros(RawTriple);
52235261
} else {
52245262
// Add any options that are needed specific to SYCL offload while
52255263
// performing the host side compilation.
52265264

52275265
// Let the front-end host compilation flow know about SYCL offload
52285266
// compilation.
52295267
CmdArgs.push_back("-fsycl-is-host");
5268+
5269+
// Add the SYCL target macro arguments that were generated during the
5270+
// device compilation step.
5271+
for (auto &Macro : D.getSYCLTargetMacroArgs())
5272+
CmdArgs.push_back(Args.MakeArgString(Macro));
52305273
}
52315274

52325275
// Set options for both host and device.

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -848,8 +848,8 @@ void CudaToolChain::addClangTargetOptions(
848848

849849
StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
850850
assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
851-
DeviceOffloadingKind == Action::OFK_Cuda) &&
852-
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
851+
DeviceOffloadingKind == Action::OFK_Cuda || DeviceOffloadingKind == Action::OFK_SYCL) &&
852+
"Only OpenMP or CUDA or SYCL offloading kinds are supported for NVIDIA GPUs.");
853853

854854
CC1Args.append({"-fcuda-is-device", "-mllvm",
855855
"-enable-memcpyopt-without-libcalls",

0 commit comments

Comments
 (0)