Skip to content

Commit f7e72fc

Browse files
committed
Merge remote-tracking branch 'upstream/sycl' into sycl-web
2 parents 3e75635 + 3f56c58 commit f7e72fc

33 files changed

+978
-107
lines changed

.github/workflows/pr-code-format.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ on:
77
pull_request:
88
branches:
99
- main
10+
- sycl
11+
- sycl-devops-pr/**
12+
- sycl-rel-**
1013
- 'users/**'
1114

1215
jobs:

buildbot/configure.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def do_configure(args):
6464

6565
sycl_enable_xpti_tracing = "ON"
6666
xpti_enable_werror = "OFF"
67-
llvm_enable_zstd = "OFF"
67+
llvm_enable_zstd = "ON"
6868

6969
if sys.platform != "darwin":
7070
sycl_enabled_backends.append("level_zero")
@@ -134,8 +134,6 @@ def do_configure(args):
134134

135135
# For clang-format, clang-tidy and code coverage
136136
llvm_enable_projects += ";clang-tools-extra;compiler-rt"
137-
# Build with zstd enabled on CI.
138-
llvm_enable_zstd = "ON"
139137
if sys.platform != "darwin":
140138
# libclc is required for CI validation
141139
libclc_enabled = True

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,10 @@ def err_drv_sycl_missing_amdgpu_arch : Error<
401401
"missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend%select{|=%1}0 --offload-arch=<arch-name>'">;
402402
def err_drv_sycl_thinlto_split_off: Error<
403403
"'%0' is not supported when '%1' is set with '-fsycl'">;
404+
def err_drv_sycl_offload_arch_new_driver: Error<
405+
"'--offload-arch' is supported when '-fsycl' is set with '--offload-new-driver'">;
406+
def err_drv_sycl_offload_arch_missing_value : Error<
407+
"must pass in an explicit cpu or gpu architecture to '--offload-arch'">;
404408
def warn_drv_sycl_offload_target_duplicate : Warning<
405409
"SYCL offloading target '%0' is similar to target '%1' already specified; "
406410
"will be ignored">, InGroup<SyclTarget>;

clang/include/clang/Driver/Action.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,14 @@ class OffloadWrapperJobAction : public JobAction {
698698
// Get the compilation step setting.
699699
bool getCompileStep() const { return CompileStep; }
700700

701+
// Set the individual wrapping setting. This is used to tell the wrapper job
702+
// action that the wrapping (and subsequent compile step) should be done
703+
// with for-each instead of using -batch.
704+
void setWrapIndividualFiles() { WrapIndividualFiles = true; }
705+
706+
// Get the individual wrapping setting.
707+
bool getWrapIndividualFiles() const { return WrapIndividualFiles; }
708+
701709
// Set the offload kind for the current wrapping job action. Default usage
702710
// is to use the kind of the current toolchain.
703711
void setOffloadKind(OffloadKind SetKind) { Kind = SetKind; }
@@ -707,6 +715,7 @@ class OffloadWrapperJobAction : public JobAction {
707715

708716
private:
709717
bool CompileStep = true;
718+
bool WrapIndividualFiles = false;
710719
OffloadKind Kind = OFK_None;
711720
};
712721

clang/lib/Driver/Driver.cpp

Lines changed: 165 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,12 +1203,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
12031203
llvm::StringMap<llvm::DenseSet<StringRef>> DerivedArchs;
12041204
llvm::StringMap<StringRef> FoundNormalizedTriples;
12051205
llvm::SmallVector<llvm::Triple, 4> UniqueSYCLTriplesVec;
1206+
// StringSet to contain SYCL target triples.
1207+
llvm::StringSet<> SYCLTriples;
12061208
if (HasSYCLTargetsOption) {
12071209
// At this point, we know we have a valid combination
12081210
// of -fsycl*target options passed
12091211
Arg *SYCLTargetsValues = SYCLTargets;
12101212
if (SYCLTargetsValues) {
1211-
llvm::StringSet<> SYCLTriples;
12121213
if (SYCLTargetsValues->getNumValues()) {
12131214

12141215
// Multiple targets are currently not supported when using
@@ -1308,6 +1309,109 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
13081309
Diag(clang::diag::warn_drv_empty_joined_argument)
13091310
<< SYCLTargetsValues->getAsString(C.getInputArgs());
13101311
}
1312+
}
1313+
// If the user specified --offload-arch, deduce the offloading
1314+
// target triple(s) from the set of architecture(s).
1315+
// Create a toolchain for each valid triple.
1316+
// We do not support SYCL offloading if any of the inputs is a
1317+
// .cu (for CUDA type) or .hip (for HIP type) file.
1318+
else if (HasValidSYCLRuntime &&
1319+
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && !IsHIP &&
1320+
!IsCuda) {
1321+
// SYCL offloading to AOT Targets with '--offload-arch'
1322+
// is currently enabled only with '--offload-new-driver' option.
1323+
// Emit a diagnostic if '--offload-arch' is invoked without
1324+
// '--offload-new-driver' option.
1325+
if (!C.getInputArgs().hasFlag(options::OPT_offload_new_driver,
1326+
options::OPT_no_offload_new_driver, false)) {
1327+
Diag(clang::diag::err_drv_sycl_offload_arch_new_driver);
1328+
return;
1329+
}
1330+
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
1331+
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
1332+
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
1333+
HostTC->getTriple());
1334+
1335+
// Attempt to deduce the offloading triple from the set of architectures.
1336+
// We need to temporarily create these toolchains so that we can access
1337+
// tools for inferring architectures.
1338+
llvm::DenseSet<StringRef> Archs;
1339+
if (NVPTXTriple) {
1340+
auto TempTC = std::make_unique<toolchains::CudaToolChain>(
1341+
*this, *NVPTXTriple, *HostTC, C.getInputArgs(), Action::OFK_None);
1342+
for (StringRef Arch :
1343+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1344+
Archs.insert(Arch);
1345+
}
1346+
if (AMDTriple) {
1347+
auto TempTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
1348+
*this, *AMDTriple, *HostTC, C.getInputArgs());
1349+
for (StringRef Arch :
1350+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1351+
Archs.insert(Arch);
1352+
}
1353+
if (!AMDTriple && !NVPTXTriple) {
1354+
for (StringRef Arch :
1355+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, nullptr, true))
1356+
Archs.insert(Arch);
1357+
}
1358+
for (StringRef Arch : Archs) {
1359+
if (NVPTXTriple && IsSYCLSupportedNVidiaGPUArch(StringToOffloadArch(
1360+
getProcessorFromTargetID(*NVPTXTriple, Arch)))) {
1361+
DerivedArchs[NVPTXTriple->getTriple()].insert(Arch);
1362+
} else if (AMDTriple &&
1363+
IsSYCLSupportedAMDGPUArch(StringToOffloadArch(
1364+
getProcessorFromTargetID(*AMDTriple, Arch)))) {
1365+
DerivedArchs[AMDTriple->getTriple()].insert(Arch);
1366+
} else if (IsSYCLSupportedIntelCPUArch(StringToOffloadArchSYCL(Arch))) {
1367+
DerivedArchs[MakeSYCLDeviceTriple("spir64_x86_64").getTriple()].insert(
1368+
Arch);
1369+
} else if (IsSYCLSupportedIntelGPUArch(StringToOffloadArchSYCL(Arch))) {
1370+
StringRef IntelGPUArch;
1371+
// For Intel Graphics AOT target, valid values for '--offload-arch'
1372+
// are mapped to valid device names accepted by OCLOC (the Intel GPU AOT
1373+
// compiler) via the '-device' option. The mapIntelGPUArchName
1374+
// function maps the accepted values for '--offload-arch' to enable SYCL
1375+
// offloading to Intel GPUs and the corresponding '-device' value passed
1376+
// to OCLOC.
1377+
IntelGPUArch = mapIntelGPUArchName(Arch).data();
1378+
DerivedArchs[MakeSYCLDeviceTriple("spir64_gen").getTriple()].insert(
1379+
IntelGPUArch);
1380+
} else {
1381+
Diag(clang::diag::err_drv_invalid_sycl_target) << Arch;
1382+
return;
1383+
}
1384+
}
1385+
// Emit an error if architecture value is not provided
1386+
// to --offload-arch.
1387+
if (Archs.empty()) {
1388+
Diag(clang::diag::err_drv_sycl_offload_arch_missing_value);
1389+
return;
1390+
}
1391+
1392+
for (const auto &TripleAndArchs : DerivedArchs)
1393+
SYCLTriples.insert(TripleAndArchs.first());
1394+
1395+
for (const auto &Val : SYCLTriples) {
1396+
llvm::Triple SYCLTargetTriple(MakeSYCLDeviceTriple(Val.getKey()));
1397+
std::string NormalizedName = SYCLTargetTriple.normalize();
1398+
1399+
// Make sure we don't have a duplicate triple.
1400+
auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
1401+
if (Duplicate != FoundNormalizedTriples.end()) {
1402+
Diag(clang::diag::warn_drv_sycl_offload_target_duplicate)
1403+
<< Val.getKey() << Duplicate->second;
1404+
continue;
1405+
}
1406+
1407+
// Store the current triple so that we can check for duplicates in the
1408+
// following iterations.
1409+
FoundNormalizedTriples[NormalizedName] = Val.getKey();
1410+
UniqueSYCLTriplesVec.push_back(SYCLTargetTriple);
1411+
}
1412+
1413+
addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
1414+
13111415
} else {
13121416
// If -fsycl is supplied without -fsycl-targets we will assume SPIR-V.
13131417
// For -fsycl-device-only, we also setup the implied triple as needed.
@@ -5455,9 +5559,58 @@ class OffloadingActionBuilder final {
54555559
BundlingActions, types::TY_Object);
54565560
if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
54575561
OWA->setOffloadKind(Action::OFK_Host);
5562+
// The Backend compilation step performed here is being done for
5563+
// creating FPGA archives. The possible split binaries after
5564+
// sycl-post-link need to be individually wrapped as opposed to
5565+
// being passed into the clang-offload-wrapper via a table and
5566+
// using the -batch option - effectively creating a single
5567+
// binary. The resulting archive created from -fsycl-link should
5568+
// not contain the singular binary, but should be individual
5569+
// binaries to be consumed later by either the -fsycl-link=image
5570+
// device compilation step or being linked into the final exe.
5571+
//
5572+
// Typical compile flow:
5573+
// .bc
5574+
// |
5575+
// sycl-post-link -split=kernel
5576+
// |
5577+
// +--------+--------+
5578+
// | | |
5579+
// split1 split2 split3
5580+
// | | |
5581+
// llvm-spirv llvm-spirv llvm-spirv
5582+
// | | |
5583+
// ocloc ocloc ocloc
5584+
// | | |
5585+
// +--------+--------+
5586+
// |
5587+
// clang-offload-wrapper -batch
5588+
// |
5589+
// .o
5590+
//
5591+
// Individual wrap compile flow:
5592+
// .bc
5593+
// |
5594+
// sycl-post-link -split=kernel
5595+
// |
5596+
// +--------+--------+
5597+
// | | |
5598+
// split1 split2 split3
5599+
// | | |
5600+
// llvm-spirv llvm-spirv llvm-spirv
5601+
// | | |
5602+
// ocloc ocloc ocloc
5603+
// | | |
5604+
// wrap wrap wrap
5605+
// | | |
5606+
// .o .o .o
5607+
//
54585608
Action *CompiledDeviceAction =
5459-
C.MakeAction<OffloadWrapperJobAction>(WrapperItems,
5460-
types::TY_Object);
5609+
C.MakeAction<OffloadWrapperJobAction>(FPGAAOTAction,
5610+
types::TY_Tempfilelist);
5611+
if (auto *OWA =
5612+
dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
5613+
OWA->setWrapIndividualFiles();
54615614
addDeps(CompiledDeviceAction, TC, BoundArch);
54625615
}
54635616
addDeps(DeviceAction, TC, BoundArch);
@@ -5731,6 +5884,9 @@ class OffloadingActionBuilder final {
57315884
};
57325885

57335886
Action *ExtractIRFilesAction = createExtractIRFilesAction();
5887+
// Device binaries that are individually wrapped when creating an
5888+
// FPGA Archive.
5889+
ActionList FPGAArchiveWrapperInputs;
57345890

57355891
if (IsNVPTX || IsAMDGCN) {
57365892
JobAction *FinAction =
@@ -5816,6 +5972,7 @@ class OffloadingActionBuilder final {
58165972
FileTableTformJobAction::COL_CODE,
58175973
FileTableTformJobAction::COL_CODE);
58185974
WrapperInputs.push_back(ReplaceFilesAction);
5975+
FPGAArchiveWrapperInputs.push_back(BuildCodeAction);
58195976
}
58205977
if (SkipWrapper) {
58215978
// Wrapper step not requested.
@@ -5850,8 +6007,11 @@ class OffloadingActionBuilder final {
58506007
if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
58516008
OWA->setOffloadKind(Action::OFK_Host);
58526009
Action *CompiledDeviceAction =
5853-
C.MakeAction<OffloadWrapperJobAction>(WrapperInputs,
5854-
types::TY_Object);
6010+
C.MakeAction<OffloadWrapperJobAction>(
6011+
FPGAArchiveWrapperInputs, types::TY_Tempfilelist);
6012+
if (auto *OWA =
6013+
dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
6014+
OWA->setWrapIndividualFiles();
58556015
addDeps(CompiledDeviceAction, TC, nullptr);
58566016
}
58576017
addDeps(DeviceAction, TC, nullptr);

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10255,8 +10255,18 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1025510255
const InputInfo &I = Inputs[0];
1025610256
assert(I.isFilename() && "Invalid input.");
1025710257

10258-
if (I.getType() == types::TY_Tempfiletable ||
10259-
I.getType() == types::TY_Tempfilelist || IsEmbeddedIR)
10258+
// TODO: The embedded compilation step after the wrapping step restricts
10259+
// the ability to control the 'for each' methodology used when performing
10260+
// device code splitting. We set the individual wrap behavior when we know
10261+
// the wrapping and compile step should be done individually. Ideally this
10262+
// would be controlled at the JobAction creation, but we cannot do that
10263+
// until the compilation of the wrap is its own JobAction.
10264+
bool IndividualWrapCompile = WrapperJob.getWrapIndividualFiles();
10265+
const InputInfo TempOutput(types::TY_LLVM_BC, WrapperFileName,
10266+
WrapperFileName);
10267+
if (!IndividualWrapCompile &&
10268+
(I.getType() == types::TY_Tempfiletable ||
10269+
I.getType() == types::TY_Tempfilelist || IsEmbeddedIR))
1026010270
// Input files are passed via the batch job file table.
1026110271
WrapperArgs.push_back(C.getArgs().MakeArgString("-batch"));
1026210272
WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename()));
@@ -10265,7 +10275,17 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1026510275
JA, *this, ResponseFileSupport::None(),
1026610276
TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
1026710277
WrapperArgs, std::nullopt);
10268-
C.addCommand(std::move(Cmd));
10278+
10279+
if (IndividualWrapCompile) {
10280+
// When wrapping FPGA device binaries for FPGA archives, create individual
10281+
// wrapped and compiled entries for the archive.
10282+
StringRef ParallelJobs =
10283+
C.getArgs().getLastArgValue(options::OPT_fsycl_max_parallel_jobs_EQ);
10284+
clang::driver::tools::SYCL::constructLLVMForeachCommand(
10285+
C, JA, std::move(Cmd), Inputs, TempOutput, this, "", "bc",
10286+
ParallelJobs);
10287+
} else
10288+
C.addCommand(std::move(Cmd));
1026910289

1027010290
if (WrapperCompileEnabled) {
1027110291
// TODO Use TC.SelectTool().
@@ -10288,9 +10308,19 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1028810308
SmallString<128> ClangPath(C.getDriver().Dir);
1028910309
llvm::sys::path::append(ClangPath, "clang");
1029010310
const char *Clang = C.getArgs().MakeArgString(ClangPath);
10291-
C.addCommand(std::make_unique<Command>(JA, *this,
10292-
ResponseFileSupport::None(), Clang,
10293-
ClangArgs, std::nullopt));
10311+
auto PostWrapCompileCmd =
10312+
std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
10313+
Clang, ClangArgs, std::nullopt);
10314+
if (IndividualWrapCompile) {
10315+
StringRef ParallelJobs = C.getArgs().getLastArgValue(
10316+
options::OPT_fsycl_max_parallel_jobs_EQ);
10317+
InputInfoList Inputs;
10318+
Inputs.push_back(TempOutput);
10319+
clang::driver::tools::SYCL::constructLLVMForeachCommand(
10320+
C, JA, std::move(PostWrapCompileCmd), Inputs, Output, this, "",
10321+
"bc", ParallelJobs);
10322+
} else
10323+
C.addCommand(std::move(PostWrapCompileCmd));
1029410324
}
1029510325
return;
1029610326
} // end of SYCL flavor of offload wrapper command creation

0 commit comments

Comments
 (0)