Skip to content

Commit 31d790d

Browse files
committed
Merge branch 'sycl' into cperkins-kernel_compiler-sycl-cache
2 parents 9841c43 + 7e2d615 commit 31d790d

File tree

50 files changed

+1151
-242
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1151
-242
lines changed

.github/workflows/pr-code-format.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ on:
77
pull_request:
88
branches:
99
- main
10+
- sycl
11+
- sycl-devops-pr/**
12+
- sycl-rel-**
1013
- 'users/**'
1114

1215
jobs:

.github/workflows/sycl-linux-build.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,12 @@ jobs:
212212
if: always() && !cancelled() && contains(inputs.changes, 'libdevice')
213213
run: |
214214
cmake --build $GITHUB_WORKSPACE/build --target check-libdevice
215+
- name: Check E2E test requirements
216+
if: always() && !cancelled() && !contains(inputs.changes, 'sycl')
217+
run: |
218+
# TODO consider moving this to Dockerfile.
219+
export LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
220+
LIT_OPTS="--allow-empty-runs" LIT_FILTER="e2e_test_requirements" cmake --build $GITHUB_WORKSPACE/build --target check-sycl
215221
- name: Install
216222
if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }}
217223
# TODO replace utility installation with a single CMake target

buildbot/configure.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def do_configure(args):
6464

6565
sycl_enable_xpti_tracing = "ON"
6666
xpti_enable_werror = "OFF"
67-
llvm_enable_zstd = "OFF"
67+
llvm_enable_zstd = "ON"
6868

6969
if sys.platform != "darwin":
7070
sycl_enabled_backends.append("level_zero")
@@ -134,8 +134,6 @@ def do_configure(args):
134134

135135
# For clang-format, clang-tidy and code coverage
136136
llvm_enable_projects += ";clang-tools-extra;compiler-rt"
137-
# Build with zstd enabled on CI.
138-
llvm_enable_zstd = "ON"
139137
if sys.platform != "darwin":
140138
# libclc is required for CI validation
141139
libclc_enabled = True

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,10 @@ def err_drv_sycl_missing_amdgpu_arch : Error<
398398
"missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend%select{|=%1}0 --offload-arch=<arch-name>'">;
399399
def err_drv_sycl_thinlto_split_off: Error<
400400
"'%0' is not supported when '%1' is set with '-fsycl'">;
401+
def err_drv_sycl_offload_arch_new_driver: Error<
402+
"'--offload-arch' is supported when '-fsycl' is set with '--offload-new-driver'">;
403+
def err_drv_sycl_offload_arch_missing_value : Error<
404+
"must pass in an explicit cpu or gpu architecture to '--offload-arch'">;
401405
def warn_drv_sycl_offload_target_duplicate : Warning<
402406
"SYCL offloading target '%0' is similar to target '%1' already specified; "
403407
"will be ignored">, InGroup<SyclTarget>;

clang/include/clang/Driver/Action.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,14 @@ class OffloadWrapperJobAction : public JobAction {
698698
// Get the compilation step setting.
699699
bool getCompileStep() const {
  // Report the current compilation step setting.
  return CompileStep;
}
700700

701+
// Set the individual wrapping setting. This is used to tell the wrapper job
702+
// action that the wrapping (and subsequent compile step) should be done
703+
// with for-each instead of using -batch.
704+
void setWrapIndividualFiles() {
  // Request individual (for-each) wrapping rather than -batch wrapping.
  WrapIndividualFiles = true;
}
705+
706+
// Get the individual wrapping setting.
707+
bool getWrapIndividualFiles() const {
  // Report whether individual wrapping was requested.
  return WrapIndividualFiles;
}
708+
701709
// Set the offload kind for the current wrapping job action. Default usage
702710
// is to use the kind of the current toolchain.
703711
void setOffloadKind(OffloadKind SetKind) {
  // Record the offload kind to use for this wrapping job action.
  Kind = SetKind;
}
@@ -707,6 +715,7 @@ class OffloadWrapperJobAction : public JobAction {
707715

708716
private:
709717
bool CompileStep = true;
718+
bool WrapIndividualFiles = false;
710719
OffloadKind Kind = OFK_None;
711720
};
712721

clang/lib/Driver/Driver.cpp

Lines changed: 166 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,12 +1191,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
11911191
llvm::StringMap<llvm::DenseSet<StringRef>> DerivedArchs;
11921192
llvm::StringMap<StringRef> FoundNormalizedTriples;
11931193
llvm::SmallVector<llvm::Triple, 4> UniqueSYCLTriplesVec;
1194+
// StringSet to contain SYCL target triples.
1195+
llvm::StringSet<> SYCLTriples;
11941196
if (HasSYCLTargetsOption) {
11951197
// At this point, we know we have a valid combination
11961198
// of -fsycl*target options passed
11971199
Arg *SYCLTargetsValues = SYCLTargets;
11981200
if (SYCLTargetsValues) {
1199-
llvm::StringSet<> SYCLTriples;
12001201
if (SYCLTargetsValues->getNumValues()) {
12011202

12021203
// Multiple targets are currently not supported when using
@@ -1296,6 +1297,109 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
12961297
Diag(clang::diag::warn_drv_empty_joined_argument)
12971298
<< SYCLTargetsValues->getAsString(C.getInputArgs());
12981299
}
1300+
}
1301+
// If the user specified --offload-arch, deduce the offloading
1302+
// target triple(s) from the set of architecture(s).
1303+
// Create a toolchain for each valid triple.
1304+
// We do not support SYCL offloading if any of the inputs is a
1305+
// .cu (for CUDA type) or .hip (for HIP type) file.
1306+
else if (HasValidSYCLRuntime &&
1307+
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && !IsHIP &&
1308+
!IsCuda) {
1309+
// SYCL offloading to AOT Targets with '--offload-arch'
1310+
// is currently enabled only with '--offload-new-driver' option.
1311+
// Emit a diagnostic if '--offload-arch' is invoked without
1312+
// '--offload-new-driver' option.
1313+
if (!C.getInputArgs().hasFlag(options::OPT_offload_new_driver,
1314+
options::OPT_no_offload_new_driver, false)) {
1315+
Diag(clang::diag::err_drv_sycl_offload_arch_new_driver);
1316+
return;
1317+
}
1318+
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
1319+
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
1320+
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
1321+
HostTC->getTriple());
1322+
1323+
// Attempt to deduce the offloading triple from the set of architectures.
1324+
// We need to temporarily create these toolchains so that we can access
1325+
// tools for inferring architectures.
1326+
llvm::DenseSet<StringRef> Archs;
1327+
if (NVPTXTriple) {
1328+
auto TempTC = std::make_unique<toolchains::CudaToolChain>(
1329+
*this, *NVPTXTriple, *HostTC, C.getInputArgs(), Action::OFK_None);
1330+
for (StringRef Arch :
1331+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1332+
Archs.insert(Arch);
1333+
}
1334+
if (AMDTriple) {
1335+
auto TempTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
1336+
*this, *AMDTriple, *HostTC, C.getInputArgs());
1337+
for (StringRef Arch :
1338+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &*TempTC, true))
1339+
Archs.insert(Arch);
1340+
}
1341+
if (!AMDTriple && !NVPTXTriple) {
1342+
for (StringRef Arch :
1343+
getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, nullptr, true))
1344+
Archs.insert(Arch);
1345+
}
1346+
for (StringRef Arch : Archs) {
1347+
if (NVPTXTriple && IsSYCLSupportedNVidiaGPUArch(StringToOffloadArch(
1348+
getProcessorFromTargetID(*NVPTXTriple, Arch)))) {
1349+
DerivedArchs[NVPTXTriple->getTriple()].insert(Arch);
1350+
} else if (AMDTriple &&
1351+
IsSYCLSupportedAMDGPUArch(StringToOffloadArch(
1352+
getProcessorFromTargetID(*AMDTriple, Arch)))) {
1353+
DerivedArchs[AMDTriple->getTriple()].insert(Arch);
1354+
} else if (IsSYCLSupportedIntelCPUArch(StringToOffloadArchSYCL(Arch))) {
1355+
DerivedArchs[MakeSYCLDeviceTriple("spir64_x86_64").getTriple()].insert(
1356+
Arch);
1357+
} else if (IsSYCLSupportedIntelGPUArch(StringToOffloadArchSYCL(Arch))) {
1358+
StringRef IntelGPUArch;
1359+
// For Intel Graphics AOT target, valid values for '--offload-arch'
1360+
// are mapped to valid device names accepted by OCLOC (the Intel GPU AOT
1361+
// compiler) via the '-device' option. The mapIntelGPUArchName
1362+
// function maps the accepted values for '--offload-arch' to enable SYCL
1363+
// offloading to Intel GPUs and the corresponding '-device' value passed
1364+
// to OCLOC.
1365+
IntelGPUArch = mapIntelGPUArchName(Arch).data();
1366+
DerivedArchs[MakeSYCLDeviceTriple("spir64_gen").getTriple()].insert(
1367+
IntelGPUArch);
1368+
} else {
1369+
Diag(clang::diag::err_drv_invalid_sycl_target) << Arch;
1370+
return;
1371+
}
1372+
}
1373+
// Emit an error if architecture value is not provided
1374+
// to --offload-arch.
1375+
if (Archs.empty()) {
1376+
Diag(clang::diag::err_drv_sycl_offload_arch_missing_value);
1377+
return;
1378+
}
1379+
1380+
for (const auto &TripleAndArchs : DerivedArchs)
1381+
SYCLTriples.insert(TripleAndArchs.first());
1382+
1383+
for (const auto &Val : SYCLTriples) {
1384+
llvm::Triple SYCLTargetTriple(MakeSYCLDeviceTriple(Val.getKey()));
1385+
std::string NormalizedName = SYCLTargetTriple.normalize();
1386+
1387+
// Make sure we don't have a duplicate triple.
1388+
auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
1389+
if (Duplicate != FoundNormalizedTriples.end()) {
1390+
Diag(clang::diag::warn_drv_sycl_offload_target_duplicate)
1391+
<< Val.getKey() << Duplicate->second;
1392+
continue;
1393+
}
1394+
1395+
// Store the current triple so that we can check for duplicates in the
1396+
// following iterations.
1397+
FoundNormalizedTriples[NormalizedName] = Val.getKey();
1398+
UniqueSYCLTriplesVec.push_back(SYCLTargetTriple);
1399+
}
1400+
1401+
addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
1402+
12991403
} else {
13001404
// If -fsycl is supplied without -fsycl-targets we will assume SPIR-V.
13011405
// For -fsycl-device-only, we also setup the implied triple as needed.
@@ -5432,9 +5536,58 @@ class OffloadingActionBuilder final {
54325536
BundlingActions, types::TY_Object);
54335537
if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
54345538
OWA->setOffloadKind(Action::OFK_Host);
5539+
// The Backend compilation step performed here is being done for
5540+
// creating FPGA archives. The possible split binaries after
5541+
// sycl-post-link need to be individually wrapped as opposed to
5542+
// being passed into the clang-offload-wrapper via a table and
5543+
// using the -batch option - effectively creating a single
5544+
// binary. The resulting archive created from -fsycl-link should
5545+
// not contain the singular binary, but should be individual
5546+
// binaries to be consumed later by either the -fsycl-link=image
5547+
// device compilation step or being linked into the final exe.
5548+
//
5549+
// Typical compile flow:
5550+
// .bc
5551+
// |
5552+
// sycl-post-link -split=kernel
5553+
// |
5554+
// +--------+--------+
5555+
// | | |
5556+
// split1 split2 split3
5557+
// | | |
5558+
// llvm-spirv llvm-spirv llvm-spirv
5559+
// | | |
5560+
// ocloc ocloc ocloc
5561+
// | | |
5562+
// +--------+--------+
5563+
// |
5564+
// clang-offload-wrapper -batch
5565+
// |
5566+
// .o
5567+
//
5568+
// Individual wrap compile flow:
5569+
// .bc
5570+
// |
5571+
// sycl-post-link -split=kernel
5572+
// |
5573+
// +--------+--------+
5574+
// | | |
5575+
// split1 split2 split3
5576+
// | | |
5577+
// llvm-spirv llvm-spirv llvm-spirv
5578+
// | | |
5579+
// ocloc ocloc ocloc
5580+
// | | |
5581+
// wrap wrap wrap
5582+
// | | |
5583+
// .o .o .o
5584+
//
54355585
Action *CompiledDeviceAction =
5436-
C.MakeAction<OffloadWrapperJobAction>(WrapperItems,
5437-
types::TY_Object);
5586+
C.MakeAction<OffloadWrapperJobAction>(FPGAAOTAction,
5587+
types::TY_Tempfilelist);
5588+
if (auto *OWA =
5589+
dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
5590+
OWA->setWrapIndividualFiles();
54385591
addDeps(CompiledDeviceAction, TC, BoundArch);
54395592
}
54405593
addDeps(DeviceAction, TC, BoundArch);
@@ -5708,6 +5861,9 @@ class OffloadingActionBuilder final {
57085861
};
57095862

57105863
Action *ExtractIRFilesAction = createExtractIRFilesAction();
5864+
// Device binaries that are individually wrapped when creating an
5865+
// FPGA Archive.
5866+
ActionList FPGAArchiveWrapperInputs;
57115867

57125868
if (IsNVPTX || IsAMDGCN) {
57135869
JobAction *FinAction =
@@ -5793,6 +5949,7 @@ class OffloadingActionBuilder final {
57935949
FileTableTformJobAction::COL_CODE,
57945950
FileTableTformJobAction::COL_CODE);
57955951
WrapperInputs.push_back(ReplaceFilesAction);
5952+
FPGAArchiveWrapperInputs.push_back(BuildCodeAction);
57965953
}
57975954
if (SkipWrapper) {
57985955
// Wrapper step not requested.
@@ -5827,8 +5984,11 @@ class OffloadingActionBuilder final {
58275984
if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
58285985
OWA->setOffloadKind(Action::OFK_Host);
58295986
Action *CompiledDeviceAction =
5830-
C.MakeAction<OffloadWrapperJobAction>(WrapperInputs,
5831-
types::TY_Object);
5987+
C.MakeAction<OffloadWrapperJobAction>(
5988+
FPGAArchiveWrapperInputs, types::TY_Tempfilelist);
5989+
if (auto *OWA =
5990+
dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
5991+
OWA->setWrapIndividualFiles();
58325992
addDeps(CompiledDeviceAction, TC, nullptr);
58335993
}
58345994
addDeps(DeviceAction, TC, nullptr);
@@ -6357,7 +6517,7 @@ class OffloadingActionBuilder final {
63576517
if (GpuInitHasErrors)
63586518
return true;
63596519

6360-
int GenIndex = 0;
6520+
size_t GenIndex = 0;
63616521
// Fill SYCLTargetInfoList
63626522
for (auto &TT : SYCLTripleList) {
63636523
auto TCIt = llvm::find_if(

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10263,8 +10263,18 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1026310263
const InputInfo &I = Inputs[0];
1026410264
assert(I.isFilename() && "Invalid input.");
1026510265

10266-
if (I.getType() == types::TY_Tempfiletable ||
10267-
I.getType() == types::TY_Tempfilelist || IsEmbeddedIR)
10266+
// TODO: The embedded compilation step after the wrapping step restricts
10267+
// the ability to control the 'for each' methodology used when performing
10268+
// device code splitting. We set the individual wrap behavior when we know
10269+
// the wrapping and compile step should be done individually. Ideally this
10270+
// would be controlled at the JobAction creation, but we cannot do that
10271+
// until the compilation of the wrap is its own JobAction.
10272+
bool IndividualWrapCompile = WrapperJob.getWrapIndividualFiles();
10273+
const InputInfo TempOutput(types::TY_LLVM_BC, WrapperFileName,
10274+
WrapperFileName);
10275+
if (!IndividualWrapCompile &&
10276+
(I.getType() == types::TY_Tempfiletable ||
10277+
I.getType() == types::TY_Tempfilelist || IsEmbeddedIR))
1026810278
// Input files are passed via the batch job file table.
1026910279
WrapperArgs.push_back(C.getArgs().MakeArgString("-batch"));
1027010280
WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename()));
@@ -10273,7 +10283,17 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1027310283
JA, *this, ResponseFileSupport::None(),
1027410284
TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
1027510285
WrapperArgs, std::nullopt);
10276-
C.addCommand(std::move(Cmd));
10286+
10287+
if (IndividualWrapCompile) {
10288+
// When wrapping FPGA device binaries for FPGA archives, create individual
10289+
// wrapped and compiled entries for the archive.
10290+
StringRef ParallelJobs =
10291+
C.getArgs().getLastArgValue(options::OPT_fsycl_max_parallel_jobs_EQ);
10292+
clang::driver::tools::SYCL::constructLLVMForeachCommand(
10293+
C, JA, std::move(Cmd), Inputs, TempOutput, this, "", "bc",
10294+
ParallelJobs);
10295+
} else
10296+
C.addCommand(std::move(Cmd));
1027710297

1027810298
if (WrapperCompileEnabled) {
1027910299
// TODO Use TC.SelectTool().
@@ -10296,9 +10316,19 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
1029610316
SmallString<128> ClangPath(C.getDriver().Dir);
1029710317
llvm::sys::path::append(ClangPath, "clang");
1029810318
const char *Clang = C.getArgs().MakeArgString(ClangPath);
10299-
C.addCommand(std::make_unique<Command>(JA, *this,
10300-
ResponseFileSupport::None(), Clang,
10301-
ClangArgs, std::nullopt));
10319+
auto PostWrapCompileCmd =
10320+
std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
10321+
Clang, ClangArgs, std::nullopt);
10322+
if (IndividualWrapCompile) {
10323+
StringRef ParallelJobs = C.getArgs().getLastArgValue(
10324+
options::OPT_fsycl_max_parallel_jobs_EQ);
10325+
InputInfoList Inputs;
10326+
Inputs.push_back(TempOutput);
10327+
clang::driver::tools::SYCL::constructLLVMForeachCommand(
10328+
C, JA, std::move(PostWrapCompileCmd), Inputs, Output, this, "",
10329+
"bc", ParallelJobs);
10330+
} else
10331+
C.addCommand(std::move(PostWrapCompileCmd));
1030210332
}
1030310333
return;
1030410334
} // end of SYCL flavor of offload wrapper command creation

0 commit comments

Comments
 (0)