@@ -1191,12 +1191,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
11911191 llvm::StringMap<llvm::DenseSet<StringRef>> DerivedArchs;
11921192 llvm::StringMap<StringRef> FoundNormalizedTriples;
11931193 llvm::SmallVector<llvm::Triple, 4 > UniqueSYCLTriplesVec;
1194+ // StringSet to contain SYCL target triples.
1195+ llvm::StringSet<> SYCLTriples;
11941196 if (HasSYCLTargetsOption) {
11951197 // At this point, we know we have a valid combination
11961198 // of -fsycl*target options passed
11971199 Arg *SYCLTargetsValues = SYCLTargets;
11981200 if (SYCLTargetsValues) {
1199- llvm::StringSet<> SYCLTriples;
12001201 if (SYCLTargetsValues->getNumValues ()) {
12011202
12021203 // Multiple targets are currently not supported when using
@@ -1296,6 +1297,109 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
12961297 Diag (clang::diag::warn_drv_empty_joined_argument)
12971298 << SYCLTargetsValues->getAsString (C.getInputArgs ());
12981299 }
1300+ }
1301+ // If the user specified --offload-arch, deduce the offloading
1302+ // target triple(s) from the set of architecture(s).
1303+ // Create a toolchain for each valid triple.
1304+ // We do not support SYCL offloading if any of the inputs is a
1305+ // .cu (for CUDA type) or .hip (for HIP type) file.
1306+ else if (HasValidSYCLRuntime &&
1307+ C.getInputArgs ().hasArg (options::OPT_offload_arch_EQ) && !IsHIP &&
1308+ !IsCuda) {
1309+ // SYCL offloading to AOT Targets with '--offload-arch'
1310+ // is currently enabled only with '--offload-new-driver' option.
1311+ // Emit a diagnostic if '--offload-arch' is invoked without
1312+ // '--offload-new-driver' option.
1313+ if (!C.getInputArgs ().hasFlag (options::OPT_offload_new_driver,
1314+ options::OPT_no_offload_new_driver, false )) {
1315+ Diag (clang::diag::err_drv_sycl_offload_arch_new_driver);
1316+ return ;
1317+ }
1318+ const ToolChain *HostTC = C.getSingleOffloadToolChain <Action::OFK_Host>();
1319+ auto AMDTriple = getHIPOffloadTargetTriple (*this , C.getInputArgs ());
1320+ auto NVPTXTriple = getNVIDIAOffloadTargetTriple (*this , C.getInputArgs (),
1321+ HostTC->getTriple ());
1322+
1323+ // Attempt to deduce the offloading triple from the set of architectures.
1324+ // We need to temporarily create these toolchains so that we can access
1325+ // tools for inferring architectures.
1326+ llvm::DenseSet<StringRef> Archs;
1327+ if (NVPTXTriple) {
1328+ auto TempTC = std::make_unique<toolchains::CudaToolChain>(
1329+ *this , *NVPTXTriple, *HostTC, C.getInputArgs (), Action::OFK_None);
1330+ for (StringRef Arch :
1331+ getOffloadArchs (C, C.getArgs (), Action::OFK_SYCL, &*TempTC, true ))
1332+ Archs.insert (Arch);
1333+ }
1334+ if (AMDTriple) {
1335+ auto TempTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
1336+ *this , *AMDTriple, *HostTC, C.getInputArgs ());
1337+ for (StringRef Arch :
1338+ getOffloadArchs (C, C.getArgs (), Action::OFK_SYCL, &*TempTC, true ))
1339+ Archs.insert (Arch);
1340+ }
1341+ if (!AMDTriple && !NVPTXTriple) {
1342+ for (StringRef Arch :
1343+ getOffloadArchs (C, C.getArgs (), Action::OFK_SYCL, nullptr , true ))
1344+ Archs.insert (Arch);
1345+ }
1346+ for (StringRef Arch : Archs) {
1347+ if (NVPTXTriple && IsSYCLSupportedNVidiaGPUArch (StringToOffloadArch (
1348+ getProcessorFromTargetID (*NVPTXTriple, Arch)))) {
1349+ DerivedArchs[NVPTXTriple->getTriple ()].insert (Arch);
1350+ } else if (AMDTriple &&
1351+ IsSYCLSupportedAMDGPUArch (StringToOffloadArch (
1352+ getProcessorFromTargetID (*AMDTriple, Arch)))) {
1353+ DerivedArchs[AMDTriple->getTriple ()].insert (Arch);
1354+ } else if (IsSYCLSupportedIntelCPUArch (StringToOffloadArchSYCL (Arch))) {
1355+ DerivedArchs[MakeSYCLDeviceTriple (" spir64_x86_64" ).getTriple ()].insert (
1356+ Arch);
1357+ } else if (IsSYCLSupportedIntelGPUArch (StringToOffloadArchSYCL (Arch))) {
1358+ StringRef IntelGPUArch;
1359+ // For Intel Graphics AOT target, valid values for '--offload-arch'
1360+ // are mapped to valid device names accepted by OCLOC (the Intel GPU AOT
1361+ // compiler) via the '-device' option. The mapIntelGPUArchName
1362+ // function maps the accepted values for '--offload-arch' to enable SYCL
1363+ // offloading to Intel GPUs and the corresponding '-device' value passed
1364+ // to OCLOC.
1365+ IntelGPUArch = mapIntelGPUArchName (Arch).data ();
1366+ DerivedArchs[MakeSYCLDeviceTriple (" spir64_gen" ).getTriple ()].insert (
1367+ IntelGPUArch);
1368+ } else {
1369+ Diag (clang::diag::err_drv_invalid_sycl_target) << Arch;
1370+ return ;
1371+ }
1372+ }
1373+ // Emit an error if no architecture value is provided
1374+ // to --offload-arch.
1375+ if (Archs.empty ()) {
1376+ Diag (clang::diag::err_drv_sycl_offload_arch_missing_value);
1377+ return ;
1378+ }
1379+
1380+ for (const auto &TripleAndArchs : DerivedArchs)
1381+ SYCLTriples.insert (TripleAndArchs.first ());
1382+
1383+ for (const auto &Val : SYCLTriples) {
1384+ llvm::Triple SYCLTargetTriple (MakeSYCLDeviceTriple (Val.getKey ()));
1385+ std::string NormalizedName = SYCLTargetTriple.normalize ();
1386+
1387+ // Make sure we don't have a duplicate triple.
1388+ auto Duplicate = FoundNormalizedTriples.find (NormalizedName);
1389+ if (Duplicate != FoundNormalizedTriples.end ()) {
1390+ Diag (clang::diag::warn_drv_sycl_offload_target_duplicate)
1391+ << Val.getKey () << Duplicate->second ;
1392+ continue ;
1393+ }
1394+
1395+ // Store the current triple so that we can check for duplicates in the
1396+ // following iterations.
1397+ FoundNormalizedTriples[NormalizedName] = Val.getKey ();
1398+ UniqueSYCLTriplesVec.push_back (SYCLTargetTriple);
1399+ }
1400+
1401+ addSYCLDefaultTriple (C, UniqueSYCLTriplesVec);
1402+
12991403 } else {
13001404 // If -fsycl is supplied without -fsycl-targets we will assume SPIR-V.
13011405 // For -fsycl-device-only, we also setup the implied triple as needed.
@@ -5432,9 +5536,58 @@ class OffloadingActionBuilder final {
54325536 BundlingActions, types::TY_Object);
54335537 if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
54345538 OWA->setOffloadKind (Action::OFK_Host);
5539+ // The Backend compilation step performed here is being done for
5540+ // creating FPGA archives. The possible split binaries after
5541+ // sycl-post-link need to be individually wrapped as opposed to
5542+ // being passed into the clang-offload-wrapper via a table and
5543+ // using the -batch option - effectively creating a single
5544+ // binary. The resulting archive created from -fsycl-link should
5545+ // not contain the singular binary, but should be individual
5546+ // binaries to be consumed later by either the -fsycl-link=image
5547+ // device compilation step or being linked into the final exe.
5548+ //
5549+ // Typical compile flow:
5550+ // .bc
5551+ // |
5552+ // sycl-post-link -split=kernel
5553+ // |
5554+ // +--------+--------+
5555+ // | | |
5556+ // split1 split2 split3
5557+ // | | |
5558+ // llvm-spirv llvm-spirv llvm-spirv
5559+ // | | |
5560+ // ocloc ocloc ocloc
5561+ // | | |
5562+ // +--------+--------+
5563+ // |
5564+ // clang-offload-wrapper -batch
5565+ // |
5566+ // .o
5567+ //
5568+ // Individual wrap compile flow:
5569+ // .bc
5570+ // |
5571+ // sycl-post-link -split=kernel
5572+ // |
5573+ // +--------+--------+
5574+ // | | |
5575+ // split1 split2 split3
5576+ // | | |
5577+ // llvm-spirv llvm-spirv llvm-spirv
5578+ // | | |
5579+ // ocloc ocloc ocloc
5580+ // | | |
5581+ // wrap wrap wrap
5582+ // | | |
5583+ // .o .o .o
5584+ //
54355585 Action *CompiledDeviceAction =
5436- C.MakeAction <OffloadWrapperJobAction>(WrapperItems,
5437- types::TY_Object);
5586+ C.MakeAction <OffloadWrapperJobAction>(FPGAAOTAction,
5587+ types::TY_Tempfilelist);
5588+ if (auto *OWA =
5589+ dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
5590+ OWA->setWrapIndividualFiles ();
54385591 addDeps (CompiledDeviceAction, TC, BoundArch);
54395592 }
54405593 addDeps (DeviceAction, TC, BoundArch);
@@ -5708,6 +5861,9 @@ class OffloadingActionBuilder final {
57085861 };
57095862
57105863 Action *ExtractIRFilesAction = createExtractIRFilesAction ();
5864+ // Device binaries that are individually wrapped when creating an
5865+ // FPGA Archive.
5866+ ActionList FPGAArchiveWrapperInputs;
57115867
57125868 if (IsNVPTX || IsAMDGCN) {
57135869 JobAction *FinAction =
@@ -5793,6 +5949,7 @@ class OffloadingActionBuilder final {
57935949 FileTableTformJobAction::COL_CODE,
57945950 FileTableTformJobAction::COL_CODE);
57955951 WrapperInputs.push_back (ReplaceFilesAction);
5952+ FPGAArchiveWrapperInputs.push_back (BuildCodeAction);
57965953 }
57975954 if (SkipWrapper) {
57985955 // Wrapper step not requested.
@@ -5827,8 +5984,11 @@ class OffloadingActionBuilder final {
58275984 if (auto *OWA = dyn_cast<OffloadWrapperJobAction>(DeviceAction))
58285985 OWA->setOffloadKind (Action::OFK_Host);
58295986 Action *CompiledDeviceAction =
5830- C.MakeAction <OffloadWrapperJobAction>(WrapperInputs,
5831- types::TY_Object);
5987+ C.MakeAction <OffloadWrapperJobAction>(
5988+ FPGAArchiveWrapperInputs, types::TY_Tempfilelist);
5989+ if (auto *OWA =
5990+ dyn_cast<OffloadWrapperJobAction>(CompiledDeviceAction))
5991+ OWA->setWrapIndividualFiles ();
58325992 addDeps (CompiledDeviceAction, TC, nullptr );
58335993 }
58345994 addDeps (DeviceAction, TC, nullptr );
@@ -6357,7 +6517,7 @@ class OffloadingActionBuilder final {
63576517 if (GpuInitHasErrors)
63586518 return true ;
63596519
6360- int GenIndex = 0 ;
6520+ size_t GenIndex = 0 ;
63616521 // Fill SYCLTargetInfoList
63626522 for (auto &TT : SYCLTripleList) {
63636523 auto TCIt = llvm::find_if (
0 commit comments