diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index 394b0c74d24ed..f5318aecc53a7 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -43,6 +43,7 @@ on: - '.github/workflows/release-binaries.yml' - '.github/workflows/release-binaries-setup-stage/*' - '.github/workflows/release-binaries-save-stage/*' + - 'clang/cmake/caches/Release.cmake' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'dispatch' }} diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp index 90b317527ee41..1ff61bae46b1e 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp @@ -25,6 +25,13 @@ AST_MATCHER_P(DeducedTemplateSpecializationType, refsToTemplatedDecl, return false; } +AST_MATCHER_P(Type, asTagDecl, clang::ast_matchers::internal::Matcher, + DeclMatcher) { + if (const TagDecl *ND = Node.getAsTagDecl()) + return DeclMatcher.matches(*ND, Finder, Builder); + return false; +} + } // namespace // A function that helps to tell whether a TargetDecl in a UsingDecl will be @@ -61,7 +68,8 @@ void UnusedUsingDeclsCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(userDefinedLiteral().bind("used"), this); Finder->addMatcher( loc(elaboratedType(unless(hasQualifier(nestedNameSpecifier())), - hasUnqualifiedDesugaredType(type().bind("usedType")))), + hasUnqualifiedDesugaredType( + type(asTagDecl(tagDecl().bind("used")))))), this); // Cases where we can identify the UsingShadowDecl directly, rather than // just its target. 
@@ -139,12 +147,6 @@ void UnusedUsingDeclsCheck::check(const MatchFinder::MatchResult &Result) { return; } - if (const auto *T = Result.Nodes.getNodeAs("usedType")) { - if (const auto *ND = T->getAsTagDecl()) - RemoveNamedDecl(ND); - return; - } - if (const auto *UsedShadow = Result.Nodes.getNodeAs("usedShadow")) { removeFromFoundDecls(UsedShadow->getTargetDecl()); diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 89eee392837af..6711eb7dc10f8 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1409,6 +1409,9 @@ bool semaCodeComplete(std::unique_ptr Consumer, Clang->getPreprocessorOpts().SingleFileParseMode = CompletingInPreamble; Clang->setCodeCompletionConsumer(Consumer.release()); + if (Input.Preamble.RequiredModules) + Input.Preamble.RequiredModules->adjustHeaderSearchOptions(Clang->getHeaderSearchOpts()); + SyntaxOnlyAction Action; if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0])) { log("BeginSourceFile() failed when running codeComplete for {0}", @@ -2122,7 +2125,7 @@ clang::CodeCompleteOptions CodeCompleteOptions::getClangCompleteOpts() const { // When an is used, Sema is responsible for completing the main file, // the index can provide results from the preamble. // Tell Sema not to deserialize the preamble to look for results. - Result.LoadExternal = !Index; + Result.LoadExternal = ForceLoadPreamble || !Index; Result.IncludeFixIts = IncludeFixIts; return Result; diff --git a/clang-tools-extra/clangd/CodeComplete.h b/clang-tools-extra/clangd/CodeComplete.h index a7c1ae95dcbf4..9bcdeb0227cd4 100644 --- a/clang-tools-extra/clangd/CodeComplete.h +++ b/clang-tools-extra/clangd/CodeComplete.h @@ -52,6 +52,11 @@ struct CodeCompleteOptions { /// For example, private members are usually inaccessible. bool IncludeIneligibleResults = false; + /// Force sema to load decls from preamble even if an index is provided. 
+ /// This is helpful for cases the index can't provide symbols, e.g. with + /// experimental c++20 modules + bool ForceLoadPreamble = false; + /// Combine overloads into a single completion item where possible. /// If none, the implementation may choose an appropriate behavior. /// (In practice, ClangdLSPServer enables bundling if the client claims diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index a76894cf0855f..d1d744a21cfd5 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -75,18 +76,62 @@ bool isPrivateProtoDecl(const NamedDecl &ND) { if (ND.getIdentifier() == nullptr) return false; auto Name = ND.getIdentifier()->getName(); - if (!Name.contains('_')) - return false; - // Nested proto entities (e.g. Message::Nested) have top-level decls - // that shouldn't be used (Message_Nested). Ignore them completely. - // The nested entities are dangling type aliases, we may want to reconsider - // including them in the future. - // For enum constants, SOME_ENUM_CONSTANT is not private and should be - // indexed. Outer_INNER is private. This heuristic relies on naming style, it - // will include OUTER_INNER and exclude some_enum_constant. - // FIXME: the heuristic relies on naming style (i.e. no underscore in - // user-defined names) and can be improved. 
- return (ND.getKind() != Decl::EnumConstant) || llvm::any_of(Name, islower); + // There are some internal helpers like _internal_set_foo(); + if (Name.contains("_internal_")) + return true; + + // https://protobuf.dev/reference/cpp/cpp-generated/#nested-types + // Nested entities (messages/enums) has two names, one at the top-level scope, + // with a mangled name created by prepending all the outer types. These names + // are almost never preferred by the developers, so exclude them from index. + // e.g. + // message Foo { + // message Bar {} + // enum E { A } + // } + // + // yields: + // class Foo_Bar {}; + // enum Foo_E { Foo_E_A }; + // class Foo { + // using Bar = Foo_Bar; + // static constexpr Foo_E A = Foo_E_A; + // }; + + // We get rid of Foo_Bar and Foo_E by discarding any top-level entries with + // `_` in the name. This relies on original message/enum not having `_` in the + // name. Hence might go wrong in certain cases. + if (ND.getDeclContext()->isNamespace()) { + // Strip off some known public suffix helpers for enums, rest of the helpers + // are generated inside record decls so we don't care. + // https://protobuf.dev/reference/cpp/cpp-generated/#enum + Name.consume_back("_descriptor"); + Name.consume_back("_IsValid"); + Name.consume_back("_Name"); + Name.consume_back("_Parse"); + Name.consume_back("_MIN"); + Name.consume_back("_MAX"); + Name.consume_back("_ARRAYSIZE"); + return Name.contains('_'); + } + + // EnumConstantDecls need some special attention, despite being nested in a + // TagDecl, they might still have mangled names. We filter those by checking + // if it has parent's name as a prefix. + // This might go wrong if a nested entity has a name that starts with parent's + // name, e.g: enum Foo { Foo_X }. 
+ if (llvm::isa(&ND)) { + auto *DC = llvm::cast(ND.getDeclContext()); + if (!DC || !DC->getIdentifier()) + return false; + auto CtxName = DC->getIdentifier()->getName(); + return !CtxName.empty() && Name.consume_front(CtxName) && + Name.consume_front("_"); + } + + // Now we're only left with fields/methods without an `_internal_` in the + // name, they're intended for public use. + return false; } // We only collect #include paths for symbols that are suitable for global code diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 3a5449ac8c799..1b669c50fa31a 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -919,6 +919,9 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var Opts.CodeComplete.EnableFunctionArgSnippets = EnableFunctionArgSnippets; Opts.CodeComplete.RunParser = CodeCompletionParse; Opts.CodeComplete.RankingModel = RankingModel; + // FIXME: If we're using C++20 modules, force the lookup process to load + // external decls, since currently the index doesn't support C++20 modules. 
+ Opts.CodeComplete.ForceLoadPreamble = ExperimentalModulesSupport; RealThreadsafeFS TFS; std::vector> ProviderStack; diff --git a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp index 7bbb95c8b8d67..691a93e7acd0a 100644 --- a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp +++ b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp @@ -402,6 +402,86 @@ import A; EXPECT_TRUE(D.isFromASTFile()); } +// An end to end test for code complete in modules +TEST_F(PrerequisiteModulesTests, CodeCompleteTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("A.cppm", R"cpp( +export module A; +export void printA(); + )cpp"); + + llvm::StringLiteral UserContents = R"cpp( +import A; +void func() { + print^ +} +)cpp"; + + CDB.addFile("Use.cpp", UserContents); + Annotations Test(UserContents); + + ModulesBuilder Builder(CDB); + + ParseInputs Use = getInputs("Use.cpp", CDB); + Use.ModulesManager = &Builder; + + std::unique_ptr CI = + buildCompilerInvocation(Use, DiagConsumer); + EXPECT_TRUE(CI); + + auto Preamble = + buildPreamble(getFullPath("Use.cpp"), *CI, Use, /*InMemory=*/true, + /*Callback=*/nullptr); + EXPECT_TRUE(Preamble); + EXPECT_TRUE(Preamble->RequiredModules); + + auto Result = codeComplete(getFullPath("Use.cpp"), Test.point(), + Preamble.get(), Use, {}); + EXPECT_FALSE(Result.Completions.empty()); + EXPECT_EQ(Result.Completions[0].Name, "printA"); +} + +TEST_F(PrerequisiteModulesTests, SignatureHelpTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("A.cppm", R"cpp( +export module A; +export void printA(int a); + )cpp"); + + llvm::StringLiteral UserContents = R"cpp( +import A; +void func() { + printA(^); +} +)cpp"; + + CDB.addFile("Use.cpp", UserContents); + Annotations Test(UserContents); + + ModulesBuilder Builder(CDB); + + ParseInputs Use = getInputs("Use.cpp", CDB); + Use.ModulesManager = &Builder; + + 
std::unique_ptr CI = + buildCompilerInvocation(Use, DiagConsumer); + EXPECT_TRUE(CI); + + auto Preamble = + buildPreamble(getFullPath("Use.cpp"), *CI, Use, /*InMemory=*/true, + /*Callback=*/nullptr); + EXPECT_TRUE(Preamble); + EXPECT_TRUE(Preamble->RequiredModules); + + auto Result = signatureHelp(getFullPath("Use.cpp"), Test.point(), + *Preamble.get(), Use, MarkupKind::PlainText); + EXPECT_FALSE(Result.signatures.empty()); + EXPECT_EQ(Result.signatures[0].label, "printA(int a) -> void"); + EXPECT_EQ(Result.signatures[0].parameters[0].labelString, "int a"); +} + } // namespace } // namespace clang::clangd diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index 0666be95b6b9e..e8088cb37fa51 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -201,19 +201,63 @@ TEST_F(ShouldCollectSymbolTest, NoPrivateProtoSymbol) { build( R"(// Generated by the protocol buffer compiler. DO NOT EDIT! namespace nx { - class Top_Level {}; - class TopLevel {}; - enum Kind { - KIND_OK, - Kind_Not_Ok, + enum Outer_Enum : int { + Outer_Enum_KIND1, + Outer_Enum_Kind_2, }; + bool Outer_Enum_IsValid(int); + + class Outer_Inner {}; + class Outer { + using Inner = Outer_Inner; + using Enum = Outer_Enum; + static constexpr Enum KIND1 = Outer_Enum_KIND1; + static constexpr Enum Kind_2 = Outer_Enum_Kind_2; + static bool Enum_IsValid(int); + int &x(); + void set_x(); + void _internal_set_x(); + + int &Outer_y(); + }; + enum Foo { + FOO_VAL1, + Foo_VAL2, + }; + bool Foo_IsValid(int); })"); - EXPECT_TRUE(shouldCollect("nx::TopLevel")); - EXPECT_TRUE(shouldCollect("nx::Kind::KIND_OK")); - EXPECT_TRUE(shouldCollect("nx::Kind")); - EXPECT_FALSE(shouldCollect("nx::Top_Level")); - EXPECT_FALSE(shouldCollect("nx::Kind::Kind_Not_Ok")); + // Make sure all the mangled names for Outer::Enum is discarded. 
+ EXPECT_FALSE(shouldCollect("nx::Outer_Enum")); + EXPECT_FALSE(shouldCollect("nx::Outer_Enum_KIND1")); + EXPECT_FALSE(shouldCollect("nx::Outer_Enum_Kind_2")); + EXPECT_FALSE(shouldCollect("nx::Outer_Enum_IsValid")); + // But nested aliases are preserved. + EXPECT_TRUE(shouldCollect("nx::Outer::Enum")); + EXPECT_TRUE(shouldCollect("nx::Outer::KIND1")); + EXPECT_TRUE(shouldCollect("nx::Outer::Kind_2")); + EXPECT_TRUE(shouldCollect("nx::Outer::Enum_IsValid")); + + // Check for Outer::Inner. + EXPECT_FALSE(shouldCollect("nx::Outer_Inner")); + EXPECT_TRUE(shouldCollect("nx::Outer")); + EXPECT_TRUE(shouldCollect("nx::Outer::Inner")); + + // Make sure field related information is preserved, unless it's explicitly + // marked with `_internal_`. + EXPECT_TRUE(shouldCollect("nx::Outer::x")); + EXPECT_TRUE(shouldCollect("nx::Outer::set_x")); + EXPECT_FALSE(shouldCollect("nx::Outer::_internal_set_x")); + EXPECT_TRUE(shouldCollect("nx::Outer::Outer_y")); + + // Handling of a top-level enum + EXPECT_TRUE(shouldCollect("nx::Foo::FOO_VAL1")); + EXPECT_TRUE(shouldCollect("nx::FOO_VAL1")); + EXPECT_TRUE(shouldCollect("nx::Foo_IsValid")); + // Our heuristic goes wrong here, if the user has a nested name that starts + // with parent's name. 
+ EXPECT_FALSE(shouldCollect("nx::Foo::Foo_VAL2")); + EXPECT_FALSE(shouldCollect("nx::Foo_VAL2")); } TEST_F(ShouldCollectSymbolTest, DoubleCheckProtoHeaderComment) { diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h index cd2111cf72abf..46ca3c9d08074 100644 --- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h +++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include +#include namespace clang { class SourceLocation; @@ -62,7 +63,8 @@ void walkUsed(llvm::ArrayRef ASTRoots, struct AnalysisResults { std::vector Unused; - std::vector Missing; // Spellings, like "" + // Spellings, like "" paired with the Header that generated it. + std::vector> Missing; }; /// Determine which headers should be inserted or removed from the main file. diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp index 05e9d14734a95..16013f53894e8 100644 --- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp +++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp @@ -26,8 +26,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include @@ -84,7 +84,7 @@ analyze(llvm::ArrayRef ASTRoots, auto &SM = PP.getSourceManager(); const auto MainFile = *SM.getFileEntryRefForID(SM.getMainFileID()); llvm::DenseSet Used; - llvm::StringSet<> Missing; + llvm::StringMap
Missing; if (!HeaderFilter) HeaderFilter = [](llvm::StringRef) { return false; }; OptionalDirectoryEntryRef ResourceDir = @@ -119,7 +119,7 @@ analyze(llvm::ArrayRef ASTRoots, Satisfied = true; } if (!Satisfied) - Missing.insert(std::move(Spelling)); + Missing.try_emplace(std::move(Spelling), Providers.front()); }); AnalysisResults Results; @@ -144,8 +144,8 @@ analyze(llvm::ArrayRef ASTRoots, } Results.Unused.push_back(&I); } - for (llvm::StringRef S : Missing.keys()) - Results.Missing.push_back(S.str()); + for (auto &E : Missing) + Results.Missing.emplace_back(E.first().str(), E.second); llvm::sort(Results.Missing); return Results; } @@ -158,9 +158,9 @@ std::string fixIncludes(const AnalysisResults &Results, // Encode insertions/deletions in the magic way clang-format understands. for (const Include *I : Results.Unused) cantFail(R.add(tooling::Replacement(FileName, UINT_MAX, 1, I->quote()))); - for (llvm::StringRef Spelled : Results.Missing) - cantFail(R.add(tooling::Replacement(FileName, UINT_MAX, 0, - ("#include " + Spelled).str()))); + for (auto &[Spelled, _] : Results.Missing) + cantFail(R.add( + tooling::Replacement(FileName, UINT_MAX, 0, "#include " + Spelled))); // "cleanup" actually turns the UINT_MAX replacements into concrete edits. 
auto Positioned = cantFail(format::cleanupAroundReplacements(Code, R, Style)); return cantFail(tooling::applyAllReplacements(Code, Positioned)); diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index afae4365587ae..080099adc9a07 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -192,7 +192,7 @@ class Action : public clang::ASTFrontendAction { case PrintStyle::Changes: for (const Include *I : Results.Unused) llvm::outs() << "- " << I->quote() << " @Line:" << I->Line << "\n"; - for (const auto &I : Results.Missing) + for (const auto &[I, _] : Results.Missing) llvm::outs() << "+ " << I << "\n"; break; case PrintStyle::Final: diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp index 43634ee8f2d80..d2d137a0dfb42 100644 --- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/VirtualFileSystem.h" @@ -39,6 +40,7 @@ namespace clang::include_cleaner { namespace { +using testing::_; using testing::AllOf; using testing::Contains; using testing::ElementsAre; @@ -262,10 +264,12 @@ int x = a + c; auto Results = analyze(std::vector{Decls.begin(), Decls.end()}, PP.MacroReferences, PP.Includes, &PI, AST.preprocessor()); + auto CHeader = llvm::cantFail( + AST.context().getSourceManager().getFileManager().getFileRef("c.h")); const Include *B = PP.Includes.atLine(3); ASSERT_EQ(B->Spelled, "b.h"); - EXPECT_THAT(Results.Missing, ElementsAre("\"c.h\"")); + EXPECT_THAT(Results.Missing, 
ElementsAre(Pair("\"c.h\"", Header(CHeader)))); EXPECT_THAT(Results.Unused, ElementsAre(B)); } @@ -370,7 +374,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor()); // Check that we're spelling header using the symlink, and not underlying // path. - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); // header.h should be unused. EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); @@ -379,7 +383,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { auto HeaderFilter = [](llvm::StringRef Path) { return Path == "inner.h"; }; Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor(), HeaderFilter); - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); // header.h should be unused. EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); } @@ -389,7 +393,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { HeaderFilter); // header.h should be ignored now. 
EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); } } @@ -414,9 +418,9 @@ TEST(FixIncludes, Basic) { Inc.add(I); AnalysisResults Results; - Results.Missing.push_back("\"aa.h\""); - Results.Missing.push_back("\"ab.h\""); - Results.Missing.push_back(""); + Results.Missing.emplace_back("\"aa.h\"", Header("")); + Results.Missing.emplace_back("\"ab.h\"", Header("")); + Results.Missing.emplace_back("", Header("")); Results.Unused.push_back(Inc.atLine(3)); Results.Unused.push_back(Inc.atLine(4)); @@ -429,7 +433,7 @@ R"cpp(#include "d.h" )cpp"); Results = {}; - Results.Missing.push_back("\"d.h\""); + Results.Missing.emplace_back("\"d.h\"", Header("")); Code = R"cpp(#include "a.h")cpp"; EXPECT_EQ(fixIncludes(Results, "d.cc", Code, format::getLLVMStyle()), R"cpp(#include "d.h" diff --git a/clang/CodeOwners.rst b/clang/CodeOwners.rst index 2ae04c129eb76..f067b7183ae73 100644 --- a/clang/CodeOwners.rst +++ b/clang/CodeOwners.rst @@ -120,7 +120,7 @@ OpenBSD driver Driver parts not covered by someone else ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | Fangrui Song -| maskray\@google.com (email), MaskRay (Phabricator), MaskRay (GitHub) +| i\@maskray.me (email), MaskRay (Phabricator), MaskRay (GitHub) Tools diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 4da99e899e7f7..f8a20a1e22472 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -133,7 +133,7 @@ def from_param(cls, param: str | bytes | None) -> c_interop_string: ) @staticmethod - def to_python_string(x: c_interop_string, *args: Any) -> str | None: + def to_python_string(x: c_interop_string) -> str | None: return x.value @@ -241,9 +241,9 @@ def __del__(self) -> None: conf.lib.clang_disposeString(self) @staticmethod - def from_result(res: _CXString, fn: Any = None, args: Any = None) -> str: 
+ def from_result(res: _CXString) -> str: assert isinstance(res, _CXString) - pystr: str | None = conf.lib.clang_getCString(res) + pystr = c_interop_string.to_python_string(conf.lib.clang_getCString(res)) if pystr is None: return "" return pystr @@ -424,7 +424,7 @@ def location(self): @property def spelling(self): - return conf.lib.clang_getDiagnosticSpelling(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getDiagnosticSpelling(self)) @property def ranges(self) -> NoSliceSequence[SourceRange]: @@ -453,7 +453,9 @@ def __len__(self) -> int: def __getitem__(self, key: int) -> FixIt: range = SourceRange() - value = conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range)) + value = _CXString.from_result( + conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range)) + ) if len(value) == 0: raise IndexError @@ -486,12 +488,12 @@ def category_number(self): @property def category_name(self): """The string name of the category for this diagnostic.""" - return conf.lib.clang_getDiagnosticCategoryText(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getDiagnosticCategoryText(self)) @property def option(self): """The command-line option that enables this diagnostic.""" - return conf.lib.clang_getDiagnosticOption(self, None) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getDiagnosticOption(self, None)) @property def disable_option(self): @@ -511,7 +513,7 @@ def format(self, options=None): options = conf.lib.clang_defaultDiagnosticDisplayOptions() if options & ~Diagnostic._FormatOptionsMask: raise ValueError("Invalid format options") - return conf.lib.clang_formatDiagnostic(self, options) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_formatDiagnostic(self, options)) def __repr__(self): return "" % ( @@ -1734,7 +1736,7 @@ def get_definition(self): """ # TODO: Should probably check that this is either a reference or # declaration prior to 
issuing the lookup. - return conf.lib.clang_getCursorDefinition(self) # type: ignore [no-any-return] + return Cursor.from_result(conf.lib.clang_getCursorDefinition(self), self) def get_usr(self): """Return the Unified Symbol Resolution (USR) for the entity referenced @@ -1745,13 +1747,13 @@ def get_usr(self): program. USRs can be compared across translation units to determine, e.g., when references in one translation refer to an entity defined in another translation unit.""" - return conf.lib.clang_getCursorUSR(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getCursorUSR(self)) def get_included_file(self): """Returns the File that is included by the current inclusion cursor.""" assert self.kind == CursorKind.INCLUSION_DIRECTIVE - return conf.lib.clang_getIncludedFile(self) # type: ignore [no-any-return] + return File.from_result(conf.lib.clang_getIncludedFile(self), self) @property def kind(self): @@ -1762,7 +1764,9 @@ def kind(self): def spelling(self): """Return the spelling of the entity pointed at by the cursor.""" if not hasattr(self, "_spelling"): - self._spelling = conf.lib.clang_getCursorSpelling(self) + self._spelling = _CXString.from_result( + conf.lib.clang_getCursorSpelling(self) + ) return self._spelling @@ -1776,7 +1780,9 @@ def displayname(self): arguments of a class template specialization. 
""" if not hasattr(self, "_displayname"): - self._displayname = conf.lib.clang_getCursorDisplayName(self) + self._displayname = _CXString.from_result( + conf.lib.clang_getCursorDisplayName(self) + ) return self._displayname @@ -1784,7 +1790,9 @@ def displayname(self): def mangled_name(self): """Return the mangled name for the entity referenced by this cursor.""" if not hasattr(self, "_mangled_name"): - self._mangled_name = conf.lib.clang_Cursor_getMangling(self) + self._mangled_name = _CXString.from_result( + conf.lib.clang_Cursor_getMangling(self) + ) return self._mangled_name @@ -1876,7 +1884,7 @@ def type(self): Retrieve the Type (if any) of the entity pointed at by the cursor. """ if not hasattr(self, "_type"): - self._type = conf.lib.clang_getCursorType(self) + self._type = Type.from_result(conf.lib.clang_getCursorType(self), (self,)) return self._type @@ -1890,7 +1898,9 @@ def canonical(self): declarations will be identical. """ if not hasattr(self, "_canonical"): - self._canonical = conf.lib.clang_getCanonicalCursor(self) + self._canonical = Cursor.from_cursor_result( + conf.lib.clang_getCanonicalCursor(self), self + ) return self._canonical @@ -1898,7 +1908,9 @@ def canonical(self): def result_type(self): """Retrieve the Type of the result for this Cursor.""" if not hasattr(self, "_result_type"): - self._result_type = conf.lib.clang_getCursorResultType(self) + self._result_type = Type.from_result( + conf.lib.clang_getCursorResultType(self), (self,) + ) return self._result_type @@ -1925,7 +1937,9 @@ def underlying_typedef_type(self): """ if not hasattr(self, "_underlying_type"): assert self.kind.is_declaration() - self._underlying_type = conf.lib.clang_getTypedefDeclUnderlyingType(self) + self._underlying_type = Type.from_result( + conf.lib.clang_getTypedefDeclUnderlyingType(self), (self,) + ) return self._underlying_type @@ -1938,7 +1952,9 @@ def enum_type(self): """ if not hasattr(self, "_enum_type"): assert self.kind == CursorKind.ENUM_DECL - 
self._enum_type = conf.lib.clang_getEnumDeclIntegerType(self) + self._enum_type = Type.from_result( + conf.lib.clang_getEnumDeclIntegerType(self), (self,) + ) return self._enum_type @@ -1972,7 +1988,9 @@ def enum_value(self): def objc_type_encoding(self): """Return the Objective-C type encoding as a str.""" if not hasattr(self, "_objc_type_encoding"): - self._objc_type_encoding = conf.lib.clang_getDeclObjCTypeEncoding(self) + self._objc_type_encoding = _CXString.from_result( + conf.lib.clang_getDeclObjCTypeEncoding(self) + ) return self._objc_type_encoding @@ -1988,7 +2006,9 @@ def hash(self): def semantic_parent(self): """Return the semantic parent for this cursor.""" if not hasattr(self, "_semantic_parent"): - self._semantic_parent = conf.lib.clang_getCursorSemanticParent(self) + self._semantic_parent = Cursor.from_cursor_result( + conf.lib.clang_getCursorSemanticParent(self), self + ) return self._semantic_parent @@ -1996,7 +2016,9 @@ def semantic_parent(self): def lexical_parent(self): """Return the lexical parent for this cursor.""" if not hasattr(self, "_lexical_parent"): - self._lexical_parent = conf.lib.clang_getCursorLexicalParent(self) + self._lexical_parent = Cursor.from_cursor_result( + conf.lib.clang_getCursorLexicalParent(self), self + ) return self._lexical_parent @@ -2014,25 +2036,27 @@ def referenced(self): representing the entity that it references. 
""" if not hasattr(self, "_referenced"): - self._referenced = conf.lib.clang_getCursorReferenced(self) + self._referenced = Cursor.from_result( + conf.lib.clang_getCursorReferenced(self), self + ) return self._referenced @property def brief_comment(self): """Returns the brief comment text associated with that Cursor""" - return conf.lib.clang_Cursor_getBriefCommentText(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_Cursor_getBriefCommentText(self)) @property def raw_comment(self): """Returns the raw comment text associated with that Cursor""" - return conf.lib.clang_Cursor_getRawCommentText(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_Cursor_getRawCommentText(self)) def get_arguments(self): """Return an iterator for accessing the arguments of this cursor.""" num_args = conf.lib.clang_Cursor_getNumArguments(self) for i in range(0, num_args): - yield conf.lib.clang_Cursor_getArgument(self, i) + yield Cursor.from_result(conf.lib.clang_Cursor_getArgument(self, i), self) def get_num_template_arguments(self): """Returns the number of template args associated with this cursor.""" @@ -2041,11 +2065,15 @@ def get_num_template_arguments(self): def get_template_argument_kind(self, num): """Returns the TemplateArgumentKind for the indicated template argument.""" - return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) # type: ignore [no-any-return] + return TemplateArgumentKind.from_id( + conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) + ) def get_template_argument_type(self, num): """Returns the CXType for the indicated template argument.""" - return conf.lib.clang_Cursor_getTemplateArgumentType(self, num) # type: ignore [no-any-return] + return Type.from_result( + conf.lib.clang_Cursor_getTemplateArgumentType(self, num), (self, num) + ) def get_template_argument_value(self, num): """Returns the value of the indicated arg as a signed 64b integer.""" @@ -2116,7 +2144,7 @@ def 
get_bitfield_width(self): return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] @staticmethod - def from_result(res, fn, args): + def from_result(res, arg): assert isinstance(res, Cursor) # FIXME: There should just be an isNull method. if res == conf.lib.clang_getNullCursor(): @@ -2125,14 +2153,10 @@ def from_result(res, fn, args): # Store a reference to the TU in the Python object so it won't get GC'd # before the Cursor. tu = None - for arg in args: - if isinstance(arg, TranslationUnit): - tu = arg - break - - if hasattr(arg, "translation_unit"): - tu = arg.translation_unit - break + if isinstance(arg, TranslationUnit): + tu = arg + elif hasattr(arg, "translation_unit"): + tu = arg.translation_unit assert tu is not None @@ -2140,12 +2164,12 @@ def from_result(res, fn, args): return res @staticmethod - def from_cursor_result(res, fn, args): + def from_cursor_result(res, arg): assert isinstance(res, Cursor) if res == conf.lib.clang_getNullCursor(): return None - res._tu = args[0]._tu + res._tu = arg._tu return res @@ -2250,7 +2274,7 @@ class TypeKind(BaseEnumeration): @property def spelling(self): """Retrieve the spelling of this TypeKind.""" - return conf.lib.clang_getTypeKindSpelling(self.value) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypeKindSpelling(self.value)) INVALID = 0 UNEXPOSED = 1 @@ -2438,7 +2462,9 @@ def __getitem__(self, key: int) -> Type: "%d > %d" % (key, len(self)) ) - result: Type = conf.lib.clang_getArgType(self.parent, key) + result = Type.from_result( + conf.lib.clang_getArgType(self.parent, key), (self.parent, key) + ) if result.kind == TypeKind.INVALID: raise IndexError("Argument could not be retrieved.") @@ -2454,7 +2480,7 @@ def element_type(self): If accessed on a type that is not an array, complex, or vector type, an exception will be raised. 
""" - result = conf.lib.clang_getElementType(self) + result = Type.from_result(conf.lib.clang_getElementType(self), (self,)) if result.kind == TypeKind.INVALID: raise Exception("Element type not available on this type.") @@ -2482,7 +2508,7 @@ def translation_unit(self): return self._tu @staticmethod - def from_result(res, fn, args): + def from_result(res, args): assert isinstance(res, Type) tu = None @@ -2500,7 +2526,9 @@ def get_num_template_arguments(self): return conf.lib.clang_Type_getNumTemplateArguments(self) # type: ignore [no-any-return] def get_template_argument_type(self, num): - return conf.lib.clang_Type_getTemplateArgumentAsType(self, num) # type: ignore [no-any-return] + return Type.from_result( + conf.lib.clang_Type_getTemplateArgumentAsType(self, num), (self, num) + ) def get_canonical(self): """ @@ -2512,7 +2540,7 @@ def get_canonical(self): example, if 'T' is a typedef for 'int', the canonical type for 'T' would be 'int'. """ - return conf.lib.clang_getCanonicalType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getCanonicalType(self), (self,)) def is_const_qualified(self): """Determine whether a Type has the "const" qualifier set. @@ -2548,7 +2576,7 @@ def get_address_space(self): return conf.lib.clang_getAddressSpace(self) # type: ignore [no-any-return] def get_typedef_name(self): - return conf.lib.clang_getTypedefName(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypedefName(self)) def is_pod(self): """Determine whether this Type represents plain old data (POD).""" @@ -2558,25 +2586,25 @@ def get_pointee(self): """ For pointer types, returns the type of the pointee. """ - return conf.lib.clang_getPointeeType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getPointeeType(self), (self,)) def get_declaration(self): """ Return the cursor for the declaration of the given type. 
""" - return conf.lib.clang_getTypeDeclaration(self) # type: ignore [no-any-return] + return Cursor.from_result(conf.lib.clang_getTypeDeclaration(self), self) def get_result(self): """ Retrieve the result type associated with a function type. """ - return conf.lib.clang_getResultType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getResultType(self), (self,)) def get_array_element_type(self): """ Retrieve the type of the elements of the array type. """ - return conf.lib.clang_getArrayElementType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getArrayElementType(self), (self,)) def get_array_size(self): """ @@ -2588,13 +2616,13 @@ def get_class_type(self): """ Retrieve the class type of the member pointer type. """ - return conf.lib.clang_Type_getClassType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_Type_getClassType(self), (self,)) def get_named_type(self): """ Retrieve the type named by the qualified-id. """ - return conf.lib.clang_Type_getNamedType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_Type_getNamedType(self), (self,)) def get_align(self): """ @@ -2647,7 +2675,7 @@ def get_exception_specification_kind(self): @property def spelling(self): """Retrieve the spelling of this Type.""" - return conf.lib.clang_getTypeSpelling(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypeSpelling(self)) def __eq__(self, other): if type(other) != type(self): @@ -2737,7 +2765,9 @@ def __repr__(self): def spelling(self): if self.__kindNumber in SpellingCache: return SpellingCache[self.__kindNumber] - return conf.lib.clang_getCompletionChunkText(self.cs, self.key) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_getCompletionChunkText(self.cs, self.key) + ) # We do not use @CachedProperty here, as the manual implementation is # apparently still significantly faster. 
Please profile carefully if you @@ -2839,7 +2869,9 @@ def availability(self): @property def briefComment(self): if conf.function_exists("clang_getCompletionBriefComment"): - return conf.lib.clang_getCompletionBriefComment(self.obj) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_getCompletionBriefComment(self.obj) + ) return _CXString() def __repr__(self): @@ -3125,12 +3157,12 @@ def __del__(self): @property def cursor(self): """Retrieve the cursor that represents the given translation unit.""" - return conf.lib.clang_getTranslationUnitCursor(self) # type: ignore [no-any-return] + return Cursor.from_result(conf.lib.clang_getTranslationUnitCursor(self), self) @property def spelling(self): """Get the original translation unit source file name.""" - return conf.lib.clang_getTranslationUnitSpelling(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTranslationUnitSpelling(self)) def get_includes(self): """ @@ -3356,7 +3388,7 @@ def from_name(translation_unit, file_name): @property def name(self): """Return the complete file and path name of the file.""" - return conf.lib.clang_getFileName(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getFileName(self)) @property def time(self): @@ -3370,12 +3402,12 @@ def __repr__(self): return "" % (self.name) @staticmethod - def from_result(res, fn, args): + def from_result(res, arg): assert isinstance(res, c_object_p) res = File(res) # Copy a reference to the TranslationUnit to prevent premature GC. 
- res._tu = args[0]._tu + res._tu = arg._tu return res @@ -3440,12 +3472,16 @@ def __init__(self, cmd, ccmds): @property def directory(self): """Get the working directory for this CompileCommand""" - return conf.lib.clang_CompileCommand_getDirectory(self.cmd) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_CompileCommand_getDirectory(self.cmd) + ) @property def filename(self): """Get the working filename for this CompileCommand""" - return conf.lib.clang_CompileCommand_getFilename(self.cmd) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_CompileCommand_getFilename(self.cmd) + ) @property def arguments(self): @@ -3457,7 +3493,9 @@ def arguments(self): """ length = conf.lib.clang_CompileCommand_getNumArgs(self.cmd) for i in range(length): - yield conf.lib.clang_CompileCommand_getArg(self.cmd, i) + yield _CXString.from_result( + conf.lib.clang_CompileCommand_getArg(self.cmd, i) + ) class CompileCommands: @@ -3482,7 +3520,7 @@ def __getitem__(self, i): return CompileCommand(cc, self) @staticmethod - def from_result(res, fn, args): + def from_result(res): if not res: return None return CompileCommands(res) @@ -3500,7 +3538,7 @@ def __del__(self): conf.lib.clang_CompilationDatabase_dispose(self) @staticmethod - def from_result(res, fn, args): + def from_result(res): if not res: raise CompilationDatabaseError(0, "CompilationDatabase loading failed") return CompilationDatabase(res) @@ -3510,8 +3548,10 @@ def fromDirectory(buildDir): """Builds a CompilationDatabase from the database found in buildDir""" errorCode = c_uint() try: - cdb = conf.lib.clang_CompilationDatabase_fromDirectory( - os.fspath(buildDir), byref(errorCode) + cdb = CompilationDatabase.from_result( + conf.lib.clang_CompilationDatabase_fromDirectory( + os.fspath(buildDir), byref(errorCode) + ) ) except CompilationDatabaseError as e: raise CompilationDatabaseError( @@ -3524,8 +3564,10 @@ def getCompileCommands(self, filename): Get an iterable 
object providing all the CompileCommands available to build filename. Returns None if filename is not found in the database. """ - return conf.lib.clang_CompilationDatabase_getCompileCommands( # type: ignore [no-any-return] - self, os.fspath(filename) + return CompileCommands.from_result( + conf.lib.clang_CompilationDatabase_getCompileCommands( # type: ignore [no-any-return] + self, os.fspath(filename) + ) ) def getAllCompileCommands(self): @@ -3533,7 +3575,9 @@ def getAllCompileCommands(self): Get an iterable object providing all the CompileCommands available from the database. """ - return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self) # type: ignore [no-any-return] + return CompileCommands.from_result( + conf.lib.clang_CompilationDatabase_getAllCompileCommands(self) # type: ignore [no-any-return] + ) class Token(Structure): @@ -3554,7 +3598,7 @@ def spelling(self): This is the textual representation of the token in source. """ - return conf.lib.clang_getTokenSpelling(self._tu, self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTokenSpelling(self._tu, self)) @property def kind(self): @@ -3661,41 +3705,19 @@ def write_main_file_to_stdout(self): "clang_CompilationDatabase_fromDirectory", [c_interop_string, POINTER(c_uint)], c_object_p, - CompilationDatabase.from_result, - ), - ( - "clang_CompilationDatabase_getAllCompileCommands", - [c_object_p], - c_object_p, - CompileCommands.from_result, ), + ("clang_CompilationDatabase_getAllCompileCommands", [c_object_p], c_object_p), ( "clang_CompilationDatabase_getCompileCommands", [c_object_p, c_interop_string], c_object_p, - CompileCommands.from_result, ), ("clang_CompileCommands_dispose", [c_object_p]), ("clang_CompileCommands_getCommand", [c_object_p, c_uint], c_object_p), ("clang_CompileCommands_getSize", [c_object_p], c_uint), - ( - "clang_CompileCommand_getArg", - [c_object_p, c_uint], - _CXString, - _CXString.from_result, - ), - ( - 
"clang_CompileCommand_getDirectory", - [c_object_p], - _CXString, - _CXString.from_result, - ), - ( - "clang_CompileCommand_getFilename", - [c_object_p], - _CXString, - _CXString.from_result, - ), + ("clang_CompileCommand_getArg", [c_object_p, c_uint], _CXString), + ("clang_CompileCommand_getDirectory", [c_object_p], _CXString), + ("clang_CompileCommand_getFilename", [c_object_p], _CXString), ("clang_CompileCommand_getNumArgs", [c_object_p], c_uint), ( "clang_codeCompleteAt", @@ -3743,82 +3765,62 @@ def write_main_file_to_stdout(self): ("clang_equalLocations", [SourceLocation, SourceLocation], bool), ("clang_equalRanges", [SourceRange, SourceRange], bool), ("clang_equalTypes", [Type, Type], bool), - ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString, _CXString.from_result), - ("clang_getArgType", [Type, c_uint], Type, Type.from_result), - ("clang_getArrayElementType", [Type], Type, Type.from_result), + ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString), + ("clang_getArgType", [Type, c_uint], Type), + ("clang_getArrayElementType", [Type], Type), ("clang_getArraySize", [Type], c_longlong), ("clang_getFieldDeclBitWidth", [Cursor], c_int), - ("clang_getCanonicalCursor", [Cursor], Cursor, Cursor.from_cursor_result), - ("clang_getCanonicalType", [Type], Type, Type.from_result), + ("clang_getCanonicalCursor", [Cursor], Cursor), + ("clang_getCanonicalType", [Type], Type), ("clang_getChildDiagnostics", [Diagnostic], c_object_p), ("clang_getCompletionAvailability", [c_void_p], c_int), - ("clang_getCompletionBriefComment", [c_void_p], _CXString, _CXString.from_result), + ("clang_getCompletionBriefComment", [c_void_p], _CXString), ("clang_getCompletionChunkCompletionString", [c_void_p, c_int], c_object_p), ("clang_getCompletionChunkKind", [c_void_p, c_int], c_int), - ( - "clang_getCompletionChunkText", - [c_void_p, c_int], - _CXString, - _CXString.from_result, - ), + ("clang_getCompletionChunkText", [c_void_p, c_int], _CXString), 
("clang_getCompletionPriority", [c_void_p], c_int), - ( - "clang_getCString", - [_CXString], - c_interop_string, - c_interop_string.to_python_string, - ), + ("clang_getCString", [_CXString], c_interop_string), ("clang_getCursor", [TranslationUnit, SourceLocation], Cursor), ("clang_getCursorAvailability", [Cursor], c_int), - ("clang_getCursorDefinition", [Cursor], Cursor, Cursor.from_result), - ("clang_getCursorDisplayName", [Cursor], _CXString, _CXString.from_result), + ("clang_getCursorDefinition", [Cursor], Cursor), + ("clang_getCursorDisplayName", [Cursor], _CXString), ("clang_getCursorExtent", [Cursor], SourceRange), - ("clang_getCursorLexicalParent", [Cursor], Cursor, Cursor.from_cursor_result), + ("clang_getCursorLexicalParent", [Cursor], Cursor), ("clang_getCursorLocation", [Cursor], SourceLocation), - ("clang_getCursorReferenced", [Cursor], Cursor, Cursor.from_result), + ("clang_getCursorReferenced", [Cursor], Cursor), ("clang_getCursorReferenceNameRange", [Cursor, c_uint, c_uint], SourceRange), - ("clang_getCursorResultType", [Cursor], Type, Type.from_result), - ("clang_getCursorSemanticParent", [Cursor], Cursor, Cursor.from_cursor_result), - ("clang_getCursorSpelling", [Cursor], _CXString, _CXString.from_result), - ("clang_getCursorType", [Cursor], Type, Type.from_result), - ("clang_getCursorUSR", [Cursor], _CXString, _CXString.from_result), - ("clang_Cursor_getMangling", [Cursor], _CXString, _CXString.from_result), + ("clang_getCursorResultType", [Cursor], Type), + ("clang_getCursorSemanticParent", [Cursor], Cursor), + ("clang_getCursorSpelling", [Cursor], _CXString), + ("clang_getCursorType", [Cursor], Type), + ("clang_getCursorUSR", [Cursor], _CXString), + ("clang_Cursor_getMangling", [Cursor], _CXString), # ("clang_getCXTUResourceUsage", # [TranslationUnit], # CXTUResourceUsage), ("clang_getCXXAccessSpecifier", [Cursor], c_uint), - ("clang_getDeclObjCTypeEncoding", [Cursor], _CXString, _CXString.from_result), + ("clang_getDeclObjCTypeEncoding", 
[Cursor], _CXString), ("clang_getDiagnostic", [c_object_p, c_uint], c_object_p), ("clang_getDiagnosticCategory", [Diagnostic], c_uint), - ("clang_getDiagnosticCategoryText", [Diagnostic], _CXString, _CXString.from_result), - ( - "clang_getDiagnosticFixIt", - [Diagnostic, c_uint, POINTER(SourceRange)], - _CXString, - _CXString.from_result, - ), + ("clang_getDiagnosticCategoryText", [Diagnostic], _CXString), + ("clang_getDiagnosticFixIt", [Diagnostic, c_uint, POINTER(SourceRange)], _CXString), ("clang_getDiagnosticInSet", [c_object_p, c_uint], c_object_p), ("clang_getDiagnosticLocation", [Diagnostic], SourceLocation), ("clang_getDiagnosticNumFixIts", [Diagnostic], c_uint), ("clang_getDiagnosticNumRanges", [Diagnostic], c_uint), - ( - "clang_getDiagnosticOption", - [Diagnostic, POINTER(_CXString)], - _CXString, - _CXString.from_result, - ), + ("clang_getDiagnosticOption", [Diagnostic, POINTER(_CXString)], _CXString), ("clang_getDiagnosticRange", [Diagnostic, c_uint], SourceRange), ("clang_getDiagnosticSeverity", [Diagnostic], c_int), - ("clang_getDiagnosticSpelling", [Diagnostic], _CXString, _CXString.from_result), - ("clang_getElementType", [Type], Type, Type.from_result), + ("clang_getDiagnosticSpelling", [Diagnostic], _CXString), + ("clang_getElementType", [Type], Type), ("clang_getEnumConstantDeclUnsignedValue", [Cursor], c_ulonglong), ("clang_getEnumConstantDeclValue", [Cursor], c_longlong), - ("clang_getEnumDeclIntegerType", [Cursor], Type, Type.from_result), + ("clang_getEnumDeclIntegerType", [Cursor], Type), ("clang_getFile", [TranslationUnit, c_interop_string], c_object_p), - ("clang_getFileName", [File], _CXString, _CXString.from_result), + ("clang_getFileName", [File], _CXString), ("clang_getFileTime", [File], c_uint), - ("clang_getIBOutletCollectionType", [Cursor], Type, Type.from_result), - ("clang_getIncludedFile", [Cursor], c_object_p, File.from_result), + ("clang_getIBOutletCollectionType", [Cursor], Type), + ("clang_getIncludedFile", [Cursor], 
c_object_p), ( "clang_getInclusions", [TranslationUnit, translation_unit_includes_callback, py_object], @@ -3842,41 +3844,26 @@ def write_main_file_to_stdout(self): ("clang_getNumDiagnosticsInSet", [c_object_p], c_uint), ("clang_getNumElements", [Type], c_longlong), ("clang_getNumOverloadedDecls", [Cursor], c_uint), - ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor, Cursor.from_cursor_result), - ("clang_getPointeeType", [Type], Type, Type.from_result), + ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor), + ("clang_getPointeeType", [Type], Type), ("clang_getRange", [SourceLocation, SourceLocation], SourceRange), ("clang_getRangeEnd", [SourceRange], SourceLocation), ("clang_getRangeStart", [SourceRange], SourceLocation), - ("clang_getResultType", [Type], Type, Type.from_result), - ("clang_getSpecializedCursorTemplate", [Cursor], Cursor, Cursor.from_cursor_result), + ("clang_getResultType", [Type], Type), + ("clang_getSpecializedCursorTemplate", [Cursor], Cursor), ("clang_getTemplateCursorKind", [Cursor], c_uint), ("clang_getTokenExtent", [TranslationUnit, Token], SourceRange), ("clang_getTokenKind", [Token], c_uint), ("clang_getTokenLocation", [TranslationUnit, Token], SourceLocation), - ( - "clang_getTokenSpelling", - [TranslationUnit, Token], - _CXString, - _CXString.from_result, - ), - ("clang_getTranslationUnitCursor", [TranslationUnit], Cursor, Cursor.from_result), - ( - "clang_getTranslationUnitSpelling", - [TranslationUnit], - _CXString, - _CXString.from_result, - ), - ( - "clang_getTUResourceUsageName", - [c_uint], - c_interop_string, - c_interop_string.to_python_string, - ), - ("clang_getTypeDeclaration", [Type], Cursor, Cursor.from_result), - ("clang_getTypedefDeclUnderlyingType", [Cursor], Type, Type.from_result), - ("clang_getTypedefName", [Type], _CXString, _CXString.from_result), - ("clang_getTypeKindSpelling", [c_uint], _CXString, _CXString.from_result), - ("clang_getTypeSpelling", [Type], _CXString, _CXString.from_result), + 
("clang_getTokenSpelling", [TranslationUnit, Token], _CXString), + ("clang_getTranslationUnitCursor", [TranslationUnit], Cursor), + ("clang_getTranslationUnitSpelling", [TranslationUnit], _CXString), + ("clang_getTUResourceUsageName", [c_uint], c_interop_string), + ("clang_getTypeDeclaration", [Type], Cursor), + ("clang_getTypedefDeclUnderlyingType", [Cursor], Type), + ("clang_getTypedefName", [Type], _CXString), + ("clang_getTypeKindSpelling", [c_uint], _CXString), + ("clang_getTypeSpelling", [Type], _CXString), ("clang_hashCursor", [Cursor], c_uint), ("clang_isAttribute", [CursorKind], bool), ("clang_isConstQualifiedType", [Type], bool), @@ -3909,31 +3896,27 @@ def write_main_file_to_stdout(self): ), ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint), ("clang_Cursor_getNumArguments", [Cursor], c_int), - ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor, Cursor.from_result), + ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor), ("clang_Cursor_getNumTemplateArguments", [Cursor], c_int), - ( - "clang_Cursor_getTemplateArgumentKind", - [Cursor, c_uint], - TemplateArgumentKind.from_id, - ), - ("clang_Cursor_getTemplateArgumentType", [Cursor, c_uint], Type, Type.from_result), + ("clang_Cursor_getTemplateArgumentKind", [Cursor, c_uint], c_uint), + ("clang_Cursor_getTemplateArgumentType", [Cursor, c_uint], Type), ("clang_Cursor_getTemplateArgumentValue", [Cursor, c_uint], c_longlong), ("clang_Cursor_getTemplateArgumentUnsignedValue", [Cursor, c_uint], c_ulonglong), ("clang_Cursor_isAnonymous", [Cursor], bool), ("clang_Cursor_isBitField", [Cursor], bool), ("clang_Cursor_getBinaryOpcode", [Cursor], c_int), - ("clang_Cursor_getBriefCommentText", [Cursor], _CXString, _CXString.from_result), - ("clang_Cursor_getRawCommentText", [Cursor], _CXString, _CXString.from_result), + ("clang_Cursor_getBriefCommentText", [Cursor], _CXString), + ("clang_Cursor_getRawCommentText", [Cursor], _CXString), ("clang_Cursor_getOffsetOfField", [Cursor], 
c_longlong), ("clang_Location_isInSystemHeader", [SourceLocation], bool), ("clang_Type_getAlignOf", [Type], c_longlong), - ("clang_Type_getClassType", [Type], Type, Type.from_result), + ("clang_Type_getClassType", [Type], Type), ("clang_Type_getNumTemplateArguments", [Type], c_int), - ("clang_Type_getTemplateArgumentAsType", [Type, c_uint], Type, Type.from_result), + ("clang_Type_getTemplateArgumentAsType", [Type, c_uint], Type), ("clang_Type_getOffsetOf", [Type, c_interop_string], c_longlong), ("clang_Type_getSizeOf", [Type], c_longlong), ("clang_Type_getCXXRefQualifier", [Type], c_uint), - ("clang_Type_getNamedType", [Type], Type, Type.from_result), + ("clang_Type_getNamedType", [Type], Type), ("clang_Type_visitFields", [Type, fields_visit_callback, py_object], c_uint), ] diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index 7476947bde2ea..77d8ca415708f 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -14,6 +14,7 @@ from clang.cindex import TranslationUnit from clang.cindex import TypeKind from clang.cindex import BinaryOperator +from clang.cindex import StorageClass from .util import get_cursor from .util import get_cursors from .util import get_tu @@ -279,6 +280,90 @@ def test_is_default_method(self): self.assertTrue(xc.is_default_method()) self.assertFalse(yc.is_default_method()) + def test_is_deleted_method(self): + source = "class X { X() = delete; }; class Y { Y(); };" + tu = get_tu(source, lang="cpp") + + xs = get_cursors(tu, "X") + ys = get_cursors(tu, "Y") + + self.assertEqual(len(xs), 2) + self.assertEqual(len(ys), 2) + + xc = xs[1] + yc = ys[1] + + self.assertTrue(xc.is_deleted_method()) + self.assertFalse(yc.is_deleted_method()) + + def test_is_copy_assignment_operator_method(self): + source_with_copy_assignment_operators = """ + struct Foo { + // Those are copy-assignment operators + bool operator=(const Foo&); + 
bool operator=(Foo&); + Foo operator=(Foo); + bool operator=(volatile Foo&); + bool operator=(const volatile Foo&); + + // Positive-check that the recognition works for templated classes too + template + class Bar { + bool operator=(const Bar&); + Bar operator=(const Bar); + bool operator=(Bar&); + bool operator=(volatile Bar&); + bool operator=(const volatile Bar&); + }; + """ + source_without_copy_assignment_operators = """ + struct Foo { + // Those are not copy-assignment operators + template + bool operator=(const T&); + bool operator=(const bool&); + bool operator=(char&); + bool operator=(volatile unsigned int&); + bool operator=(const volatile unsigned char&); + bool operator=(int); + bool operator=(Foo&&); + }; + """ + tu_with_copy_assignment_operators = get_tu( + source_with_copy_assignment_operators, lang="cpp" + ) + tu_without_copy_assignment_operators = get_tu( + source_without_copy_assignment_operators, lang="cpp" + ) + + copy_assignment_operators_cursors = get_cursors( + tu_with_copy_assignment_operators, "operator=" + ) + non_copy_assignment_operators_cursors = get_cursors( + tu_without_copy_assignment_operators, "operator=" + ) + + self.assertEqual(len(copy_assignment_operators_cursors), 10) + self.assertTrue(len(non_copy_assignment_operators_cursors), 9) + + self.assertTrue( + all( + [ + cursor.is_copy_assignment_operator_method() + for cursor in copy_assignment_operators_cursors + ] + ) + ) + + self.assertFalse( + any( + [ + cursor.is_copy_assignment_operator_method() + for cursor in non_copy_assignment_operators_cursors + ] + ) + ) + def test_is_move_assignment_operator_method(self): """Ensure Cursor.is_move_assignment_operator_method works.""" source_with_move_assignment_operators = """ @@ -482,6 +567,41 @@ def test_is_scoped_enum(self): self.assertFalse(regular_enum.is_scoped_enum()) self.assertTrue(scoped_enum.is_scoped_enum()) + def test_get_definition(self): + """Ensure Cursor.get_definition works.""" + tu = get_tu( + """ +class A { + 
constexpr static int f(){return 3;} +}; +struct B { + int b = A::f(); +}; +""", + lang="cpp", + ) + curs = get_cursors(tu, "f") + self.assertEqual(len(curs), 4) + self.assertEqual(curs[0].kind, CursorKind.CXX_METHOD) + self.assertEqual(curs[1].get_definition(), curs[0]) + self.assertEqual(curs[2].get_definition(), curs[0]) + self.assertEqual(curs[3].get_definition(), curs[0]) + + def test_get_usr(self): + """Ensure Cursor.get_usr works.""" + tu = get_tu( + """ +int add(int, int); +int add(int a, int b) { return a + b; } +int add(float a, float b) { return a + b; } +""", + lang="cpp", + ) + curs = get_cursors(tu, "add") + self.assertEqual(len(curs), 3) + self.assertEqual(curs[0].get_usr(), curs[1].get_usr()) + self.assertNotEqual(curs[0].get_usr(), curs[2].get_usr()) + def test_underlying_type(self): tu = get_tu("typedef int foo;") typedef = get_cursor(tu, "foo") @@ -570,6 +690,23 @@ def test_enum_values_cpp(self): self.assertEqual(ham.kind, CursorKind.ENUM_CONSTANT_DECL) self.assertEqual(ham.enum_value, 0x10000000000) + def test_enum_values_unsigned(self): + tu = get_tu("enum TEST : unsigned char { SPAM=0, HAM = 200};", lang="cpp") + enum = get_cursor(tu, "TEST") + self.assertIsNotNone(enum) + + self.assertEqual(enum.kind, CursorKind.ENUM_DECL) + + enum_constants = list(enum.get_children()) + self.assertEqual(len(enum_constants), 2) + + spam, ham = enum_constants + + self.assertEqual(spam.kind, CursorKind.ENUM_CONSTANT_DECL) + self.assertEqual(spam.enum_value, 0) + self.assertEqual(ham.kind, CursorKind.ENUM_CONSTANT_DECL) + self.assertEqual(ham.enum_value, 200) + def test_annotation_attribute(self): tu = get_tu( 'int foo (void) __attribute__ ((annotate("here be annotation attribute")));' @@ -625,6 +762,25 @@ def test_result_type_objc_method_decl(self): self.assertEqual(cursor.kind, CursorKind.OBJC_INSTANCE_METHOD_DECL) self.assertEqual(result_type.kind, TypeKind.VOID) + def test_storage_class(self): + tu = get_tu( + """ +extern int ex; +register int reg; +int 
count(int a, int b){ + static int counter = 0; + return 0; +} +""", + lang="cpp", + ) + cursor = get_cursor(tu, "ex") + self.assertEqual(cursor.storage_class, StorageClass.EXTERN) + cursor = get_cursor(tu, "counter") + self.assertEqual(cursor.storage_class, StorageClass.STATIC) + cursor = get_cursor(tu, "reg") + self.assertEqual(cursor.storage_class, StorageClass.REGISTER) + def test_availability(self): tu = get_tu("class A { A(A const&) = delete; };", lang="cpp") @@ -681,6 +837,23 @@ def test_get_token_cursor(self): r_cursor = t_cursor.referenced # should not raise an exception self.assertEqual(r_cursor.kind, CursorKind.CLASS_DECL) + def test_get_field_offsetof(self): + tu = get_tu( + "struct myStruct {int a; char b; char c; short d; char e;};", lang="cpp" + ) + c1 = get_cursor(tu, "myStruct") + c2 = get_cursor(tu, "a") + c3 = get_cursor(tu, "b") + c4 = get_cursor(tu, "c") + c5 = get_cursor(tu, "d") + c6 = get_cursor(tu, "e") + self.assertEqual(c1.get_field_offsetof(), -1) + self.assertEqual(c2.get_field_offsetof(), 0) + self.assertEqual(c3.get_field_offsetof(), 32) + self.assertEqual(c4.get_field_offsetof(), 40) + self.assertEqual(c5.get_field_offsetof(), 48) + self.assertEqual(c6.get_field_offsetof(), 64) + def test_get_arguments(self): tu = get_tu("void foo(int i, int j);") foo = get_cursor(tu, "foo") @@ -799,3 +972,13 @@ def test_binop(self): for op, typ in operators.items(): c = get_cursor(tu, op) assert c.binary_operator == typ + + def test_from_result_null(self): + tu = get_tu("int a = 1+2;", lang="cpp") + op = next(next(tu.cursor.get_children()).get_children()) + self.assertEqual(op.kind, CursorKind.BINARY_OPERATOR) + self.assertEqual(op.get_definition(), None) + + def test_from_cursor_result_null(self): + tu = get_tu("") + self.assertEqual(tu.cursor.semantic_parent, None) diff --git a/clang/bindings/python/tests/cindex/test_diagnostics.py b/clang/bindings/python/tests/cindex/test_diagnostics.py index 57c41baaa2541..041083d12c7f1 100644 --- 
a/clang/bindings/python/tests/cindex/test_diagnostics.py +++ b/clang/bindings/python/tests/cindex/test_diagnostics.py @@ -46,6 +46,8 @@ def test_diagnostic_fixit(self): self.assertEqual(tu.diagnostics[0].location.column, 26) self.assertRegex(tu.diagnostics[0].spelling, "use of GNU old-style.*") self.assertEqual(len(tu.diagnostics[0].fixits), 1) + with self.assertRaises(IndexError): + tu.diagnostics[0].fixits[1] self.assertEqual(tu.diagnostics[0].fixits[0].range.start.line, 1) self.assertEqual(tu.diagnostics[0].fixits[0].range.start.column, 26) self.assertEqual(tu.diagnostics[0].fixits[0].range.end.line, 1) @@ -97,6 +99,8 @@ def test_diagnostic_children(self): children = d.children self.assertEqual(len(children), 1) + with self.assertRaises(IndexError): + children[1] self.assertEqual(children[0].severity, Diagnostic.Note) self.assertRegex(children[0].spelling, ".*declared here") self.assertEqual(children[0].location.line, 1) @@ -111,3 +115,16 @@ def test_diagnostic_string_repr(self): repr(d), ", spelling \"expected ';' after struct\">", ) + + def test_diagnostic_string_format(self): + tu = get_tu("struct MissingSemicolon{}") + self.assertEqual(len(tu.diagnostics), 1) + d = tu.diagnostics[0] + + self.assertEqual(str(d), "t.c:1:26: error: expected ';' after struct") + self.assertEqual( + d.format(0b111111), + "t.c:1:26: error: expected ';' after struct [3, Parse Issue]", + ) + with self.assertRaises(ValueError): + d.format(0b1000000) diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index 1dd8db0e3e814..928a9794e4213 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -10,7 +10,9 @@ from clang.cindex import CursorKind from clang.cindex import TranslationUnit from clang.cindex import TypeKind +from clang.cindex import RefQualifierKind from .util import get_cursor +from .util import get_cursors from .util import get_tu @@ -308,10 +310,10 @@ def 
test_element_type(self): def test_invalid_element_type(self): """Ensure Type.element_type raises if type doesn't have elements.""" tu = get_tu("int i;") - i = get_cursor(tu, "i") - self.assertIsNotNone(i) - with self.assertRaises(Exception): - i.element_type + ty = get_cursor(tu, "i").type + with self.assertRaises(Exception) as ctx: + ty.element_type + self.assertEqual(str(ctx.exception), "Element type not available on this type.") def test_element_count(self): """Ensure Type.element_count works.""" @@ -357,6 +359,49 @@ def test_is_restrict_qualified(self): self.assertTrue(i.type.is_restrict_qualified()) self.assertFalse(j.type.is_restrict_qualified()) + def test_get_result(self): + tu = get_tu("void foo(); int bar(char, short);") + foo = get_cursor(tu, "foo") + bar = get_cursor(tu, "bar") + self.assertEqual(foo.type.get_result().spelling, "void") + self.assertEqual(bar.type.get_result().spelling, "int") + + def test_get_class_type(self): + tu = get_tu( + """ +class myClass +{ + char *myAttr; +}; + +char *myClass::*pMyAttr = &myClass::myAttr; +""", + lang="cpp", + ) + cur = get_cursor(tu, "pMyAttr") + self.assertEqual(cur.type.get_class_type().spelling, "myClass") + + def test_get_named_type(self): + tu = get_tu("using char_alias = char; char_alias xyz;", lang="cpp") + cur = get_cursor(tu, "xyz") + self.assertEqual(cur.type.get_named_type().spelling, "char_alias") + + def test_get_ref_qualifier(self): + tu = get_tu( + """ +class A +{ + const int& getAttr() const &; + int getAttr() const &&; +}; +""", + lang="cpp", + ) + getters = get_cursors(tu, "getAttr") + self.assertEqual(len(getters), 2) + self.assertEqual(getters[0].type.get_ref_qualifier(), RefQualifierKind.LVALUE) + self.assertEqual(getters[1].type.get_ref_qualifier(), RefQualifierKind.RVALUE) + def test_record_layout(self): """Ensure Cursor.type.get_size, Cursor.type.get_align and Cursor.type.get_offset works.""" diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 
0c6b9b1b8f9ce..ea4b4bcec55e7 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -700,6 +700,8 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types of a floating-point number T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types + T __builtin_elementwise_fmod(T x, T y) return The floating-point remainder of (x/y) whose sign floating point types + matches the sign of x. T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types T __builtin_elementwise_min(T x, T y) return x or y, whichever is smaller integer and floating point types T __builtin_elementwise_add_sat(T x, T y) return the sum of x and y, clamped to the range of integer types diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1fbcac807d0b3..d48601db02355 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -103,6 +103,7 @@ ABI Changes in This Version --------------------------- - Fixed Microsoft name mangling of placeholder, auto and decltype(auto), return types for MSVC 1920+. This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option -fms-compatibility-version=19.14 to imitate the MSVC 1914 mangling behavior. +- Fixed the Itanium mangling of the construction vtable name. This change will introduce incompatibilities with code compiled by Clang 19 and earlier versions, unless the -fclang-abi-compat=19 option is used. (#GH108015) AST Dumping Potentially Breaking Changes ---------------------------------------- @@ -141,6 +142,8 @@ C++ Language Changes - Add ``__builtin_elementwise_popcount`` builtin for integer types only. 
+- Add ``__builtin_elementwise_fmod`` builtin for floating point types only. + - The builtin type alias ``__builtin_common_type`` has been added to improve the performance of ``std::common_type``. @@ -444,6 +447,7 @@ Bug Fixes to C++ Support - Fixed an assertion failure in debug mode, and potential crashes in release mode, when diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter. - Fixed an assertion failure by adjusting integral to boolean vector conversions (#GH108326) +- Mangle friend function templates with a constraint that depends on a template parameter from an enclosing template as members of the enclosing class. (#GH110247) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -507,6 +511,12 @@ X86 Support * Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and ``*_(mask(z)))_minmax_s[s|d|h]``. +- The following bit manipulation intrinsics can now be used in constant expressions: + all lzcnt intrinsics in lzcntintrin.h + all bextr intrinsics in bmiintrin.h + all tzcnt intrinsics in bmiintrin.h + all bextr intrinsics in tbmintrin.h + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 9847d449d76d0..a22bda189dd29 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -1544,6 +1544,49 @@ Warn on ``mmap()`` calls with both writable and executable access. // code } +.. _security-PointerSub: + +security.PointerSub (C) +""""""""""""""""""""""" +Check for pointer subtractions on two pointers pointing to different memory +chunks. According to the C standard §6.5.6 only subtraction of pointers that +point into (or one past the end) the same array object is valid (for this +purpose non-array variables are like arrays of size 1). This checker only +searches for different memory objects at subtraction, but does not check if the +array index is correct. 
Furthermore, only cases are reported where +stack-allocated objects are involved (no warnings on pointers to memory +allocated by `malloc`). + +.. code-block:: c + + void test() { + int a, b, c[10], d[10]; + int x = &c[3] - &c[1]; + x = &d[4] - &c[1]; // warn: 'c' and 'd' are different arrays + x = (&a + 1) - &a; + x = &b - &a; // warn: 'a' and 'b' are different variables + } + + struct S { + int x[10]; + int y[10]; + }; + + void test1() { + struct S a[10]; + struct S b; + int d = &a[4] - &a[6]; + d = &a[0].x[3] - &a[0].x[1]; + d = a[0].y - a[0].x; // warn: 'S.b' and 'S.a' are different objects + d = (char *)&b.y - (char *)&b.x; // warn: different members of the same object + d = (char *)&b.y - (char *)&b; // warn: object of type S is not the same array as a member of it + } + +There may be existing applications that use code like above for calculating +offsets of members in a structure, using pointer subtractions. This is still +undefined behavior according to the standard and code like this can be replaced +with the `offsetof` macro. + .. _security-putenv-stack-array: security.PutenvStackArray (C) @@ -2761,49 +2804,6 @@ Check for pointer arithmetic on locations other than array elements. p = &x + 1; // warn } -.. _alpha-core-PointerSub: - -alpha.core.PointerSub (C) -""""""""""""""""""""""""" -Check for pointer subtractions on two pointers pointing to different memory -chunks. According to the C standard §6.5.6 only subtraction of pointers that -point into (or one past the end) the same array object is valid (for this -purpose non-array variables are like arrays of size 1). This checker only -searches for different memory objects at subtraction, but does not check if the -array index is correct. Furthermore, only cases are reported where -stack-allocated objects are involved (no warnings on pointers to memory -allocated by `malloc`). - -.. 
code-block:: c - - void test() { - int a, b, c[10], d[10]; - int x = &c[3] - &c[1]; - x = &d[4] - &c[1]; // warn: 'c' and 'd' are different arrays - x = (&a + 1) - &a; - x = &b - &a; // warn: 'a' and 'b' are different variables - } - - struct S { - int x[10]; - int y[10]; - }; - - void test1() { - struct S a[10]; - struct S b; - int d = &a[4] - &a[6]; - d = &a[0].x[3] - &a[0].x[1]; - d = a[0].y - a[0].x; // warn: 'S.b' and 'S.a' are different objects - d = (char *)&b.y - (char *)&b.x; // warn: different members of the same object - d = (char *)&b.y - (char *)&b; // warn: object of type S is not the same array as a member of it - } - -There may be existing applications that use code like above for calculating -offsets of members in a structure, using pointer subtractions. This is still -undefined behavior according to the standard and code like this can be replaced -with the `offsetof` macro. - .. _alpha-core-StackAddressAsyncEscape: alpha.core.StackAddressAsyncEscape (ObjC) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index dc87b84153e74..67e75652a1664 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -6191,7 +6191,9 @@ class HLSLAttributedResourceType : public Type, public llvm::FoldingSetNode { HLSLAttributedResourceType(QualType Canon, QualType Wrapped, QualType Contained, const Attributes &Attrs) - : Type(HLSLAttributedResource, Canon, Wrapped->getDependence()), + : Type(HLSLAttributedResource, Canon, + Contained.isNull() ? 
TypeDependence::None + : Contained->getDependence()), WrappedType(Wrapped), ContainedType(Contained), Attrs(Attrs) {} public: diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 33791270800c9..8090119e512fb 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1328,6 +1328,12 @@ def ElementwisePopcount : Builtin { let Prototype = "void(...)"; } +def ElementwiseFmod : Builtin { + let Spellings = ["__builtin_elementwise_fmod"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + def ElementwisePow : Builtin { let Spellings = ["__builtin_elementwise_pow"]; let Attributes = [NoThrow, Const, CustomTypeChecking]; diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 6fff562165080..6b7bce5bc00d4 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -139,6 +139,7 @@ TARGET_BUILTIN(__nvvm_is_explicit_cluster, "b", "nc", AND(SM_90, PTX78)) BUILTIN(__nvvm_read_ptx_sreg_laneid, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_warpid, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_nwarpid, "i", "nc") +BUILTIN(__nvvm_read_ptx_sreg_warpsize, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_smid, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_nsmid, "i", "nc") diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index 25c656a530b15..e68dcd922acbf 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -551,13 +551,13 @@ TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "n", "rdseed") TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "n", "rdseed") // LZCNT -TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "nc", "lzcnt") -TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "nc", "lzcnt") +TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "ncE", "lzcnt") 
+TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "ncE", "lzcnt") // BMI -TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "nc", "bmi") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "nc", "") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "nc", "") +TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "ncE", "bmi") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "ncE", "") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "ncE", "") // BMI2 TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "nc", "bmi2") @@ -565,7 +565,7 @@ TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "nc", "bmi2") TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "nc", "bmi2") // TBM -TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "nc", "tbm") +TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "ncE", "tbm") // LWP TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "n", "lwp") diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index db381aa77e761..5f4252c91b884 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -70,13 +70,13 @@ TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcUOiUOiUOi*", "n", "") TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcUOiUOiUOi*", "n", "") TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiUOi*", "n", "rdrnd") TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiUOi*", "n", "rdseed") -TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "nc", "lzcnt") -TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "nc", "bmi") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "nc", "") +TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "ncE", "lzcnt") +TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "ncE", "bmi") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "ncE", "") TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "nc", "bmi2") TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "nc", "bmi2") TARGET_BUILTIN(__builtin_ia32_pext_di, 
"UOiUOiUOi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "nc", "tbm") +TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "ncE", "tbm") TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcUOiUiIUi", "n", "lwp") TARGET_BUILTIN(__builtin_ia32_lwpval64, "vUOiUiIUi", "n", "lwp") TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "OiV2dIi", "ncV:128:", "avx512f") diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f3d5d4c56606c..9e8f152852fd1 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12395,6 +12395,9 @@ def err_hlsl_operator_unsupported : Error< def err_hlsl_param_qualifier_mismatch : Error<"conflicting parameter qualifier %0 on parameter %1">; +def err_hlsl_vector_compound_assignment_truncation : Error< + "left hand operand of type %0 to compound assignment cannot be truncated " + "when used with right hand operand of type %1">; def warn_hlsl_impcast_vector_truncation : Warning< "implicit conversion truncates vector: %0 to %1">, InGroup; diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index c3d53ca92d450..8c605f6852016 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -239,6 +239,12 @@ class LangOptionsBase { /// in the initializers of members of local classes. Ver18, + /// Attempt to be ABI-compatible with code generated by Clang 19.0.x. + /// This causes clang to: + /// - Incorrectly mangles the 'base type' substitutions of the CXX + /// construction vtable because it hasn't added 'type' as a substitution. + Ver19, + /// Conform to the underlying platform's C and C++ ABIs as closely /// as we can. 
Latest diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ae6b55e98827f..9c9f31f388406 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -817,4 +817,9 @@ multiclass ZAReadzArray{ defm SVREADZ_VG2 : ZAReadzArray<"2">; defm SVREADZ_VG4 : ZAReadzArray<"4">; + +let SMETargetGuard = "sme2,sme-lutv2" in { + def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; +} + } // let SVETargetGuard = InvalidMode diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8b8824c04a332..aedc4c16d4e9d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2979,7 +2979,7 @@ def flax_vector_conversions_EQ : Joined<["-"], "flax-vector-conversions=">, Grou "LangOptions::LaxVectorConversionKind::Integer", "LangOptions::LaxVectorConversionKind::All"]>, MarshallingInfoEnum, - open_cl.KeyPath # + !strconcat("(", open_cl.KeyPath, " || ", hlsl.KeyPath, ")") # " ? LangOptions::LaxVectorConversionKind::None" # " : LangOptions::LaxVectorConversionKind::All">; def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e1c3a99cfa167..a9ce3681338d4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7423,7 +7423,8 @@ class Sema final : public SemaBase { SourceLocation Loc, BinaryOperatorKind Opc); QualType CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS, - SourceLocation Loc); + SourceLocation Loc, + BinaryOperatorKind Opc); /// Context in which we're performing a usual arithmetic conversion. 
enum ArithConvKind { diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 311cd58bbcac2..fa957abc9791a 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -63,6 +63,11 @@ class SemaHLSL : public SemaBase { std::initializer_list AllowedStages); void DiagnoseAvailabilityViolations(TranslationUnitDecl *TU); + QualType handleVectorBinOpConversion(ExprResult &LHS, ExprResult &RHS, + QualType LHSType, QualType RHSType, + bool IsCompAssign); + void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc); + void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL); void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL); void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 747ebd8c2e4de..6bc389f9da265 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -290,11 +290,6 @@ def PointerArithChecker : Checker<"PointerArithm">, "elements">, Documentation; -def PointerSubChecker : Checker<"PointerSub">, - HelpText<"Check for pointer subtractions on two pointers pointing to " - "different memory chunks">, - Documentation; - def TestAfterDivZeroChecker : Checker<"TestAfterDivZero">, HelpText<"Check for division by variable that is later compared against 0. 
" "Either the comparison is useless or there is division by zero.">, @@ -1003,6 +998,11 @@ def MmapWriteExecChecker : Checker<"MmapWriteExec">, HelpText<"Warn on mmap() calls with both writable and executable access">, Documentation; +def PointerSubChecker : Checker<"PointerSub">, + HelpText<"Check for pointer subtractions on two pointers pointing to " + "different memory chunks">, + Documentation; + def PutenvStackArray : Checker<"PutenvStackArray">, HelpText<"Finds calls to the function 'putenv' which pass a pointer to " "an automatic (stack-allocated) array as the argument.">, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index cda8b02cc8499..458075020f6b2 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2272,8 +2272,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" - Width = 0; - Align = 8; + Width = Target->getPointerWidth(LangAS::Default); + Align = Target->getPointerAlign(LangAS::Default); break; } break; diff --git a/clang/lib/AST/ByteCode/Boolean.h b/clang/lib/AST/ByteCode/Boolean.h index f1914ddb9970d..c568b557574e2 100644 --- a/clang/lib/AST/ByteCode/Boolean.h +++ b/clang/lib/AST/ByteCode/Boolean.h @@ -30,6 +30,7 @@ class Boolean final { public: /// Zero-initializes a boolean. 
Boolean() : V(false) {} + Boolean(const llvm::APSInt &I) : V(!I.isZero()) {} explicit Boolean(bool V) : V(V) {} bool operator<(Boolean RHS) const { return V < RHS.V; } diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 78ba1a7eec662..680be736aa647 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -672,6 +672,45 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { ToSize, CE); }; + case CK_IntegralToFixedPoint: { + if (!this->visit(SubExpr)) + return false; + + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastIntegralFixedPoint(classifyPrim(SubExpr->getType()), I, + CE); + } + case CK_FloatingToFixedPoint: { + if (!this->visit(SubExpr)) + return false; + + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastFloatingFixedPoint(I, CE); + } + case CK_FixedPointToFloating: { + if (!this->visit(SubExpr)) + return false; + const auto *TargetSemantics = &Ctx.getFloatSemantics(CE->getType()); + return this->emitCastFixedPointFloating(TargetSemantics, CE); + } + case CK_FixedPointToIntegral: { + if (!this->visit(SubExpr)) + return false; + return this->emitCastFixedPointIntegral(classifyPrim(CE->getType()), CE); + } + case CK_FixedPointCast: { + if (!this->visit(SubExpr)) + return false; + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastFixedPoint(I, CE); + } + case CK_ToVoid: return discard(SubExpr); @@ -724,6 +763,9 @@ bool Compiler::VisitFixedPointLiteral(const FixedPointLiteral *E) { assert(E->getType()->isFixedPointType()); assert(classifyPrim(E) == PT_FixedPoint); + if (DiscardResult) + return true; + auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType()); APInt Value = E->getValue(); 
return this->emitConstFixedPoint(FixedPoint(Value, Sem), E); @@ -762,6 +804,8 @@ bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { RHS->getType()->isAnyComplexType()) && BO->isComparisonOp()) return this->emitComplexComparison(LHS, RHS, BO); + if (LHS->getType()->isFixedPointType() || RHS->getType()->isFixedPointType()) + return this->VisitFixedPointBinOp(BO); if (BO->isPtrMemOp()) { if (!this->visit(LHS)) @@ -1458,6 +1502,112 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { return true; } +template +bool Compiler::VisitFixedPointBinOp(const BinaryOperator *E) { + const Expr *LHS = E->getLHS(); + const Expr *RHS = E->getRHS(); + + assert(LHS->getType()->isFixedPointType() || + RHS->getType()->isFixedPointType()); + + auto LHSSema = Ctx.getASTContext().getFixedPointSemantics(LHS->getType()); + auto RHSSema = Ctx.getASTContext().getFixedPointSemantics(RHS->getType()); + + if (!this->visit(LHS)) + return false; + if (!LHS->getType()->isFixedPointType()) { + uint32_t I; + std::memcpy(&I, &LHSSema, sizeof(llvm::FixedPointSemantics)); + if (!this->emitCastIntegralFixedPoint(classifyPrim(LHS->getType()), I, E)) + return false; + } + + if (!this->visit(RHS)) + return false; + if (!RHS->getType()->isFixedPointType()) { + uint32_t I; + std::memcpy(&I, &RHSSema, sizeof(llvm::FixedPointSemantics)); + if (!this->emitCastIntegralFixedPoint(classifyPrim(RHS->getType()), I, E)) + return false; + } + + // Convert the result to the target semantics. 
+ auto ConvertResult = [&](bool R) -> bool { + if (!R) + return false; + auto ResultSema = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + auto CommonSema = LHSSema.getCommonSemantics(RHSSema); + if (ResultSema != CommonSema) { + uint32_t I; + std::memcpy(&I, &ResultSema, sizeof(ResultSema)); + return this->emitCastFixedPoint(I, E); + } + return true; + }; + + auto MaybeCastToBool = [&](bool Result) { + if (!Result) + return false; + PrimType T = classifyPrim(E); + if (DiscardResult) + return this->emitPop(T, E); + if (T != PT_Bool) + return this->emitCast(PT_Bool, T, E); + return true; + }; + + switch (E->getOpcode()) { + case BO_EQ: + return MaybeCastToBool(this->emitEQFixedPoint(E)); + case BO_NE: + return MaybeCastToBool(this->emitNEFixedPoint(E)); + case BO_LT: + return MaybeCastToBool(this->emitLTFixedPoint(E)); + case BO_LE: + return MaybeCastToBool(this->emitLEFixedPoint(E)); + case BO_GT: + return MaybeCastToBool(this->emitGTFixedPoint(E)); + case BO_GE: + return MaybeCastToBool(this->emitGEFixedPoint(E)); + case BO_Add: + return ConvertResult(this->emitAddFixedPoint(E)); + case BO_Sub: + return ConvertResult(this->emitSubFixedPoint(E)); + case BO_Mul: + return ConvertResult(this->emitMulFixedPoint(E)); + case BO_Div: + return ConvertResult(this->emitDivFixedPoint(E)); + case BO_Shl: + return ConvertResult(this->emitShiftFixedPoint(/*Left=*/true, E)); + case BO_Shr: + return ConvertResult(this->emitShiftFixedPoint(/*Left=*/false, E)); + + default: + return this->emitInvalid(E); + } + + llvm_unreachable("unhandled binop opcode"); +} + +template +bool Compiler::VisitFixedPointUnaryOperator(const UnaryOperator *E) { + const Expr *SubExpr = E->getSubExpr(); + assert(SubExpr->getType()->isFixedPointType()); + + switch (E->getOpcode()) { + case UO_Plus: + return this->delegate(SubExpr); + case UO_Minus: + if (!this->visit(SubExpr)) + return false; + return this->emitNegFixedPoint(E); + default: + return false; + } + + llvm_unreachable("Unhandled 
unary opcode"); +} + template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { @@ -3699,7 +3849,10 @@ bool Compiler::visitZeroInitializer(PrimType T, QualType QT, return this->emitNullMemberPtr(nullptr, E); case PT_Float: return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E); - case PT_FixedPoint: + case PT_FixedPoint: { + auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + return this->emitConstFixedPoint(FixedPoint::zero(Sem), E); + } llvm_unreachable("Implement"); } llvm_unreachable("unknown primitive type"); @@ -5364,6 +5517,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return this->VisitComplexUnaryOperator(E); if (SubExpr->getType()->isVectorType()) return this->VisitVectorUnaryOperator(E); + if (SubExpr->getType()->isFixedPointType()) + return this->VisitFixedPointUnaryOperator(E); std::optional T = classify(SubExpr->getType()); switch (E->getOpcode()) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index d1911f11603a0..22e078f3fe546 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -132,6 +132,8 @@ class Compiler : public ConstStmtVisitor, bool>, bool VisitPointerArithBinOp(const BinaryOperator *E); bool VisitComplexBinOp(const BinaryOperator *E); bool VisitVectorBinOp(const BinaryOperator *E); + bool VisitFixedPointBinOp(const BinaryOperator *E); + bool VisitFixedPointUnaryOperator(const UnaryOperator *E); bool VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E); bool VisitCallExpr(const CallExpr *E); bool VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinID); diff --git a/clang/lib/AST/ByteCode/FixedPoint.h b/clang/lib/AST/ByteCode/FixedPoint.h index fba793cd59e7e..ab8d6d7f02b52 100644 --- a/clang/lib/AST/ByteCode/FixedPoint.h +++ b/clang/lib/AST/ByteCode/FixedPoint.h @@ -23,34 +23,72 @@ using APSInt = llvm::APSInt; class FixedPoint final { private: llvm::APFixedPoint V; - 
FixedPoint(llvm::APFixedPoint &&V) : V(std::move(V)) {} public: + FixedPoint(llvm::APFixedPoint &&V) : V(std::move(V)) {} + FixedPoint(llvm::APFixedPoint &V) : V(V) {} FixedPoint(APInt V, llvm::FixedPointSemantics Sem) : V(V, Sem) {} // This needs to be default-constructible so llvm::endian::read works. FixedPoint() : V(APInt(0, 0ULL, false), llvm::FixedPointSemantics(0, 0, false, false, false)) {} - operator bool() const { return V.getBoolValue(); } - template >> - explicit operator Ty() const { - // FIXME - return 0; + static FixedPoint zero(llvm::FixedPointSemantics Sem) { + return FixedPoint(APInt(Sem.getWidth(), 0ULL, Sem.isSigned()), Sem); + } + + static FixedPoint from(const APSInt &I, llvm::FixedPointSemantics Sem, + bool *Overflow) { + return FixedPoint(llvm::APFixedPoint::getFromIntValue(I, Sem, Overflow)); + } + static FixedPoint from(const llvm::APFloat &I, llvm::FixedPointSemantics Sem, + bool *Overflow) { + return FixedPoint(llvm::APFixedPoint::getFromFloatValue(I, Sem, Overflow)); } + operator bool() const { return V.getBoolValue(); } void print(llvm::raw_ostream &OS) const { OS << V; } APValue toAPValue(const ASTContext &) const { return APValue(V); } - APSInt toAPSInt(unsigned BitWidth) const { return V.getValue(); } + APSInt toAPSInt(unsigned BitWidth = 0) const { return V.getValue(); } unsigned bitWidth() const { return V.getWidth(); } bool isSigned() const { return V.isSigned(); } + bool isZero() const { return V.getValue().isZero(); } + bool isNegative() const { return V.getValue().isNegative(); } + bool isPositive() const { return V.getValue().isNonNegative(); } + bool isMin() const { + return V == llvm::APFixedPoint::getMin(V.getSemantics()); + } + bool isMinusOne() const { return V.isSigned() && V.getValue() == -1; } + + FixedPoint truncate(unsigned BitWidth) const { return *this; } + + FixedPoint toSemantics(const llvm::FixedPointSemantics &Sem, + bool *Overflow) const { + return FixedPoint(V.convert(Sem, Overflow)); + } + 
llvm::FixedPointSemantics getSemantics() const { return V.getSemantics(); } + + llvm::APFloat toFloat(const llvm::fltSemantics *Sem) const { + return V.convertToFloat(*Sem); + } + + llvm::APSInt toInt(unsigned BitWidth, bool Signed, bool *Overflow) const { + return V.convertToInt(BitWidth, Signed, Overflow); + } + + std::string toDiagnosticString(const ASTContext &Ctx) const { + return V.toString(); + } ComparisonCategoryResult compare(const FixedPoint &Other) const { - if (Other.V == V) + int c = V.compare(Other.V); + if (c == 0) return ComparisonCategoryResult::Equal; - return ComparisonCategoryResult::Unordered; + else if (c < 0) + return ComparisonCategoryResult::Less; + return ComparisonCategoryResult::Greater; } static bool neg(const FixedPoint &A, FixedPoint *R) { @@ -58,6 +96,67 @@ class FixedPoint final { *R = FixedPoint(A.V.negate(&Overflow)); return Overflow; } + + static bool add(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.add(B.V, &Overflow)); + return Overflow; + } + static bool sub(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.sub(B.V, &Overflow)); + return Overflow; + } + static bool mul(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.mul(B.V, &Overflow)); + return Overflow; + } + static bool div(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.div(B.V, &Overflow)); + return Overflow; + } + + static bool shiftLeft(const FixedPoint A, const FixedPoint B, unsigned OpBits, + FixedPoint *R) { + unsigned Amt = B.V.getValue().getLimitedValue(OpBits); + bool Overflow; + *R = FixedPoint(A.V.shl(Amt, &Overflow)); + return Overflow; + } + static bool shiftRight(const FixedPoint A, const FixedPoint B, + unsigned OpBits, FixedPoint *R) { + unsigned Amt = 
B.V.getValue().getLimitedValue(OpBits); + bool Overflow; + *R = FixedPoint(A.V.shr(Amt, &Overflow)); + return Overflow; + } + + static bool rem(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + llvm_unreachable("Rem doesn't exist for fixed point values"); + return true; + } + static bool bitAnd(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + static bool bitOr(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + static bool bitXor(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + + static bool increment(const FixedPoint &A, FixedPoint *R) { return true; } + static bool decrement(const FixedPoint &A, FixedPoint *R) { return true; } }; inline FixedPoint getSwappedBytes(FixedPoint F) { return F; } diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 798e0f3e96fa0..fd9a256843a0e 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1393,6 +1393,19 @@ bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E) { return false; } +bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, + const FixedPoint &FP) { + const Expr *E = S.Current->getExpr(OpPC); + if (S.checkingForUndefinedBehavior()) { + S.getASTContext().getDiagnostics().Report( + E->getExprLoc(), diag::warn_fixedpoint_constant_overflow) + << FP.toDiagnosticString(S.getASTContext()) << E->getType(); + } + S.CCEDiag(E, diag::note_constexpr_overflow) + << FP.toDiagnosticString(S.getASTContext()) << E->getType(); + return S.noteUndefinedBehavior(); +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). 
// This is being used by the ops to determine wheter diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 79af426f8a913..8a3c6810e0e11 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -38,6 +38,7 @@ namespace clang { namespace interp { using APSInt = llvm::APSInt; +using FixedPointSemantics = llvm::FixedPointSemantics; /// Convert a value to an APValue. template @@ -161,6 +162,15 @@ bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, const CallExpr *CE); bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T); +template +static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_overflow) << SrcValue << E->getType(); + return S.noteUndefinedBehavior(); +} +bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, + const FixedPoint &FP); + enum class ShiftDir { Left, Right }; /// Checks if the shift operation is legal. 
@@ -225,14 +235,16 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) { return false; } - if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) { - APSInt LHSInt = LHS.toAPSInt(); - SmallString<32> Trunc; - (-LHSInt.extend(LHSInt.getBitWidth() + 1)).toString(Trunc, 10); - const SourceInfo &Loc = S.Current->getSource(OpPC); - const Expr *E = S.Current->getExpr(OpPC); - S.CCEDiag(Loc, diag::note_constexpr_overflow) << Trunc << E->getType(); - return false; + if constexpr (!std::is_same_v) { + if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) { + APSInt LHSInt = LHS.toAPSInt(); + SmallString<32> Trunc; + (-LHSInt.extend(LHSInt.getBitWidth() + 1)).toString(Trunc, 10); + const SourceInfo &Loc = S.Current->getSource(OpPC); + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(Loc, diag::note_constexpr_overflow) << Trunc << E->getType(); + return false; + } } return true; } @@ -364,10 +376,13 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, S.Stk.push(Result); return true; } - // If for some reason evaluation continues, use the truncated results. S.Stk.push(Result); + // Short-circuit fixed-points here since the error handling is easier. + if constexpr (std::is_same_v) + return handleFixedPointOverflow(S, OpPC, Result); + // Slow path - compute the result using another bit of precision. 
APSInt Value = OpAP()(LHS.toAPSInt(Bits), RHS.toAPSInt(Bits)); @@ -384,13 +399,10 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, << Trunc << Type << E->getSourceRange(); } - S.CCEDiag(E, diag::note_constexpr_overflow) << Value << Type; - - if (!S.noteUndefinedBehavior()) { + if (!handleOverflow(S, OpPC, Value)) { S.Stk.pop(); return false; } - return true; } @@ -680,6 +692,13 @@ bool Div(InterpState &S, CodePtr OpPC) { S.Stk.push(Result); return true; } + + if constexpr (std::is_same_v) { + if (handleFixedPointOverflow(S, OpPC, Result)) { + S.Stk.push(Result); + return true; + } + } return false; } @@ -740,8 +759,7 @@ bool Neg(InterpState &S, CodePtr OpPC) { return true; } - S.CCEDiag(E, diag::note_constexpr_overflow) << NegatedValue << Type; - return S.noteUndefinedBehavior(); + return handleOverflow(S, OpPC, NegatedValue); } enum class PushVal : bool { @@ -803,8 +821,7 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return true; } - S.CCEDiag(E, diag::note_constexpr_overflow) << APResult << Type; - return S.noteUndefinedBehavior(); + return handleOverflow(S, OpPC, APResult); } /// 1) Pops a pointer from the stack @@ -2160,6 +2177,22 @@ inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem, return true; } +inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { + FixedPointSemantics TargetSemantics(0, 0, false, false, false); + std::memcpy(&TargetSemantics, &FPS, sizeof(TargetSemantics)); + + const auto &Source = S.Stk.pop(); + + bool Overflow; + FixedPoint Result = Source.toSemantics(TargetSemantics, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + /// Like Cast(), but we cast to an arbitrary-bitwidth integral, so we need /// to know what bitwidth the result should be. 
template ::T> @@ -2230,13 +2263,8 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC, auto Status = F.convertToInteger(Result); // Float-to-Integral overflow check. - if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) { - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); - - S.CCEDiag(E, diag::note_constexpr_overflow) << F.getAPFloat() << Type; - return S.noteUndefinedBehavior(); - } + if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) + return handleOverflow(S, OpPC, F.getAPFloat()); FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI); S.Stk.push>(IntegralAP(Result)); @@ -2251,13 +2279,8 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC, auto Status = F.convertToInteger(Result); // Float-to-Integral overflow check. - if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) { - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); - - S.CCEDiag(E, diag::note_constexpr_overflow) << F.getAPFloat() << Type; - return S.noteUndefinedBehavior(); - } + if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) + return handleOverflow(S, OpPC, F.getAPFloat()); FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI); S.Stk.push>(IntegralAP(Result)); @@ -2311,6 +2334,63 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, return true; } +template ::T> +static inline bool CastIntegralFixedPoint(InterpState &S, CodePtr OpPC, + uint32_t FPS) { + const T &Int = S.Stk.pop(); + + FixedPointSemantics Sem(0, 0, false, false, false); + std::memcpy(&Sem, &FPS, sizeof(Sem)); + + bool Overflow; + FixedPoint Result = FixedPoint::from(Int.toAPSInt(), Sem, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + +static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC, + uint32_t FPS) { + const auto &Float = S.Stk.pop(); + + FixedPointSemantics Sem(0, 0, 
false, false, false); + std::memcpy(&Sem, &FPS, sizeof(Sem)); + + bool Overflow; + FixedPoint Result = FixedPoint::from(Float.getAPFloat(), Sem, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + +static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC, + const llvm::fltSemantics *Sem) { + const auto &Fixed = S.Stk.pop(); + + S.Stk.push(Fixed.toFloat(Sem)); + return true; +} + +template ::T> +static inline bool CastFixedPointIntegral(InterpState &S, CodePtr OpPC) { + const auto &Fixed = S.Stk.pop(); + + bool Overflow; + APSInt Int = Fixed.toInt(T::bitWidth(), T::isSigned(), &Overflow); + + if (Overflow && !handleOverflow(S, OpPC, Int)) + return false; + + S.Stk.push(Int); + return true; +} + static inline bool PtrPtrCast(InterpState &S, CodePtr OpPC, bool SrcIsVoidPtr) { const auto &Ptr = S.Stk.peek(); @@ -2501,6 +2581,42 @@ inline bool Shl(InterpState &S, CodePtr OpPC) { return DoShift(S, OpPC, LHS, RHS); } +static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) { + const auto &RHS = S.Stk.pop(); + const auto &LHS = S.Stk.pop(); + llvm::FixedPointSemantics LHSSema = LHS.getSemantics(); + + unsigned ShiftBitWidth = + LHSSema.getWidth() - (unsigned)LHSSema.hasUnsignedPadding() - 1; + + // Embedded-C 4.1.6.2.2: + // The right operand must be nonnegative and less than the total number + // of (nonpadding) bits of the fixed-point operand ... 
+ if (RHS.isNegative()) { + S.CCEDiag(S.Current->getLocation(OpPC), diag::note_constexpr_negative_shift) + << RHS.toAPSInt(); + } else if (static_cast(RHS.toAPSInt().getLimitedValue( + ShiftBitWidth)) != RHS.toAPSInt()) { + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_large_shift) + << RHS.toAPSInt() << E->getType() << ShiftBitWidth; + } + + FixedPoint Result; + if (Left) { + if (FixedPoint::shiftLeft(LHS, RHS, ShiftBitWidth, &Result) && + !handleFixedPointOverflow(S, OpPC, Result)) + return false; + } else { + if (FixedPoint::shiftRight(LHS, RHS, ShiftBitWidth, &Result) && + !handleFixedPointOverflow(S, OpPC, Result)) + return false; + } + + S.Stk.push(Result); + return true; +} + //===----------------------------------------------------------------------===// // NoRet //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 5fdafd1bf8198..61b6f2e8daa2f 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -84,6 +84,11 @@ def IntegerTypeClass : TypeClass { Uint32, Sint64, Uint64, IntAP, IntAPS]; } +def IntegerAndFixedTypeClass : TypeClass { + let Types = [Sint8, Uint8, Sint16, Uint16, Sint32, + Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint]; +} + def FixedSizeIntegralTypeClass : TypeClass { let Types = [Sint8, Uint8, Sint16, Uint16, Sint32, Uint32, Sint64, Uint64, Bool]; @@ -98,7 +103,7 @@ def FloatTypeClass : TypeClass { } def AluTypeClass : TypeClass { - let Types = !listconcat(IntegerTypeClass.Types, [Bool]); + let Types = !listconcat(IntegerTypeClass.Types, [Bool], [FixedPoint]); } def PtrTypeClass : TypeClass { @@ -110,7 +115,7 @@ def NonPtrTypeClass : TypeClass { } def AllTypeClass : TypeClass { - let Types = !listconcat(AluTypeClass.Types, PtrTypeClass.Types, FloatTypeClass.Types, [FixedPoint]); + let Types = !listconcat(AluTypeClass.Types, PtrTypeClass.Types, 
FloatTypeClass.Types); } def ComparableTypeClass : TypeClass { @@ -146,7 +151,7 @@ class FloatOpcode : Opcode { } class IntegerOpcode : Opcode { - let Types = [IntegerTypeClass]; + let Types = [IntegerAndFixedTypeClass]; let HasGroup = 1; } @@ -626,6 +631,10 @@ def CastFP : Opcode { let Args = [ArgFltSemantics, ArgRoundingMode]; } +def CastFixedPoint : Opcode { + let Args = [ArgUint32]; +} + def FixedSizeIntegralTypes : TypeClass { let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, Bool]; } @@ -674,6 +683,25 @@ def CastPointerIntegralAP : Opcode { def CastPointerIntegralAPS : Opcode { let Args = [ArgUint32]; } +def CastIntegralFixedPoint : Opcode { + let Types = [FixedSizeIntegralTypes]; + let Args = [ArgUint32]; + let HasGroup = 1; +} +def CastFloatingFixedPoint : Opcode { + let Args = [ArgUint32]; +} +def CastFixedPointFloating : Opcode { + let Args = [ArgFltSemantics]; +} +def CastFixedPointIntegral : Opcode { + let Types = [FixedSizeIntegralTypes]; + let HasGroup = 1; +} +def ShiftFixedPoint : Opcode { + let Args = [ArgBool]; +} + def PtrPtrCast : Opcode { let Args = [ArgBool]; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 960eae36ed1f5..834a7a1e2eb23 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -52,6 +52,7 @@ #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/APFixedPoint.h" #include "llvm/ADT/Sequence.h" @@ -13462,6 +13463,47 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return false; return Success(DidOverflow, E); } + + case clang::X86::BI__builtin_ia32_bextr_u32: + case clang::X86::BI__builtin_ia32_bextr_u64: + case clang::X86::BI__builtin_ia32_bextri_u32: + case clang::X86::BI__builtin_ia32_bextri_u64: { + APSInt Val, Idx; + if (!EvaluateInteger(E->getArg(0), Val, Info) || + 
!EvaluateInteger(E->getArg(1), Idx, Info)) + return false; + + unsigned BitWidth = Val.getBitWidth(); + uint64_t Shift = Idx.extractBitsAsZExtValue(8, 0); + uint64_t Length = Idx.extractBitsAsZExtValue(8, 8); + Length = Length > BitWidth ? BitWidth : Length; + + // Handle out of bounds cases. + if (Length == 0 || Shift >= BitWidth) + return Success(0, E); + + uint64_t Result = Val.getZExtValue() >> Shift; + Result &= llvm::maskTrailingOnes(Length); + return Success(Result, E); + } + + case clang::X86::BI__builtin_ia32_lzcnt_u16: + case clang::X86::BI__builtin_ia32_lzcnt_u32: + case clang::X86::BI__builtin_ia32_lzcnt_u64: { + APSInt Val; + if (!EvaluateInteger(E->getArg(0), Val, Info)) + return false; + return Success(Val.countLeadingZeros(), E); + } + + case clang::X86::BI__builtin_ia32_tzcnt_u16: + case clang::X86::BI__builtin_ia32_tzcnt_u32: + case clang::X86::BI__builtin_ia32_tzcnt_u64: { + APSInt Val; + if (!EvaluateInteger(E->getArg(0), Val, Info)) + return false; + return Success(Val.countTrailingZeros(), E); + } } } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index b6e1da0c3192d..117255178eebb 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -464,7 +464,7 @@ class CXXNameMangler { void mangleSeqID(unsigned SeqID); void mangleName(GlobalDecl GD); void mangleType(QualType T); - void mangleNameOrStandardSubstitution(const NamedDecl *ND); + void mangleCXXRecordDecl(const CXXRecordDecl *Record); void mangleLambdaSig(const CXXRecordDecl *Lambda); void mangleModuleNamePrefix(StringRef Name, bool IsPartition = false); void mangleVendorQualifier(StringRef Name); @@ -693,7 +693,7 @@ ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) { if (VD->isExternC()) return getASTContext().getTranslationUnitDecl(); - if (const auto *FD = dyn_cast(D)) { + if (const auto *FD = D->getAsFunction()) { if (FD->isExternC()) return getASTContext().getTranslationUnitDecl(); // Member-like constrained 
friends are mangled as if they were members of @@ -3029,9 +3029,13 @@ void CXXNameMangler::mangleType(QualType T) { addSubstitution(T); } -void CXXNameMangler::mangleNameOrStandardSubstitution(const NamedDecl *ND) { - if (!mangleStandardSubstitution(ND)) - mangleName(ND); +void CXXNameMangler::mangleCXXRecordDecl(const CXXRecordDecl *Record) { + if (mangleSubstitution(Record)) + return; + mangleName(Record); + if (isCompatibleWith(LangOptions::ClangABI::Ver19)) + return; + addSubstitution(Record); } void CXXNameMangler::mangleType(const BuiltinType *T) { @@ -7309,7 +7313,7 @@ void ItaniumMangleContextImpl::mangleCXXVTable(const CXXRecordDecl *RD, // ::= TV # virtual table CXXNameMangler Mangler(*this, Out); Mangler.getStream() << "_ZTV"; - Mangler.mangleNameOrStandardSubstitution(RD); + Mangler.mangleCXXRecordDecl(RD); } void ItaniumMangleContextImpl::mangleCXXVTT(const CXXRecordDecl *RD, @@ -7317,7 +7321,7 @@ void ItaniumMangleContextImpl::mangleCXXVTT(const CXXRecordDecl *RD, // ::= TT # VTT structure CXXNameMangler Mangler(*this, Out); Mangler.getStream() << "_ZTT"; - Mangler.mangleNameOrStandardSubstitution(RD); + Mangler.mangleCXXRecordDecl(RD); } void ItaniumMangleContextImpl::mangleCXXCtorVTable(const CXXRecordDecl *RD, @@ -7327,10 +7331,10 @@ void ItaniumMangleContextImpl::mangleCXXCtorVTable(const CXXRecordDecl *RD, // ::= TC _ CXXNameMangler Mangler(*this, Out); Mangler.getStream() << "_ZTC"; - Mangler.mangleNameOrStandardSubstitution(RD); + Mangler.mangleCXXRecordDecl(RD); Mangler.getStream() << Offset; Mangler.getStream() << '_'; - Mangler.mangleNameOrStandardSubstitution(Type); + Mangler.mangleCXXRecordDecl(Type); } void ItaniumMangleContextImpl::mangleCXXRTTI(QualType Ty, raw_ostream &Out) { diff --git a/clang/lib/Basic/Targets/Sparc.h b/clang/lib/Basic/Targets/Sparc.h index 3357bee33e1ac..ee0d3e2b4329e 100644 --- a/clang/lib/Basic/Targets/Sparc.h +++ b/clang/lib/Basic/Targets/Sparc.h @@ -151,7 +151,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV8TargetInfo 
: public SparcTargetInfo { public: SparcV8TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcTargetInfo(Triple, Opts) { - resetDataLayout("E-m:e-p:32:32-i64:64-f128:64-n32-S64"); + resetDataLayout("E-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64"); // NetBSD / OpenBSD use long (same as llvm default); everyone else uses int. switch (getTriple().getOS()) { default: @@ -188,7 +188,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV8elTargetInfo : public SparcV8TargetInfo { public: SparcV8elTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcV8TargetInfo(Triple, Opts) { - resetDataLayout("e-m:e-p:32:32-i64:64-f128:64-n32-S64"); + resetDataLayout("e-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64"); } }; @@ -198,7 +198,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV9TargetInfo : public SparcTargetInfo { SparcV9TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcTargetInfo(Triple, Opts) { // FIXME: Support Sparc quad-precision long double? - resetDataLayout("E-m:e-i64:64-n32:64-S128"); + resetDataLayout("E-m:e-i64:64-i128:128-n32:64-S128"); // This is an LP64 platform. 
LongWidth = LongAlign = PointerWidth = PointerAlign = 64; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9033cd1ccd781..d739597de4c85 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2878,7 +2878,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fmodf: case Builtin::BI__builtin_fmodf16: case Builtin::BI__builtin_fmodl: - case Builtin::BI__builtin_fmodf128: { + case Builtin::BI__builtin_fmodf128: + case Builtin::BI__builtin_elementwise_fmod: { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *Arg1 = EmitScalarExpr(E->getArg(0)); Value *Arg2 = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 9e095a3755219..c920d93957b16 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1073,9 +1073,8 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getTargetInfo().getNewAlign() / 8)); // Cast the void pointer and get the address of the globalized variable. 
- llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack"); LValue VarAddr = CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy); Rec.second.PrivateAddr = VarAddr.getAddress(); @@ -1930,7 +1929,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); + Src, Bld.getPtrTy(0), CGF.SizeTy); llvm::Value *LB = CGF.EmitLoadOfScalar( TypedAddress, /*Volatile=*/false, @@ -1939,8 +1938,8 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( Args.emplace_back(LB); ++Idx; Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); - TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); + TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0), + CGF.SizeTy); llvm::Value *UB = CGF.EmitLoadOfScalar( TypedAddress, /*Volatile=*/false, @@ -2079,7 +2078,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, GV->setAlignment(Align.getAsAlign()); return Address( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( + GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace( VD->getType().getAddressSpace()))), VarTy, Align); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 2381fa93e23fe..25c1c496a4f27 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5622,8 +5622,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, emitter->finalize(GV); // 
If it is safe to mark the global 'constant', do so now. - GV->setConstant(!NeedsGlobalCtor && !NeedsGlobalDtor && - D->getType().isConstantStorage(getContext(), true, true)); + GV->setConstant((D->hasAttr() && LangOpts.CUDAIsDevice) || + (!NeedsGlobalCtor && !NeedsGlobalDtor && + D->getType().isConstantStorage(getContext(), true, true))); // If it is in a read-only section, mark it 'constant'. if (const SectionAttr *SA = D->getAttr()) { diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index d0c8bdba0ede9..fba6a8853c396 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6481,6 +6481,7 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, case llvm::Triple::ZOS: TC = std::make_unique(*this, Target, Args); break; + case llvm::Triple::Vulkan: case llvm::Triple::ShaderModel: TC = std::make_unique(*this, Target, Args); break; diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 6c09843a7146f..a0291ccfea245 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3830,6 +3830,9 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, case LangOptions::ClangABI::Ver18: GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "18.0"); break; + case LangOptions::ClangABI::Ver19: + GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "19.0"); + break; case LangOptions::ClangABI::Latest: break; } @@ -4372,6 +4375,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Opts.setClangABICompat(LangOptions::ClangABI::Ver17); else if (Major <= 18) Opts.setClangABICompat(LangOptions::ClangABI::Ver18); + else if (Major <= 19) + Opts.setClangABICompat(LangOptions::ClangABI::Ver19); } else if (Ver != "latest") { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h index 78bffe68e221a..634fa39bfa1d7 
100644 --- a/clang/lib/Headers/bmiintrin.h +++ b/clang/lib/Headers/bmiintrin.h @@ -17,7 +17,12 @@ /* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT instruction behaves as BSF on non-BMI targets, there is code that expects to use it as a potentially faster version of BSF. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __RELAXED_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__)) constexpr +#else #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#endif /// Counts the number of trailing zero bits in the operand. /// @@ -166,6 +171,12 @@ _mm_tzcnt_64(unsigned long long __X) /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// @@ -223,9 +234,8 @@ __andn_u32(unsigned int __X, unsigned int __Y) /// \returns An unsigned integer whose least significant bits contain the /// extracted bits. /// \see _bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__bextr_u32(unsigned int __X, unsigned int __Y) -{ +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR +__bextr_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bextr_u32(__X, __Y); } @@ -248,10 +258,9 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// \returns An unsigned integer whose least significant bits contain the /// extracted bits. 
/// \see __bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR +_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) { + return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } /* Intel-specified, single-leading-underscore version of BEXTR2 */ @@ -271,7 +280,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) /// \returns An unsigned integer whose least significant bits contain the /// extracted bits. /// \see __bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR _bextr2_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bextr_u32(__X, __Y); } @@ -444,9 +453,8 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) /// \returns An unsigned 64-bit integer whose least significant bits contain the /// extracted bits. /// \see _bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__bextr_u64(unsigned long long __X, unsigned long long __Y) -{ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR +__bextr_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_bextr_u64(__X, __Y); } @@ -469,10 +477,9 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) /// \returns An unsigned 64-bit integer whose least significant bits contain the /// extracted bits. 
/// \see __bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR +_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { + return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } /* Intel-specified, single-leading-underscore version of BEXTR2 */ @@ -492,7 +499,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) /// \returns An unsigned 64-bit integer whose least significant bits contain the /// extracted bits. /// \see __bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR _bextr2_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_bextr_u64(__X, __Y); } diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index b139f9eb7d999..810a16d75f022 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -378,6 +378,22 @@ template constexpr float asfloat(T F) { return __detail::bit_cast(F); } +//===----------------------------------------------------------------------===// +// asint builtins +//===----------------------------------------------------------------------===// + +/// \fn int asint(T Val) +/// \brief Interprets the bit pattern of x as an integer. +/// \param Val The input value. 
+
+template constexpr vector asint(vector V) {
+  return __detail::bit_cast(V);
+}
+
+template constexpr int asint(T F) {
+  return __detail::bit_cast(F);
+}
+
 //===----------------------------------------------------------------------===//
 // asin builtins
 //===----------------------------------------------------------------------===//
@@ -913,6 +929,40 @@ float3 floor(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_floor)
 float4 floor(float4);
 
+//===----------------------------------------------------------------------===//
+// fmod builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T fmod(T x, T y)
+/// \brief Returns the floating-point remainder of x divided by y.
+/// \param x [in] The dividend.
+/// \param y [in] The divisor.
+///
+/// Return the floating-point remainder of the x parameter divided by the y
+/// parameter.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+half fmod(half, half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+half2 fmod(half2, half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+half3 fmod(half3, half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+half4 fmod(half4, half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+float fmod(float, float);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+float2 fmod(float2, float2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+float3 fmod(float3, float3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod)
+float4 fmod(float4, float4);
+
 //===----------------------------------------------------------------------===//
 // frac builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/lzcntintrin.h b/clang/lib/Headers/lzcntintrin.h
index f4ddce9d0e683..db00474ffd394 100644
---
a/clang/lib/Headers/lzcntintrin.h +++ b/clang/lib/Headers/lzcntintrin.h @@ -15,7 +15,13 @@ #define __LZCNTINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#endif #ifndef _MSC_VER /// Counts the number of leading zero bits in the operand. diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index ca521dc0bcd26..2913d16fca482 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -117,33 +117,30 @@ struct BuiltinTypeDeclBuilder { if (Record->isCompleteDefinition()) return *this; + ASTContext &Ctx = S.getASTContext(); TypeSourceInfo *ElementTypeInfo = nullptr; - QualType Ty = Record->getASTContext().VoidPtrTy; + QualType ElemTy = Ctx.Char8Ty; if (Template) { if (const auto *TTD = dyn_cast( Template->getTemplateParameters()->getParam(0))) { - Ty = Record->getASTContext().getPointerType( - QualType(TTD->getTypeForDecl(), 0)); - QualType ElemType = QualType(TTD->getTypeForDecl(), 0); - ElementTypeInfo = S.getASTContext().getTrivialTypeSourceInfo( - ElemType, SourceLocation()); + ElemTy = QualType(TTD->getTypeForDecl(), 0); } } + ElementTypeInfo = Ctx.getTrivialTypeSourceInfo(ElemTy, SourceLocation()); // add handle member with resource type attributes QualType AttributedResTy = QualType(); SmallVector Attrs = { - HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC), - IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr, - RawBuffer ? HLSLRawBufferAttr::CreateImplicit(Record->getASTContext()) - : nullptr, - ElementTypeInfo ? 
HLSLContainedTypeAttr::CreateImplicit( - Record->getASTContext(), ElementTypeInfo) - : nullptr}; - Attr *ResourceAttr = - HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK); - if (CreateHLSLAttributedResourceType(S, Ty, Attrs, AttributedResTy)) + HLSLResourceClassAttr::CreateImplicit(Ctx, RC), + IsROV ? HLSLROVAttr::CreateImplicit(Ctx) : nullptr, + RawBuffer ? HLSLRawBufferAttr::CreateImplicit(Ctx) : nullptr, + ElementTypeInfo + ? HLSLContainedTypeAttr::CreateImplicit(Ctx, ElementTypeInfo) + : nullptr}; + Attr *ResourceAttr = HLSLResourceAttr::CreateImplicit(Ctx, RK); + if (CreateHLSLAttributedResourceType(S, Ctx.HLSLResourceTy, Attrs, + AttributedResTy)) addMemberVariable("h", AttributedResTy, {ResourceAttr}, Access); return *this; } @@ -214,14 +211,14 @@ struct BuiltinTypeDeclBuilder { assert(Fields.count("h") > 0 && "Subscript operator must be added after the handle."); - FieldDecl *Handle = Fields["h"]; ASTContext &AST = Record->getASTContext(); - - assert(Handle->getType().getCanonicalType() != AST.VoidPtrTy && - "Not yet supported for void pointer handles."); - - QualType ElemTy = - QualType(Handle->getType()->getPointeeOrArrayElementType(), 0); + QualType ElemTy = AST.Char8Ty; + if (Template) { + if (const auto *TTD = dyn_cast( + Template->getTemplateParameters()->getParam(0))) { + ElemTy = QualType(TTD->getTypeForDecl(), 0); + } + } QualType ReturnTy = ElemTy; FunctionProtoType::ExtProtoInfo ExtInfo; @@ -257,22 +254,23 @@ struct BuiltinTypeDeclBuilder { auto FnProtoLoc = TSInfo->getTypeLoc().getAs(); FnProtoLoc.setParam(0, IdxParam); + // FIXME: Placeholder to make sure we return the correct type - create + // field of element_type and return reference to it. This field will go + // away once indexing into resources is properly implemented in + // llvm/llvm-project#95956. 
+ if (Fields.count("e") == 0) { + addMemberVariable("e", ElemTy, {}); + } + FieldDecl *ElemFieldDecl = Fields["e"]; + auto *This = CXXThisExpr::Create(AST, SourceLocation(), MethodDecl->getFunctionObjectParameterType(), true); - auto *HandleAccess = MemberExpr::CreateImplicit( - AST, This, false, Handle, Handle->getType(), VK_LValue, OK_Ordinary); - - auto *IndexExpr = DeclRefExpr::Create( - AST, NestedNameSpecifierLoc(), SourceLocation(), IdxParam, false, - DeclarationNameInfo(IdxParam->getDeclName(), SourceLocation()), - AST.UnsignedIntTy, VK_PRValue); - - auto *Array = - new (AST) ArraySubscriptExpr(HandleAccess, IndexExpr, ElemTy, VK_LValue, - OK_Ordinary, SourceLocation()); - - auto *Return = ReturnStmt::Create(AST, SourceLocation(), Array, nullptr); + Expr *ElemField = MemberExpr::CreateImplicit( + AST, This, false, ElemFieldDecl, ElemFieldDecl->getType(), VK_LValue, + OK_Ordinary); + auto *Return = + ReturnStmt::Create(AST, SourceLocation(), ElemField, nullptr); MethodDecl->setBody(CompoundStmt::Create(AST, {Return}, FPOptionsOverride(), SourceLocation(), diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index af1dc21594da8..8634b54b0535d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2755,6 +2755,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, // These builtins restrict the element type to floating point // types only, and take in two arguments. 
+ case Builtin::BI__builtin_elementwise_fmod: case Builtin::BI__builtin_elementwise_pow: { if (BuiltinElementwiseMath(TheCall)) return ExprError(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 66df9c969256a..e072fb65b8132 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -10133,6 +10133,10 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, const VectorType *RHSVecType = RHSType->getAs(); assert(LHSVecType || RHSVecType); + if (getLangOpts().HLSL) + return HLSL().handleVectorBinOpConversion(LHS, RHS, LHSType, RHSType, + IsCompAssign); + // AltiVec-style "vector bool op vector bool" combinations are allowed // for some operators but not others. if (!AllowBothBool && LHSVecType && @@ -12863,7 +12867,8 @@ static void diagnoseXorMisusedAsPow(Sema &S, const ExprResult &XorLHS, } QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS, - SourceLocation Loc) { + SourceLocation Loc, + BinaryOperatorKind Opc) { // Ensure that either both operands are of the same vector type, or // one operand is of a vector type and the other is of its element type. QualType vType = CheckVectorOperands(LHS, RHS, Loc, false, @@ -12883,6 +12888,15 @@ QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS, if (!getLangOpts().CPlusPlus && !(isa(vType->getAs()))) return InvalidLogicalVectorOperands(Loc, LHS, RHS); + // Beginning with HLSL 2021, HLSL disallows logical operators on vector + // operands and instead requires the use of the `and`, `or`, `any`, `all`, and + // `select` functions. 
+ if (getLangOpts().HLSL && + getLangOpts().getHLSLVersion() >= LangOptionsBase::HLSL_2021) { + (void)InvalidOperands(Loc, LHS, RHS); + HLSL().emitLogicalOperatorFixIt(LHS.get(), RHS.get(), Opc); + return QualType(); + } return GetSignedVectorType(LHS.get()->getType()); } @@ -13054,7 +13068,7 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, // Check vector operands differently. if (LHS.get()->getType()->isVectorType() || RHS.get()->getType()->isVectorType()) - return CheckVectorLogicalOperands(LHS, RHS, Loc); + return CheckVectorLogicalOperands(LHS, RHS, Loc, Opc); bool EnumConstantInBoolContext = false; for (const ExprResult &HS : {LHS, RHS}) { diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 1d8ccdda45573..43cc6c81ae5cb 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -401,6 +401,194 @@ void SemaHLSL::DiagnoseAttrStageMismatch( << (AllowedStages.size() != 1) << join(StageStrings, ", "); } +template +static void castVector(Sema &S, ExprResult &E, QualType &Ty, unsigned Sz) { + if (const auto *VTy = Ty->getAs()) + Ty = VTy->getElementType(); + Ty = S.getASTContext().getExtVectorType(Ty, Sz); + E = S.ImpCastExprToType(E.get(), Ty, Kind); +} + +template +static QualType castElement(Sema &S, ExprResult &E, QualType Ty) { + E = S.ImpCastExprToType(E.get(), Ty, Kind); + return Ty; +} + +static QualType handleFloatVectorBinOpConversion( + Sema &SemaRef, ExprResult &LHS, ExprResult &RHS, QualType LHSType, + QualType RHSType, QualType LElTy, QualType RElTy, bool IsCompAssign) { + bool LHSFloat = LElTy->isRealFloatingType(); + bool RHSFloat = RElTy->isRealFloatingType(); + + if (LHSFloat && RHSFloat) { + if (IsCompAssign || + SemaRef.getASTContext().getFloatingTypeOrder(LElTy, RElTy) > 0) + return castElement(SemaRef, RHS, LHSType); + + return castElement(SemaRef, LHS, RHSType); + } + + if (LHSFloat) + return castElement(SemaRef, RHS, LHSType); + + assert(RHSFloat); + if (IsCompAssign) + 
return castElement(SemaRef, RHS, LHSType); + + return castElement(SemaRef, LHS, RHSType); +} + +static QualType handleIntegerVectorBinOpConversion( + Sema &SemaRef, ExprResult &LHS, ExprResult &RHS, QualType LHSType, + QualType RHSType, QualType LElTy, QualType RElTy, bool IsCompAssign) { + + int IntOrder = SemaRef.Context.getIntegerTypeOrder(LElTy, RElTy); + bool LHSSigned = LElTy->hasSignedIntegerRepresentation(); + bool RHSSigned = RElTy->hasSignedIntegerRepresentation(); + auto &Ctx = SemaRef.getASTContext(); + + // If both types have the same signedness, use the higher ranked type. + if (LHSSigned == RHSSigned) { + if (IsCompAssign || IntOrder >= 0) + return castElement(SemaRef, RHS, LHSType); + + return castElement(SemaRef, LHS, RHSType); + } + + // If the unsigned type has greater than or equal rank of the signed type, use + // the unsigned type. + if (IntOrder != (LHSSigned ? 1 : -1)) { + if (IsCompAssign || RHSSigned) + return castElement(SemaRef, RHS, LHSType); + return castElement(SemaRef, LHS, RHSType); + } + + // At this point the signed type has higher rank than the unsigned type, which + // means it will be the same size or bigger. If the signed type is bigger, it + // can represent all the values of the unsigned type, so select it. + if (Ctx.getIntWidth(LElTy) != Ctx.getIntWidth(RElTy)) { + if (IsCompAssign || LHSSigned) + return castElement(SemaRef, RHS, LHSType); + return castElement(SemaRef, LHS, RHSType); + } + + // This is a bit of an odd duck case in HLSL. It shouldn't happen, but can due + // to C/C++ leaking through. The place this happens today is long vs long + // long. When arguments are vector and vector, + // the long long has higher rank than long even though they are the same size. + + // If this is a compound assignment cast the right hand side to the left hand + // side's type. + if (IsCompAssign) + return castElement(SemaRef, RHS, LHSType); + + // If this isn't a compound assignment we convert to unsigned long long. 
+ QualType ElTy = Ctx.getCorrespondingUnsignedType(LHSSigned ? LElTy : RElTy); + QualType NewTy = Ctx.getExtVectorType( + ElTy, RHSType->castAs()->getNumElements()); + (void)castElement(SemaRef, RHS, NewTy); + + return castElement(SemaRef, LHS, NewTy); +} + +static CastKind getScalarCastKind(ASTContext &Ctx, QualType DestTy, + QualType SrcTy) { + if (DestTy->isRealFloatingType() && SrcTy->isRealFloatingType()) + return CK_FloatingCast; + if (DestTy->isIntegralType(Ctx) && SrcTy->isIntegralType(Ctx)) + return CK_IntegralCast; + if (DestTy->isRealFloatingType()) + return CK_IntegralToFloating; + assert(SrcTy->isRealFloatingType() && DestTy->isIntegralType(Ctx)); + return CK_FloatingToIntegral; +} + +QualType SemaHLSL::handleVectorBinOpConversion(ExprResult &LHS, ExprResult &RHS, + QualType LHSType, + QualType RHSType, + bool IsCompAssign) { + const auto *LVecTy = LHSType->getAs(); + const auto *RVecTy = RHSType->getAs(); + auto &Ctx = getASTContext(); + + // If the LHS is not a vector and this is a compound assignment, we truncate + // the argument to a scalar then convert it to the LHS's type. + if (!LVecTy && IsCompAssign) { + QualType RElTy = RHSType->castAs()->getElementType(); + RHS = SemaRef.ImpCastExprToType(RHS.get(), RElTy, CK_HLSLVectorTruncation); + RHSType = RHS.get()->getType(); + if (Ctx.hasSameUnqualifiedType(LHSType, RHSType)) + return LHSType; + RHS = SemaRef.ImpCastExprToType(RHS.get(), LHSType, + getScalarCastKind(Ctx, LHSType, RHSType)); + return LHSType; + } + + unsigned EndSz = std::numeric_limits::max(); + unsigned LSz = 0; + if (LVecTy) + LSz = EndSz = LVecTy->getNumElements(); + if (RVecTy) + EndSz = std::min(RVecTy->getNumElements(), EndSz); + assert(EndSz != std::numeric_limits::max() && + "one of the above should have had a value"); + + // In a compound assignment, the left operand does not change type, the right + // operand is converted to the type of the left operand. 
+ if (IsCompAssign && LSz != EndSz) { + Diag(LHS.get()->getBeginLoc(), + diag::err_hlsl_vector_compound_assignment_truncation) + << LHSType << RHSType; + return QualType(); + } + + if (RVecTy && RVecTy->getNumElements() > EndSz) + castVector(SemaRef, RHS, RHSType, EndSz); + if (!IsCompAssign && LVecTy && LVecTy->getNumElements() > EndSz) + castVector(SemaRef, LHS, LHSType, EndSz); + + if (!RVecTy) + castVector(SemaRef, RHS, RHSType, EndSz); + if (!IsCompAssign && !LVecTy) + castVector(SemaRef, LHS, LHSType, EndSz); + + // If we're at the same type after resizing we can stop here. + if (Ctx.hasSameUnqualifiedType(LHSType, RHSType)) + return Ctx.getCommonSugaredType(LHSType, RHSType); + + QualType LElTy = LHSType->castAs()->getElementType(); + QualType RElTy = RHSType->castAs()->getElementType(); + + // Handle conversion for floating point vectors. + if (LElTy->isRealFloatingType() || RElTy->isRealFloatingType()) + return handleFloatVectorBinOpConversion(SemaRef, LHS, RHS, LHSType, RHSType, + LElTy, RElTy, IsCompAssign); + + assert(LElTy->isIntegralType(Ctx) && RElTy->isIntegralType(Ctx) && + "HLSL Vectors can only contain integer or floating point types"); + return handleIntegerVectorBinOpConversion(SemaRef, LHS, RHS, LHSType, RHSType, + LElTy, RElTy, IsCompAssign); +} + +void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, + BinaryOperatorKind Opc) { + assert((Opc == BO_LOr || Opc == BO_LAnd) && + "Called with non-logical operator"); + llvm::SmallVector Buff; + llvm::raw_svector_ostream OS(Buff); + PrintingPolicy PP(SemaRef.getLangOpts()); + StringRef NewFnName = Opc == BO_LOr ? 
"or" : "and"; + OS << NewFnName << "("; + LHS->printPretty(OS, nullptr, PP); + OS << ", "; + RHS->printPretty(OS, nullptr, PP); + OS << ")"; + SourceRange FullRange = SourceRange(LHS->getBeginLoc(), RHS->getEndLoc()); + SemaRef.Diag(LHS->getBeginLoc(), diag::note_function_suggestion) + << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str()); +} + void SemaHLSL::handleNumThreadsAttr(Decl *D, const ParsedAttr &AL) { llvm::VersionTuple SMVersion = getASTContext().getTargetInfo().getTriple().getOSVersion(); @@ -1777,6 +1965,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { case Builtin::BI__builtin_elementwise_exp: case Builtin::BI__builtin_elementwise_exp2: case Builtin::BI__builtin_elementwise_floor: + case Builtin::BI__builtin_elementwise_fmod: case Builtin::BI__builtin_elementwise_log: case Builtin::BI__builtin_elementwise_log2: case Builtin::BI__builtin_elementwise_log10: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 9afb8cea26fe7..8615da4b044a8 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -9195,7 +9195,7 @@ static bool checkOpenMPIterationSpace( SemaRef.Diag(CollapseLoopCountExpr->getExprLoc(), diag::note_omp_collapse_ordered_expr) << 0 << CollapseLoopCountExpr->getSourceRange(); - else + else if (OrderedLoopCountExpr) SemaRef.Diag(OrderedLoopCountExpr->getExprLoc(), diag::note_omp_collapse_ordered_expr) << 1 << OrderedLoopCountExpr->getSourceRange(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index b9222a1b33fd7..b71684569609a 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -1785,6 +1785,18 @@ void ASTDeclWriter::VisitClassTemplateSpecializationDecl( if (ArgsWritten) Record.AddASTTemplateArgumentListInfo(ArgsWritten); + // Mention the implicitly generated C++ deduction guide to make sure the + // deduction guide will be rewritten as 
expected. + // + // FIXME: Would it be more efficient to add a callback register function + // in sema to register the deduction guide? + if (Writer.isWritingStdCXXNamedModules()) { + auto Name = Context.DeclarationNames.getCXXDeductionGuideName( + D->getSpecializedTemplate()); + for (auto *DG : D->getDeclContext()->noload_lookup(Name)) + Writer.GetDeclRef(DG->getCanonicalDecl()); + } + Code = serialization::DECL_CLASS_TEMPLATE_SPECIALIZATION; } diff --git a/clang/test/AST/ByteCode/fixed-point.cpp b/clang/test/AST/ByteCode/fixed-point.cpp index 42ebdf64e1a9f..4bf80ba7c58f0 100644 --- a/clang/test/AST/ByteCode/fixed-point.cpp +++ b/clang/test/AST/ByteCode/fixed-point.cpp @@ -6,6 +6,72 @@ static_assert(!((bool)0.0k)); static_assert((bool)0.0k); // both-error {{static assertion failed}} static_assert(1.0k == 1.0k); +static_assert(1.0k == 1); static_assert(1.0k != 1.0k); // both-error {{failed due to requirement '1.0k != 1.0k'}} +static_assert(1.0k != 1); // both-error {{failed due to requirement '1.0k != 1'}} static_assert(-12.0k == -(-(-12.0k))); +constexpr _Accum acc = (0.5r, 6.9k); + +/// Zero-init. 
+constexpr _Accum A{}; +static_assert(A == 0.0k); +static_assert(A == 0); + +namespace IntToFixedPointCast { + constexpr _Accum B = 13; + static_assert(B == 13.0k); + static_assert(B == 13); + + constexpr _Fract sf = -1; + static_assert(sf == -1.0k); + static_assert(sf == -1); +} + +namespace FixedPointToIntCasts { + constexpr _Accum A = -13.0k; + constexpr int I = A; + static_assert(I == -13); +} + +namespace FloatToFixedPointCast { + constexpr _Fract sf = 1.0; // both-error {{must be initialized by a constant expression}} \ + // both-note {{outside the range of representable values of type 'const _Fract'}} + + constexpr _Fract sf2 = 0.5; + static_assert(sf2 == 0.5); + constexpr float sf2f = sf2; + static_assert(sf2f == 0.5); +} + +namespace BinOps { + constexpr _Accum A = 13; + static_assert(A + 1 == 14.0k); + static_assert(1 + A == 14.0k); + static_assert((A + A) == 26); + + static_assert(A + 100000 == 14.0k); // both-error {{is not an integral constant expression}} \ + // both-note {{is outside the range of representable values}} + + static_assert((A - A) == 0); + constexpr short _Accum mul_ovf1 = 255.0hk * 4.5hk; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 123.5 is outside the range of representable values of type 'short _Accum'}} + constexpr short _Accum div_ovf1 = 255.0hk / 0.5hk; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value -2.0 is outside the range of representable values of type 'short _Accum'}} + +} + +namespace FixedPointCasts { + constexpr _Fract B = 0.3; + constexpr _Accum A = B; + constexpr _Fract C = A; +} + +namespace Cmp { + constexpr _Accum A = 13.0k; + constexpr _Accum B = 14.0k; + static_assert(B > A); + static_assert(B >= A); + static_assert(A < B); + static_assert(A <= B); +} diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index a95be63da5dc1..55c0dfa2eaa53 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ 
b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -29,36 +29,26 @@ RWBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class RWBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: implicit h 'element_type * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME:':'element_type *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const RWBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: 
[[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'RWBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class RWBuffer definition @@ -66,8 +56,8 @@ RWBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'float * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'float *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index a186779870c26..b31db8ce59f22 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -30,39 +30,27 @@ StructuredBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' 
+// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 
'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition @@ -70,9 +58,9 @@ StructuredBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'float * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'float *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/Analysis/casts.c b/clang/test/Analysis/casts.c index 462a9865f1564..30cd74be564fd 100644 --- a/clang/test/Analysis/casts.c +++ b/clang/test/Analysis/casts.c @@ -129,7 +129,7 @@ void locAsIntegerCasts(void *p) { } void multiDimensionalArrayPointerCasts(void) { - static int x[10][10]; // expected-note2{{Array at the right-hand side of subtraction}} + static int x[10][10]; int *y1 = &(x[3][5]); char *z = ((char *) y1) + 2; int *y2 = (int *)(z - 2); @@ -138,9 +138,7 @@ void multiDimensionalArrayPointerCasts(void) { clang_analyzer_eval(y1 == y2); // expected-warning{{TRUE}} // FIXME: should be FALSE (i.e. equal pointers). - // FIXME: pointer subtraction warning might be incorrect clang_analyzer_eval(y1 - y2); // expected-warning{{UNKNOWN}} - // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}} // FIXME: should be TRUE (i.e. same symbol). 
clang_analyzer_eval(*y1 == *y2); // expected-warning{{UNKNOWN}} @@ -149,9 +147,7 @@ void multiDimensionalArrayPointerCasts(void) { clang_analyzer_eval(y1 == y3); // expected-warning{{TRUE}} // FIXME: should be FALSE (i.e. equal pointers). - // FIXME: pointer subtraction warning might be incorrect clang_analyzer_eval(y1 - y3); // expected-warning{{UNKNOWN}} - // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}} // FIXME: should be TRUE (i.e. same symbol). clang_analyzer_eval(*y1 == *y3); // expected-warning{{UNKNOWN}} diff --git a/clang/test/Analysis/pointer-sub-notes.c b/clang/test/Analysis/pointer-sub-notes.c index 59681b4e7555a..7f94d6544d0f8 100644 --- a/clang/test/Analysis/pointer-sub-notes.c +++ b/clang/test/Analysis/pointer-sub-notes.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.PointerSub -analyzer-output=text -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text -verify %s void different_1() { int a[3]; // expected-note{{Array at the left-hand side of subtraction}} diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index cf9eac1abc2dc..1c9d676ebb8f2 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.PointerSub -analyzer-output=text-minimal -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text-minimal -verify %s void f1(void) { int x, y, z[10]; diff --git a/clang/test/CodeGen/X86/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c index 81405429b9b6c..6c0b2c440ea08 100644 --- a/clang/test/CodeGen/X86/bmi-builtins.c +++ b/clang/test/CodeGen/X86/bmi-builtins.c @@ -1,5 +1,7 @@ -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT -// 
RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT +// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT +// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT +// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT #include @@ -232,3 +234,64 @@ unsigned long long test_blsr_u64(unsigned long long __X) { #endif #endif // !defined(TEST_TZCNT) + +// Test constexpr handling. +#if defined(__cplusplus) && (__cplusplus >= 201103L) +char bextr32_0[__bextr_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1]; +char bextr32_1[__bextr_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1]; +char bextr32_2[__bextr_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1]; + +char bextr32_3[_bextr2_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1]; +char bextr32_4[_bextr2_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1]; +char bextr32_5[_bextr2_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1]; + +char bextr32_6[_bextr_u32(0x00000000, 0x00000000, 0x00000000) == 0x00000000 ? 1 : -1]; +char bextr32_7[_bextr_u32(0x000003F0, 0xFFFFFF04, 0xFFFFFF10) == 0x0000003F ? 1 : -1]; +char bextr32_8[_bextr_u32(0x000003F0, 0xFFFFFF08, 0xFFFFFF30) == 0x00000003 ? 1 : -1]; + +char tzcntu16_0[__tzcnt_u16(0x0000) == 16 ? 
1 : -1]; +char tzcntu16_1[__tzcnt_u16(0x0001) == 0 ? 1 : -1]; +char tzcntu16_2[__tzcnt_u16(0x0010) == 4 ? 1 : -1]; + +char tzcnt2u16_0[_tzcnt_u16(0x0000) == 16 ? 1 : -1]; +char tzcnt2u16_1[_tzcnt_u16(0x0001) == 0 ? 1 : -1]; +char tzcnt2u16_2[_tzcnt_u16(0x0010) == 4 ? 1 : -1]; + +char tzcntu32_0[__tzcnt_u32(0x00000000) == 32 ? 1 : -1]; +char tzcntu32_1[__tzcnt_u32(0x00000001) == 0 ? 1 : -1]; +char tzcntu32_2[__tzcnt_u32(0x00000080) == 7 ? 1 : -1]; + +char tzcnt2u32_0[_tzcnt_u32(0x00000000) == 32 ? 1 : -1]; +char tzcnt2u32_1[_tzcnt_u32(0x00000001) == 0 ? 1 : -1]; +char tzcnt2u32_2[_tzcnt_u32(0x00000080) == 7 ? 1 : -1]; + +char tzcnt3u32_0[_mm_tzcnt_32(0x00000000) == 32 ? 1 : -1]; +char tzcnt3u32_1[_mm_tzcnt_32(0x00000001) == 0 ? 1 : -1]; +char tzcnt3u32_2[_mm_tzcnt_32(0x00000080) == 7 ? 1 : -1]; + +#ifdef __x86_64__ +char bextr64_0[__bextr_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1]; +char bextr64_1[__bextr_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1]; +char bextr64_2[__bextr_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 1 : -1]; + +char bextr64_3[_bextr2_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1]; +char bextr64_4[_bextr2_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1]; +char bextr64_5[_bextr2_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 1 : -1]; + +char bextr64_6[_bextr_u64(0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1]; +char bextr64_7[_bextr_u64(0xF000000000000001ULL, 0x0000000000000001ULL, 0x0000000000000040ULL) == 0x7800000000000000ULL ? 1 : -1]; +char bextr64_8[_bextr_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF10ULL) == 0x0000000000000000ULL ? 1 : -1]; + +char tzcntu64_0[__tzcnt_u64(0x0000000000000000ULL) == 64 ? 
1 : -1]; +char tzcntu64_1[__tzcnt_u64(0x0000000000000001ULL) == 0 ? 1 : -1]; +char tzcntu64_2[__tzcnt_u64(0x0000000800000000ULL) == 35 ? 1 : -1]; + +char tzcnt2u64_0[_tzcnt_u64(0x0000000000000000ULL) == 64 ? 1 : -1]; +char tzcnt2u64_1[_tzcnt_u64(0x0000000000000001ULL) == 0 ? 1 : -1]; +char tzcnt2u64_2[_tzcnt_u64(0x0000000800000000ULL) == 35 ? 1 : -1]; + +char tzcnt3u64_0[_mm_tzcnt_64(0x0000000000000000ULL) == 64 ? 1 : -1]; +char tzcnt3u64_1[_mm_tzcnt_64(0x0000000000000001ULL) == 0 ? 1 : -1]; +char tzcnt3u64_2[_mm_tzcnt_64(0x0000000800000000ULL) == 35 ? 1 : -1]; +#endif +#endif \ No newline at end of file diff --git a/clang/test/CodeGen/X86/lzcnt-builtins.c b/clang/test/CodeGen/X86/lzcnt-builtins.c index 9255207ffaef4..18ced89fc79b1 100644 --- a/clang/test/CodeGen/X86/lzcnt-builtins.c +++ b/clang/test/CodeGen/X86/lzcnt-builtins.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s #include @@ -32,3 +33,27 @@ unsigned long long test__lzcnt_u64(unsigned long long __X) // CHECK: @llvm.ctlz.i64(i64 %{{.*}}, i1 false) return _lzcnt_u64(__X); } + + +// Test constexpr handling. +#if defined(__cplusplus) && (__cplusplus >= 201103L) +char lzcnt16_0[__lzcnt16(0x0000) == 16 ? 1 : -1]; +char lzcnt16_1[__lzcnt16(0x8000) == 0 ? 1 : -1]; +char lzcnt16_2[__lzcnt16(0x0010) == 11 ? 1 : -1]; + +char lzcnt32_0[__lzcnt32(0x00000000) == 32 ? 1 : -1]; +char lzcnt32_1[__lzcnt32(0x80000000) == 0 ? 1 : -1]; +char lzcnt32_2[__lzcnt32(0x00000010) == 27 ? 1 : -1]; + +char lzcnt64_0[__lzcnt64(0x0000000000000000ULL) == 64 ? 1 : -1]; +char lzcnt64_1[__lzcnt64(0x8000000000000000ULL) == 0 ? 1 : -1]; +char lzcnt64_2[__lzcnt64(0x0000000100000000ULL) == 31 ? 
1 : -1]; + +char lzcntu32_0[_lzcnt_u32(0x00000000) == 32 ? 1 : -1]; +char lzcntu32_1[_lzcnt_u32(0x80000000) == 0 ? 1 : -1]; +char lzcntu32_2[_lzcnt_u32(0x00000010) == 27 ? 1 : -1]; + +char lzcntu64_0[_lzcnt_u64(0x0000000000000000ULL) == 64 ? 1 : -1]; +char lzcntu64_1[_lzcnt_u64(0x8000000000000000ULL) == 0 ? 1 : -1]; +char lzcntu64_2[_lzcnt_u64(0x0000000100000000ULL) == 31 ? 1 : -1]; +#endif \ No newline at end of file diff --git a/clang/test/CodeGen/X86/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c index ad4247be9a442..ef5e1657521f9 100644 --- a/clang/test/CodeGen/X86/tbm-builtins.c +++ b/clang/test/CodeGen/X86/tbm-builtins.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s #include @@ -177,3 +178,16 @@ unsigned long long test__tzmsk_u64(unsigned long long a) { return __tzmsk_u64(a); } #endif + +// Test constexpr handling. +#if defined(__cplusplus) && (__cplusplus >= 201103L) +char bextri32_0[__bextri_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1]; +char bextri32_1[__bextri_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1]; +char bextri32_2[__bextri_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1]; + +#ifdef __x86_64__ +char bextri64_0[__bextri_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1]; +char bextri64_1[__bextri_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1]; +char bextri64_2[__bextri_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 
1 : -1]; +#endif +#endif diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c new file mode 100644 index 0000000000000..a2f87aed3187c --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c @@ -0,0 +1,42 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +// CHECK-LABEL: define dso_local { , , , } @test_luti4_zt_u8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CHECK-NEXT: ret { , , , } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { , , , } @_Z19test_luti4_zt_u8_x411svuint8x2_t( +// CPP-CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] +// +svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_u8_x4(0, op); +} 
+ +// CHECK-LABEL: define dso_local { , , , } @test_luti4_zt_s8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CHECK-NEXT: ret { , , , } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { , , , } @_Z19test_luti4_zt_s8_x411svuint8x2_t( +// CPP-CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] +// +svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_s8_x4(0, op); +} diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index 7e094a52653ef..0e53d3e141b01 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -607,6 +607,26 @@ void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, si = __builtin_elementwise_popcount(si); } +void test_builtin_elementwise_fmod(float f1, float f2, double d1, double d2, + float4 vf1, float4 vf2) { + + // CHECK-LABEL: define void @test_builtin_elementwise_fmod( + // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4 + // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4 + // CHECK-NEXT: frem float [[F1]], [[F2]] + f2 = __builtin_elementwise_fmod(f1, f2); + + // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8 + // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8 + // CHECK-NEXT: frem double [[D1]], [[D2]] + d2 = __builtin_elementwise_fmod(d1, d2); + + // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16 + // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16 + // CHECK-NEXT: frem <4 x float> [[VF1]], [[VF2]] + vf2 = 
__builtin_elementwise_fmod(vf1, vf2); +} + void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2, float4 vf1, float4 vf2) { diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index bfa72e8bd6945..0d0e3ecdb90c9 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -114,6 +114,7 @@ __device__ int read_ids() { // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid() // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid() // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid() +// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize() int a = __nvvm_read_ptx_sreg_laneid(); int b = __nvvm_read_ptx_sreg_warpid(); @@ -121,8 +122,9 @@ __device__ int read_ids() { int d = __nvvm_read_ptx_sreg_smid(); int e = __nvvm_read_ptx_sreg_nsmid(); int f = __nvvm_read_ptx_sreg_gridid(); + int g = __nvvm_read_ptx_sreg_warpsize(); - return a + b + c + d + e + f; + return a + b + c + d + e + f + g; } diff --git a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp index 55ba17a195580..651f5bfc94c6c 100644 --- a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp +++ b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp @@ -306,3 +306,14 @@ float4 strict_elementwise_fma(float4 a, float4 b, float4 c) { float4 strict_elementwise_pow(float4 a, float4 b) { return __builtin_elementwise_pow(a, b); } + +// CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_fmodDv4_fS_ +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.frem.v4f32(<4 x float> [[A]], <4 x float> [[B]], +// CHECK-SAME: metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float4 strict_elementwise_fmod(float4 a, float4 b) { + return 
__builtin_elementwise_fmod(a, b); +} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 41cbd5a0219d5..8548aa00cfe87 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -28,11 +28,11 @@ // RUN: %clang_cc1 -triple sparc-sun-solaris -emit-llvm -o - %s | \ // RUN: FileCheck %s --check-prefix=SPARC-V8 -// SPARC-V8: target datalayout = "E-m:e-p:32:32-i64:64-f128:64-n32-S64" +// SPARC-V8: target datalayout = "E-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64" // RUN: %clang_cc1 -triple sparcv9-sun-solaris -emit-llvm -o - %s | \ // RUN: FileCheck %s --check-prefix=SPARC-V9 -// SPARC-V9: target datalayout = "E-m:e-i64:64-n32:64-S128" +// SPARC-V9: target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" // RUN: %clang_cc1 -triple mipsel-linux-gnu -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=MIPS-32EL diff --git a/clang/test/CodeGenCUDA/address-spaces.cu b/clang/test/CodeGenCUDA/address-spaces.cu index 0608c9cabd048..66903c81b9333 100644 --- a/clang/test/CodeGenCUDA/address-spaces.cu +++ b/clang/test/CodeGenCUDA/address-spaces.cu @@ -9,7 +9,7 @@ // CHECK: @i ={{.*}} addrspace(1) externally_initialized global __device__ int i; -// CHECK: @j ={{.*}} addrspace(4) externally_initialized global +// CHECK: @j ={{.*}} addrspace(4) externally_initialized constant __constant__ int j; // CHECK: @k ={{.*}} addrspace(3) global diff --git a/clang/test/CodeGenCUDA/amdgpu-visibility.cu b/clang/test/CodeGenCUDA/amdgpu-visibility.cu index d7dbab112a68c..ef74d932ee8c8 100644 --- a/clang/test/CodeGenCUDA/amdgpu-visibility.cu +++ b/clang/test/CodeGenCUDA/amdgpu-visibility.cu @@ -4,11 +4,11 @@ #include "Inputs/cuda.h" -// CHECK-DEFAULT: @c ={{.*}} addrspace(4) externally_initialized global +// CHECK-DEFAULT: @c ={{.*}} addrspace(4) externally_initialized constant // CHECK-DEFAULT: @g ={{.*}} addrspace(1) externally_initialized global -// CHECK-PROTECTED: @c = protected addrspace(4) externally_initialized global +// 
CHECK-PROTECTED: @c = protected addrspace(4) externally_initialized constant // CHECK-PROTECTED: @g = protected addrspace(1) externally_initialized global -// CHECK-HIDDEN: @c = protected addrspace(4) externally_initialized global +// CHECK-HIDDEN: @c = protected addrspace(4) externally_initialized constant // CHECK-HIDDEN: @g = protected addrspace(1) externally_initialized global __constant__ int c; __device__ int g; diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu index 3c55e9907dd6c..d931f31d0207c 100644 --- a/clang/test/CodeGenCUDA/anon-ns.cu +++ b/clang/test/CodeGenCUDA/anon-ns.cu @@ -28,13 +28,13 @@ // HIP-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT_\.intern\.b04fd23c98500190]]( // HIP-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT_\.intern\.b04fd23c98500190]]( // HIP-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE\.static\.b04fd23c98500190]] = addrspace(1) externally_initialized global -// HIP-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE\.static\.b04fd23c98500190]] = addrspace(4) externally_initialized global +// HIP-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE\.static\.b04fd23c98500190]] = addrspace(4) externally_initialized constant // HIP-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE\.static\.b04fd23c98500190]] = addrspace(1) externally_initialized global // CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]]( -// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized global +// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant // CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global // COMMON-DAG: 
@_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global diff --git a/clang/test/CodeGenCUDA/device-var-init.cu b/clang/test/CodeGenCUDA/device-var-init.cu index 226b7e295f4b4..9d62e4126b430 100644 --- a/clang/test/CodeGenCUDA/device-var-init.cu +++ b/clang/test/CodeGenCUDA/device-var-init.cu @@ -26,7 +26,7 @@ __shared__ int s_v; // DEVICE: @s_v ={{.*}} addrspace(3) global i32 undef, // HOST: @s_v = internal global i32 undef, __constant__ int c_v; -// DEVICE: addrspace(4) externally_initialized global i32 0, +// DEVICE: addrspace(4) externally_initialized constant i32 0, // HOST: @c_v = internal global i32 undef, __device__ int d_v_i = 1; @@ -51,14 +51,14 @@ __shared__ T s_t; // DEVICE: @s_t ={{.*}} addrspace(3) global %struct.T undef, // HOST: @s_t = internal global %struct.T undef, __constant__ T c_t; -// DEVICE: @c_t ={{.*}} addrspace(4) externally_initialized global %struct.T zeroinitializer, +// DEVICE: @c_t ={{.*}} addrspace(4) externally_initialized constant %struct.T zeroinitializer, // HOST: @c_t = internal global %struct.T undef, __device__ T d_t_i = {2}; // DEVICE: @d_t_i ={{.*}} addrspace(1) externally_initialized global %struct.T { i32 2 }, // HOST: @d_t_i = internal global %struct.T undef, __constant__ T c_t_i = {2}; -// DEVICE: @c_t_i ={{.*}} addrspace(4) externally_initialized global %struct.T { i32 2 }, +// DEVICE: @c_t_i ={{.*}} addrspace(4) externally_initialized constant %struct.T { i32 2 }, // HOST: @c_t_i = internal global %struct.T undef, // empty constructor @@ -69,7 +69,7 @@ __shared__ EC s_ec; // DEVICE: @s_ec ={{.*}} addrspace(3) global %struct.EC undef, // HOST: @s_ec = internal global %struct.EC undef, __constant__ EC c_ec; -// DEVICE: @c_ec ={{.*}} addrspace(4) externally_initialized global %struct.EC zeroinitializer, +// DEVICE: @c_ec ={{.*}} addrspace(4) externally_initialized constant %struct.EC zeroinitializer, // HOST: @c_ec = internal global %struct.EC undef // empty destructor @@ -80,7 +80,7 @@ __shared__ ED s_ed; // DEVICE: @s_ed 
={{.*}} addrspace(3) global %struct.ED undef, // HOST: @s_ed = internal global %struct.ED undef, __constant__ ED c_ed; -// DEVICE: @c_ed ={{.*}} addrspace(4) externally_initialized global %struct.ED zeroinitializer, +// DEVICE: @c_ed ={{.*}} addrspace(4) externally_initialized constant %struct.ED zeroinitializer, // HOST: @c_ed = internal global %struct.ED undef, __device__ ECD d_ecd; @@ -90,7 +90,7 @@ __shared__ ECD s_ecd; // DEVICE: @s_ecd ={{.*}} addrspace(3) global %struct.ECD undef, // HOST: @s_ecd = internal global %struct.ECD undef, __constant__ ECD c_ecd; -// DEVICE: @c_ecd ={{.*}} addrspace(4) externally_initialized global %struct.ECD zeroinitializer, +// DEVICE: @c_ecd ={{.*}} addrspace(4) externally_initialized constant %struct.ECD zeroinitializer, // HOST: @c_ecd = internal global %struct.ECD undef, // empty templated constructor -- allowed with no arguments @@ -101,14 +101,14 @@ __shared__ ETC s_etc; // DEVICE: @s_etc ={{.*}} addrspace(3) global %struct.ETC undef, // HOST: @s_etc = internal global %struct.ETC undef, __constant__ ETC c_etc; -// DEVICE: @c_etc ={{.*}} addrspace(4) externally_initialized global %struct.ETC zeroinitializer, +// DEVICE: @c_etc ={{.*}} addrspace(4) externally_initialized constant %struct.ETC zeroinitializer, // HOST: @c_etc = internal global %struct.ETC undef, __device__ NCFS d_ncfs; // DEVICE: @d_ncfs ={{.*}} addrspace(1) externally_initialized global %struct.NCFS { i32 3 } // HOST: @d_ncfs = internal global %struct.NCFS undef, __constant__ NCFS c_ncfs; -// DEVICE: @c_ncfs ={{.*}} addrspace(4) externally_initialized global %struct.NCFS { i32 3 } +// DEVICE: @c_ncfs ={{.*}} addrspace(4) externally_initialized constant %struct.NCFS { i32 3 } // HOST: @c_ncfs = internal global %struct.NCFS undef, // Regular base class -- allowed @@ -119,7 +119,7 @@ __shared__ T_B_T s_t_b_t; // DEVICE: @s_t_b_t ={{.*}} addrspace(3) global %struct.T_B_T undef, // HOST: @s_t_b_t = internal global %struct.T_B_T undef, __constant__ T_B_T c_t_b_t; 
-// DEVICE: @c_t_b_t ={{.*}} addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer, +// DEVICE: @c_t_b_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_B_T zeroinitializer, // HOST: @c_t_b_t = internal global %struct.T_B_T undef, // Incapsulated object of allowed class -- allowed @@ -130,7 +130,7 @@ __shared__ T_F_T s_t_f_t; // DEVICE: @s_t_f_t ={{.*}} addrspace(3) global %struct.T_F_T undef, // HOST: @s_t_f_t = internal global %struct.T_F_T undef, __constant__ T_F_T c_t_f_t; -// DEVICE: @c_t_f_t ={{.*}} addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer, +// DEVICE: @c_t_f_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_F_T zeroinitializer, // HOST: @c_t_f_t = internal global %struct.T_F_T undef, // array of allowed objects -- allowed @@ -141,7 +141,7 @@ __shared__ T_FA_T s_t_fa_t; // DEVICE: @s_t_fa_t ={{.*}} addrspace(3) global %struct.T_FA_T undef, // HOST: @s_t_fa_t = internal global %struct.T_FA_T undef, __constant__ T_FA_T c_t_fa_t; -// DEVICE: @c_t_fa_t ={{.*}} addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer, +// DEVICE: @c_t_fa_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_FA_T zeroinitializer, // HOST: @c_t_fa_t = internal global %struct.T_FA_T undef, @@ -153,7 +153,7 @@ __shared__ EC_I_EC s_ec_i_ec; // DEVICE: @s_ec_i_ec ={{.*}} addrspace(3) global %struct.EC_I_EC undef, // HOST: @s_ec_i_ec = internal global %struct.EC_I_EC undef, __constant__ EC_I_EC c_ec_i_ec; -// DEVICE: @c_ec_i_ec ={{.*}} addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer, +// DEVICE: @c_ec_i_ec ={{.*}} addrspace(4) externally_initialized constant %struct.EC_I_EC zeroinitializer, // HOST: @c_ec_i_ec = internal global %struct.EC_I_EC undef, // DEVICE: @_ZZ2dfvE4s_ec = internal addrspace(3) global %struct.EC undef diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu index 
3c2efb57525c9..4c57323d85f9d 100644 --- a/clang/test/CodeGenCUDA/device-var-linkage.cu +++ b/clang/test/CodeGenCUDA/device-var-linkage.cu @@ -20,7 +20,7 @@ // NORDC-H-DAG: @v1 = internal global i32 undef // RDC-H-DAG: @v1 = global i32 undef __device__ int v1; -// DEV-DAG: @v2 = addrspace(4) externally_initialized global i32 0 +// DEV-DAG: @v2 = addrspace(4) externally_initialized constant i32 0 // NORDC-H-DAG: @v2 = internal global i32 undef // RDC-H-DAG: @v2 = global i32 undef __constant__ int v2; @@ -48,10 +48,10 @@ extern __managed__ int ev3; // HOST-DAG: @_ZL3sv1 = internal global i32 undef // CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 static __device__ int sv1; -// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized global i32 0 -// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0 +// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 // HOST-DAG: @_ZL3sv2 = internal global i32 undef -// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 +// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0 static __constant__ int sv2; // NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null // RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null diff --git a/clang/test/CodeGenCUDA/filter-decl.cu b/clang/test/CodeGenCUDA/filter-decl.cu index 0f4691f7c8aa7..02dacd0ad8ef4 100644 --- a/clang/test/CodeGenCUDA/filter-decl.cu +++ b/clang/test/CodeGenCUDA/filter-decl.cu @@ -10,7 +10,7 @@ __asm__("file scope asm is host only"); // CHECK-HOST: constantdata = internal global -// CHECK-DEVICE: constantdata = {{(dso_local )?}}externally_initialized global +// CHECK-DEVICE: constantdata = {{(dso_local )?}}externally_initialized 
constant __constant__ char constantdata[256]; // CHECK-HOST: devicedata = internal global diff --git a/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu index 80655c2d29604..e92b00345e00c 100644 --- a/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu +++ b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu @@ -50,7 +50,7 @@ static __device__ int x5; } // Check a static constant variable referenced by host is externalized. -// DEV-DAG: @_ZL1y ={{.*}} addrspace(4) externally_initialized global i32 0 +// DEV-DAG: @_ZL1y ={{.*}} addrspace(4) externally_initialized constant i32 0 // HOST-DAG: @_ZL1y = internal global i32 undef // HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" diff --git a/clang/test/CodeGenCUDA/static-device-var-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-rdc.cu index 16ec413397235..9d2811f9385e1 100644 --- a/clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ b/clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -81,11 +81,11 @@ static __device__ int x; static __device__ int x2; // Test normal static device variables -// INT-DEV-DAG: @_ZL1y[[FILEID:.*]] = addrspace(4) externally_initialized global i32 0 +// INT-DEV-DAG: @_ZL1y[[FILEID:.*]] = addrspace(4) externally_initialized constant i32 0 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y[[FILEID:.*]]\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 // EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; diff --git a/clang/test/CodeGenCUDA/template-class-static-member.cu b/clang/test/CodeGenCUDA/template-class-static-member.cu index d790d2dea66ba..b614cd9dcbb14 100644 --- a/clang/test/CodeGenCUDA/template-class-static-member.cu +++ b/clang/test/CodeGenCUDA/template-class-static-member.cu @@ -38,7 +38,7 @@ 
const int A::const_member; template class A; //DEV-DAG: @_ZN1AIiE8d_memberE = internal addrspace(1) global i32 0, comdat, align 4 -//DEV-DAG: @_ZN1AIiE8c_memberE = internal addrspace(4) global i32 0, comdat, align 4 +//DEV-DAG: @_ZN1AIiE8c_memberE = internal addrspace(4) constant i32 0, comdat, align 4 //DEV-DAG: @_ZN1AIiE8m_memberE = internal addrspace(1) externally_initialized global ptr addrspace(1) null //DEV-DAG: @_ZN1AIiE12const_memberE = internal addrspace(4) constant i32 0, comdat, align 4 //DEV-NEG-NOT: @_ZN1AIiE8h_memberE diff --git a/clang/test/CodeGenCXX/mangle-concept.cpp b/clang/test/CodeGenCXX/mangle-concept.cpp index 91dc1b0e688e0..6053511c00a7b 100644 --- a/clang/test/CodeGenCXX/mangle-concept.cpp +++ b/clang/test/CodeGenCXX/mangle-concept.cpp @@ -58,19 +58,19 @@ namespace test2 { // CHECK: call {{.*}}@_ZN5test21AIiEF1fEzQ4TrueIT_E( // CLANG17: call {{.*}}@_ZN5test21fEz( f(ai); - // CHECK: call {{.*}}@_ZN5test2F1gIvEEvzQaa4TrueIT_E4TrueITL0__E( + // CHECK: call {{.*}}@_ZN5test21AIiEF1gIvEEvzQaa4TrueIT_E4TrueITL0__E( // CLANG17: call {{.*}}@_ZN5test21gIvEEvz( g(ai); // CHECK: call {{.*}}@_ZN5test21hIvEEvzQ4TrueITL0__E( // CLANG17: call {{.*}}@_ZN5test21hIvEEvz( h(ai); - // CHECK: call {{.*}}@_ZN5test2F1iIvQaa4TrueIT_E4TrueITL0__EEEvz( + // CHECK: call {{.*}}@_ZN5test21AIiEF1iIvQaa4TrueIT_E4TrueITL0__EEEvz( // CLANG17: call {{.*}}@_ZN5test21iIvEEvz( i(ai); // CHECK: call {{.*}}@_ZN5test21jIvQ4TrueITL0__EEEvz( // CLANG17: call {{.*}}@_ZN5test21jIvEEvz( j(ai); - // CHECK: call {{.*}}@_ZN5test2F1kITk4TruevQ4TrueIT_EEEvz( + // CHECK: call {{.*}}@_ZN5test21AIiEF1kITk4TruevQ4TrueIT_EEEvz( // CLANG17: call {{.*}}@_ZN5test21kIvEEvz( k(ai); // CHECK: call {{.*}}@_ZN5test21lITk4TruevEEvz( diff --git a/clang/test/CodeGenCXX/mangle-subst.cpp b/clang/test/CodeGenCXX/mangle-subst.cpp index 20f33a72fff83..524e0febe479a 100644 --- a/clang/test/CodeGenCXX/mangle-subst.cpp +++ b/clang/test/CodeGenCXX/mangle-subst.cpp @@ -1,4 +1,8 @@ // RUN: %clang_cc1 -emit-llvm %s -o 
- -triple=x86_64-apple-darwin9 | FileCheck %s +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-apple-darwin9 -fclang-abi-compat=19 | FileCheck %s --check-prefix=CHECK-CLANG-19 + +//CHECK: @_ZTCN16MangleCtorVTable4InstE0_NS_1A4ImplINS1_4WrapEEE +//CHECK-CLANG-19: @_ZTCN16MangleCtorVTable4InstE0_NS_1A4ImplINS0_4WrapEEE struct X {}; @@ -96,3 +100,26 @@ typename X::template Y::type f(typename X::template Y::type2) { retu // CHECK: @_ZN12ManglePrefix1fIiEENS_1XIT_E1YIS2_E4typeENS5_5type2E template int f(int); } + +namespace MangleCtorVTable { +namespace A { + +class VBase { + public: + virtual ~VBase() {}; +}; + +struct Wrap {}; + +template +class Impl : public virtual VBase { + public: +}; + +} // namespace A + +struct Inst : public A::Impl {}; + +void Test() { Inst a; } + +} diff --git a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp index c575f49ff6971..05811bb7e1285 100644 --- a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp +++ b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp @@ -12,7 +12,7 @@ // CHECK: @d ={{.*}} addrspace(1) externally_initialized global __device__ int d; -// CHECK: @c ={{.*}} addrspace(1) externally_initialized global +// CHECK: @c ={{.*}} addrspace(1) externally_initialized constant __constant__ int c; // CHECK: @s ={{.*}} addrspace(3) global diff --git a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl index 02e570ebdcb4f..f65cdbb43e27b 100644 --- a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl +++ b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl @@ -1,5 +1,8 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// XFAIL: * +// Resource indexing will be properly implemented in llvm/llvm-project#95956 + const RWBuffer In; RWBuffer Out; diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl new file mode 100644 index 
0000000000000..e1d80df5015c9 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s + +// CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} +// CHECK-NOT: bitcast +// CHECK: ret i32 [[VAL]] +int test_int(int p0) { + return asint(p0); +} + +// CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} +// CHECK-NOT: bitcast +// CHECK: ret i32 [[VAL]] +int test_uint(uint p0) { + return asint(p0); +} + +// CHECK: define {{.*}}test_float{{.*}}(float {{.*}} [[VAL:%.*]]){{.*}} +// CHECK: bitcast float [[VAL]] to i32 +int test_float(float p0) { + return asint(p0); +} + +// CHECK: define {{.*}}test_vector_int{{.*}}(<4 x i32> {{.*}} [[VAL:%.*]]){{.*}} +// CHECK-NOT: bitcast +// CHECK: ret <4 x i32> [[VAL]] +int4 test_vector_int(int4 p0) { + return asint(p0); +} + +// CHECK: define {{.*}}test_vector_uint{{.*}}(<4 x i32> {{.*}} [[VAL:%.*]]){{.*}} +// CHECK-NOT: bitcast +// CHECK: ret <4 x i32> [[VAL]] +int4 test_vector_uint(uint4 p0) { + return asint(p0); +} + +// CHECK: define {{.*}}test_vector_float{{.*}}(<4 x float> {{.*}} [[VAL:%.*]]){{.*}} +// CHECK: bitcast <4 x float> [[VAL]] to <4 x i32> +int4 test_vector_float(float4 p0) { + return asint(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl new file mode 100644 index 0000000000000..708779daaa7b6 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl @@ -0,0 +1,77 @@ +// DirectX target: +// +// ---------- Native Half support test ----------- +// +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: -DFNATTRS=noundef -DTYPE=half + +// +// ---------- No Native Half support test ----------- +// +// RUN: %clang_cc1 
-finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s \ +// RUN: -DFNATTRS=noundef -DTYPE=float + + +// Spirv target: +// +// ---------- Native Half support test ----------- +// +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: -DFNATTRS="spir_func noundef" -DTYPE=half + +// +// ---------- No Native Half support test ----------- +// +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s \ +// RUN: -DFNATTRS="spir_func noundef" -DTYPE=float + + + +// CHECK: define [[FNATTRS]] [[TYPE]] @ +// CHECK: %fmod = frem [[TYPE]] +// CHECK: ret [[TYPE]] %fmod +half test_fmod_half(half p0, half p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <2 x [[TYPE]]> @ +// CHECK: %fmod = frem <2 x [[TYPE]]> +// CHECK: ret <2 x [[TYPE]]> %fmod +half2 test_fmod_half2(half2 p0, half2 p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <3 x [[TYPE]]> @ +// CHECK: %fmod = frem <3 x [[TYPE]]> +// CHECK: ret <3 x [[TYPE]]> %fmod +half3 test_fmod_half3(half3 p0, half3 p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <4 x [[TYPE]]> @ +// CHECK: %fmod = frem <4 x [[TYPE]]> +// CHECK: ret <4 x [[TYPE]]> %fmod +half4 test_fmod_half4(half4 p0, half4 p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] float @ +// CHECK: %fmod = frem float +// CHECK: ret float %fmod +float test_fmod_float(float p0, float p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %fmod = frem <2 x float> +// CHECK: ret <2 x float> %fmod +float2 test_fmod_float2(float2 p0, float2 p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %fmod = frem <3 x float> +// CHECK: ret <3 
x float> %fmod +float3 test_fmod_float3(float3 p0, float3 p1) { return fmod(p0, p1); } + +// CHECK: define [[FNATTRS]] <4 x float> @ +// CHECK: %fmod = frem <4 x float> +// CHECK: ret <4 x float> %fmod +float4 test_fmod_float4(float4 p0, float4 p1) { return fmod(p0, p1); } + diff --git a/clang/test/CodeGenHLSL/entry.hlsl b/clang/test/CodeGenHLSL/entry.hlsl index ec4254e76fb66..cd3bf948df48c 100644 --- a/clang/test/CodeGenHLSL/entry.hlsl +++ b/clang/test/CodeGenHLSL/entry.hlsl @@ -2,6 +2,10 @@ // RUN: dxil-pc-shadermodel6.3-compute %s -hlsl-entry foo \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -hlsl-entry foo \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + // Make sure not mangle entry. // CHECK:define void @foo() // Make sure add function attribute and numthreads attribute. diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl index ae3a3b5f90199..f72fe059cb576 100644 --- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) { } // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] // CHECK: ret i1 // Initialize tree with given buffer // Imagine the inout works diff --git a/clang/test/Driver/dxc_E.hlsl 
b/clang/test/Driver/dxc_E.hlsl index 05cfca685c9a6..07715a2e2259a 100644 --- a/clang/test/Driver/dxc_E.hlsl +++ b/clang/test/Driver/dxc_E.hlsl @@ -1,4 +1,5 @@ // RUN: not %clang_dxc -Efoo -Tlib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s +// RUN: not %clang_dxc -Efoo -Tlib_6_7 -spirv foo.hlsl -### %s 2>&1 | FileCheck %s // Make sure E option flag which translated into "-hlsl-entry". // CHECK:"-hlsl-entry" "foo" diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl index a05e21b37b912..f15130d5f8b61 100644 --- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl +++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null // expected-remark@+10 {{Function Name: foo}} -// expected-remark@+9 {{ SGPRs: 13}} +// expected-remark@+9 {{ TotalSGPRs: 13}} // expected-remark@+8 {{ VGPRs: 10}} // expected-remark@+7 {{ AGPRs: 12}} // expected-remark@+6 {{ ScratchSize [bytes/lane]: 0}} diff --git a/clang/test/Frontend/fixed_point_comparisons.c b/clang/test/Frontend/fixed_point_comparisons.c index 59c4405e41c03..39e62bce51e2b 100644 --- a/clang/test/Frontend/fixed_point_comparisons.c +++ b/clang/test/Frontend/fixed_point_comparisons.c @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNPADDED // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,PADDED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNPADDED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - 
-fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,PADDED + // Fixed point against other fixed point _Bool b_eq_true = 2.5hk == 2.5uhk; // CHECK-DAG: @b_eq_true = {{.*}}global i8 1, align 1 _Bool b_eq_false = 2.5hk == 2.4uhk; // CHECK-DAG: @b_eq_false = {{.*}}global i8 0, align 1 diff --git a/clang/test/Frontend/fixed_point_conversions_const.c b/clang/test/Frontend/fixed_point_conversions_const.c index e6e89ded534fe..889486e5eb806 100644 --- a/clang/test/Frontend/fixed_point_conversions_const.c +++ b/clang/test/Frontend/fixed_point_conversions_const.c @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,SIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNSIGNED + // Between different fixed point types short _Accum sa_const = 2.5hk; // CHECK-DAG: @sa_const = {{.*}}global i16 320, align 2 diff --git a/clang/test/Frontend/fixed_point_crash.c b/clang/test/Frontend/fixed_point_crash.c index 3b3911117400c..9cfe6ba64c519 100644 --- a/clang/test/Frontend/fixed_point_crash.c +++ b/clang/test/Frontend/fixed_point_crash.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -verify -ffixed-point %s +// RUN: %clang_cc1 -verify -ffixed-point %s -fexperimental-new-constant-interpreter union a { _Accum x; diff --git a/clang/test/Frontend/fixed_point_div_const.c b/clang/test/Frontend/fixed_point_div_const.c index 46935207d186a..66c028e608db6 100644 --- a/clang/test/Frontend/fixed_point_div_const.c +++ 
b/clang/test/Frontend/fixed_point_div_const.c @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,SIGNED +// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNSIGNED + // Division between different fixed point types short _Accum sa_const = 1.0hk / 2.0hk; // CHECK-DAG: @sa_const = {{.*}}global i16 64, align 2 diff --git a/clang/test/Frontend/fixed_point_errors.c b/clang/test/Frontend/fixed_point_errors.c index 6a711936f2397..3472f595089a1 100644 --- a/clang/test/Frontend/fixed_point_errors.c +++ b/clang/test/Frontend/fixed_point_errors.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -verify -ffixed-point %s +// RUN: %clang_cc1 -verify -ffixed-point %s -fexperimental-new-constant-interpreter /* We do not yet support long long. No recommended bit widths are given for this * size. 
*/ diff --git a/clang/test/Modules/lambda-definitions.cppm b/clang/test/Modules/lambda-definitions.cppm new file mode 100644 index 0000000000000..fb4bb8d298f0f --- /dev/null +++ b/clang/test/Modules/lambda-definitions.cppm @@ -0,0 +1,45 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/invocable.cppm -emit-module-interface -o %t/invocable.pcm +// RUN: %clang_cc1 -std=c++20 %t/lambda.cppm -emit-module-interface -o %t/lambda.pcm -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify +// +// RUN: %clang_cc1 -std=c++20 %t/invocable.cppm -emit-reduced-module-interface -o %t/invocable.pcm +// RUN: %clang_cc1 -std=c++20 %t/lambda.cppm -emit-reduced-module-interface -o %t/lambda.pcm -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- invocable.cppm +export module invocable; +export template +concept invocable = requires(_Fn&& __fn, _Args&&... 
__args) { + _Fn(__args...); +}; + +export template +constexpr bool is_callable(_Fn&& __fn, _Args&& __args) { + return invocable<_Fn, _Args>; +} + +export template +struct Callable : _Fn { + constexpr explicit Callable(_Fn &&__fn) : _Fn(static_cast<_Fn&&>(__fn)) {} + + template + constexpr auto operator()(_Args&& __args) { + return _Fn(__args); + } +}; + +//--- lambda.cppm +export module lambda; +import invocable; +export constexpr auto l = Callable([](auto &&x){}); + +//--- test.cc +// expected-no-diagnostics +import invocable; +import lambda; + +static_assert(is_callable(l, 4) == true); diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 5e4ed96561a30..e7d19c3da7216 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -3,10 +3,10 @@ // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation // CHECK: -TemplateArgument type 'float' // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'float * +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'float *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RWBuffer Buffer1; @@ -14,10 +14,10 @@ RWBuffer Buffer1; // CHECK: -TemplateArgument type 'vector' // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'vector +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector)]] -// CHECK-SAME: ':'vector' +// CHECK-SAME: 
':'__hlsl_resource_t' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RasterizerOrderedBuffer > BufferArray3[4]; diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 5de97649af5d3..d9bb6daf974d5 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } + +void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + // Check Zt tile 0 + svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index 1727be1d6286d..26b153dd5b210 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -538,6 +538,32 @@ void test_builtin_elementwise_popcount(int i, float f, double d, float4 v, int3 // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'unsigned3' (vector of 3 'unsigned int' values)}} } +void test_builtin_elementwise_fmod(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) { + i = __builtin_elementwise_fmod(p, d); + // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}} + + struct Foo foo = __builtin_elementwise_fmod(i, i); + // expected-error@-1 {{1st argument must be a floating point type (was 'int')}} + + i = __builtin_elementwise_fmod(i); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} + + i = 
__builtin_elementwise_fmod(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} + + i = __builtin_elementwise_fmod(i, i, i); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} + + i = __builtin_elementwise_fmod(v, iv); + // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_fmod(uv, iv); + // expected-error@-1 {{arguments are of different types ('unsigned3' (vector of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_fmod(d, v); + // expected-error@-1 {{arguments are of different types ('double' vs 'float4' (vector of 4 'float' values))}} +} + void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) { i = __builtin_elementwise_pow(p, d); // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}} @@ -562,7 +588,6 @@ void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, u } - void test_builtin_elementwise_roundeven(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) { struct Foo s = __builtin_elementwise_roundeven(f); diff --git a/clang/test/SemaCXX/builtins-elementwise-math.cpp b/clang/test/SemaCXX/builtins-elementwise-math.cpp index c3d8bc593c0bb..5910796c5d298 100644 --- a/clang/test/SemaCXX/builtins-elementwise-math.cpp +++ b/clang/test/SemaCXX/builtins-elementwise-math.cpp @@ -255,6 +255,14 @@ void test_builtin_elementwise_fma() { static_assert(!is_const::value); } +void test_builtin_elementwise_fmod() { + const double a = 2; + double b = 1; + static_assert(!is_const::value); + static_assert(!is_const::value); + static_assert(!is_const::value); +} + void test_builtin_elementwise_pow() { const double a = 2; double b = 1; diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl new file 
mode 100644 index 0000000000000..815a0c35cb04c --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify + + +int4 test_asint_too_many_arg(float p0, float p1) { + return asint(p0, p1); + // expected-error@-1 {{no matching function for call to 'asint'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'V', but 2 arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'F', but 2 arguments were provided}} +} + +int test_asint_double(double p1) { + return asint(p1); + // expected-error@hlsl/hlsl_intrinsics.h:* {{no matching function for call to 'bit_cast'}} + // expected-note@-2 {{in instantiation of function template specialization 'hlsl::asint'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'double'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = int, T = double]: no type named 'Type'}} +} + +int test_asint_half(half p1) { + return asint(p1); + // expected-error@hlsl/hlsl_intrinsics.h:* {{no matching function for call to 'bit_cast'}} + // expected-note@-2 {{in instantiation of function template specialization 'hlsl::asint'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'half'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = int, T = half]: no type named 'Type'}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl new file mode 100644 index 0000000000000..e4fa609dd6a05 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl @@ -0,0 +1,22 @@ + +// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected + +float builtin_bool_to_float_type_promotion(bool p1, bool p2) { + return __builtin_elementwise_fmod(p1, p2); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}} +} + +float2 builtin_fmod_int2_to_float2_promotion(int2 p1, int2 p2) { + return __builtin_elementwise_fmod(p1, p2); + // expected-error@-1 {{1st argument must be a floating point type (was 'int2' (aka 'vector'))}} +} + +half builtin_fmod_double_type (double p0, double p1) { + return __builtin_elementwise_fmod(p0, p1); + // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}} +} + +half builtin_fmod_double2_type (double2 p0, double2 p1) { + return __builtin_elementwise_fmod(p0, p1); + // expected-error@-1 {{passing 'double2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl b/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl new file mode 100644 index 0000000000000..d9f20a4cb79ec --- /dev/null +++ b/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl @@ -0,0 +1,383 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -fnative-half-type %s -DERRORS -Wconversion -Wdouble-promotion -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl2018 -finclude-default-header -fnative-half-type %s -DERRORS -Wconversion -Wdouble-promotion -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -fnative-half-type %s -ast-dump | FileCheck %s + +#if __HLSL_VERSION <= 2021 +// expected-warning@*{{support for HLSL language version hlsl2018 is incomplete, recommend using hlsl202x instead}} +#endif + 
+//----------------------------------------------------------------------------// +// Case 1: float4 * int4 and inverse. +// +// In both cases here the int is converted to a float and the computation +// produces a float value. +//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f4f4i4 'float4 (float4, int4)' +// CHECK: BinaryOperator {{.*}} 'float4':'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export float4 f4f4i4(float4 A, int4 B) { + return A * B; // expected-warning{{implicit conversion from 'int4' (aka 'vector') to 'float4' (aka 'vector') may lose precision}} +} + +// CHECK-LABEL: FunctionDecl {{.*}} used f4i4f4 'float4 (float4, int4)' +// CHECK: BinaryOperator {{.*}} 'float4':'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +export float4 f4i4f4(float4 A, int4 B) { + return B * A; // expected-warning{{implicit conversion from 'int4' (aka 'vector') to 'float4' (aka 'vector') may lose precision}} +} + +//----------------------------------------------------------------------------// +// Case 2: float4 * int2 and inverse. +// +// In both cases the float vector is trunctated to a float2 and the integer +// vector is converted to a float2. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f2f4i2 'float2 (float4, int2)' +// CHECK: BinaryOperator {{.*}} 'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}}'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'B' 'int2':'vector' +export float2 f2f4i2(float4 A, int2 B) { + // expected-warning@#f2f4i2 {{implicit conversion from 'int2' (aka 'vector') to 'vector' (vector of 2 'float' values) may lose precision}} + // expected-warning@#f2f4i2 {{implicit conversion truncates vector: 'float4' (aka 'vector') to 'vector' (vector of 2 'float' values)}} + return A * B; // #f2f4i2 +} + +// CHECK-LABEL: FunctionDecl {{.*}} used f2i2f4 'float2 (float4, int2)' +// CHECK: BinaryOperator {{.*}} 'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'B' 'int2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}}'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +export float2 f2i2f4(float4 A, int2 B) { + // expected-warning@#f2i2f4 {{implicit conversion from 'int2' (aka 'vector') to 'vector' (vector of 2 'float' values) may lose precision}} + // expected-warning@#f2i2f4 {{implicit conversion truncates vector: 'float4' (aka 'vector') to 'vector' (vector of 2 'float' values)}} + return B * A; // #f2i2f4 +} + +//----------------------------------------------------------------------------// +// Case 3: Integers of mismatched sign, equivalent size, but the unsigned type +// has 
lower conversion rank. +// +// This is the odd-ball case for HLSL that isn't really in spec, but we should +// handle gracefully. The lower-ranked unsigned type is converted to the +// equivalent unsigned type of higher rank, and the signed type is also +// converted to that unsigned type (meaning `unsigned long` becomes `unsinged +// long long`, and `long long` becomes `unsigned long long`). +//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used wierdo 'int4 (vector, vector)' +// CHECK: BinaryOperator {{.*}} 'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: DeclRefExpr{{.*}} 'vector' lvalue ParmVar {{.*}} 'A' 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr{{.*}}> 'vector' +// CHECK-NEXT: DeclRefExpr {{.*}}'vector' lvalue ParmVar {{.*}} 'B' 'vector' +export int4 wierdo(vector A, vector B) { + // expected-warning@#wierdo {{implicit conversion loses integer precision: 'vector' (vector of 4 'unsigned long long' values) to 'vector' (vector of 4 'int' values)}} + // expected-warning@#wierdo {{implicit conversion changes signedness: 'vector' (vector of 4 'long long' values) to 'vector' (vector of 4 'unsigned long long' values)}} + return A * B; // #wierdo +} + +//----------------------------------------------------------------------------// +// Case 4: Compound assignment of float4 with an int4. +// +// In compound assignment the RHS is converted to match the LHS. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f4f4i4compound 'float4 (float4, int4)' +// CHECK: CompoundAssignOperator {{.*}} 'float4':'vector' lvalue '+=' ComputeLHSTy='float4':'vector' ComputeResultTy='float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export float4 f4f4i4compound(float4 A, int4 B) { + A += B; // expected-warning{{implicit conversion from 'int4' (aka 'vector') to 'float4' (aka 'vector') may lose precision}} + return A; +} + + +//----------------------------------------------------------------------------// +// Case 5: Compound assignment of float2 with an int4. +// +// In compound assignment the RHS is converted to match the LHS. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f4f2i4compound 'float4 (float2, int4)' +// CHECK: CompoundAssignOperator {{.*}} 'float2':'vector' lvalue '+=' ComputeLHSTy='float2':'vector' ComputeResultTy='float2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector' lvalue ParmVar {{.*}} 'A' 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export float4 f4f2i4compound(float2 A, int4 B) { + // expected-warning@#f4f2i4compound{{implicit conversion truncates vector: 'int4' (aka 'vector') to 'float2' (aka 'vector')}} + // expected-warning@#f4f2i4compound{{implicit conversion from 'int4' (aka 'vector') to 'float2' (aka 'vector') may lose precision}} + A += B; // #f4f2i4compound + return A.xyxy; +} + +//----------------------------------------------------------------------------// +// Case 6: float2 * int4 +// +// The int4 vector is trunctated to int2 then converted to float2. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f4f2i4 'float2 (float2, int4)' +// CHECK: BinaryOperator {{.*}} 'float2':'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector' lvalue ParmVar {{.*}} 'A' 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export float2 f4f2i4(float2 A, int4 B) { + // expected-warning@#f4f2i4{{implicit conversion truncates vector: 'int4' (aka 'vector') to 'float2' (aka 'vector')}} + // expected-warning@#f4f2i4{{implicit conversion from 'int4' (aka 'vector') to 'float2' (aka 'vector') may lose precision}} + return A * B; // #f4f2i4 +} + +//----------------------------------------------------------------------------// +// Case 7: Compound assignment of half4 with float4, and inverse. +// +// In compound assignment the RHS is converted to match the LHS. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used f4h4f4compound 'float4 (half4, float4)' +// CHECK: CompoundAssignOperator {{.*}} 'half4':'vector' lvalue '+=' ComputeLHSTy='half4':'vector' ComputeResultTy='half4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'half4':'vector' lvalue ParmVar {{.*}} 'A' 'half4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'B' 'float4':'vector' +export float4 f4h4f4compound(half4 A, float4 B) { + A += B; // expected-warning{{implicit conversion loses floating-point precision: 'float4' (aka 'vector') to 'half4' (aka 'vector')}} + return B; +} + +// CHECK-LABEL: FunctionDecl {{.*}} used f4f4h4compound 'float4 (float4, half4)' +// CHECK: CompoundAssignOperator {{.*}} 'float4':'vector' lvalue '+=' ComputeLHSTy='float4':'vector' ComputeResultTy='float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'half4':'vector' lvalue ParmVar {{.*}} 'B' 'half4':'vector' +export float4 f4f4h4compound(float4 A, half4 B) { + A += B; // expected-warning{{implicit conversion increases floating-point precision: 'half4' (aka 'vector') to 'float4' (aka 'vector')}} + return A; +} + +//----------------------------------------------------------------------------// +// Case 8: int64_t4 * uint4 +// +// The unsigned argument is promoted to the higher ranked signed type since it +// can express all values of the unsgined argument. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used l4l4i4 'int64_t4 (int64_t4, uint4)' +// CHECK: BinaryOperator {{.*}} 'int64_t4':'vector' '*' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t4':'vector' lvalue ParmVar {{.*}} 'A' 'int64_t4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'uint4':'vector' lvalue ParmVar {{.*}} 'B' 'uint4':'vector' +export int64_t4 l4l4i4(int64_t4 A, uint4 B) { + return A * B; +} + +//----------------------------------------------------------------------------// +// Case 9: Compound assignment of int4 from int64_t4 +// +// In compound assignment the RHS is converted to match the LHS. +//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used i4i4l4compound 'int4 (int4, int64_t4)' +// CHECK: CompoundAssignOperator {{.*}} 'int4':'vector' lvalue '+=' ComputeLHSTy='int4':'vector' ComputeResultTy='int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'A' 'int4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t4':'vector' lvalue ParmVar {{.*}} 'B' 'int64_t4':'vector' +export int4 i4i4l4compound(int4 A, int64_t4 B) { + A += B; // expected-warning{{implicit conversion loses integer precision: 'int64_t4' (aka 'vector') to 'int4' (aka 'vector')}} + return A; +} + +//----------------------------------------------------------------------------// +// Case 10: Compound assignment of vector with argument of +// vector +// +// In compound assignment the RHS is converted to match the LHS. This one is +// also the weird case because it is out of spec, but we should handle it +// gracefully. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used wierdocompound 'vector (vector, vector)' +// CHECK: CompoundAssignOperator {{.*}} 'vector' lvalue '+=' ComputeLHSTy='vector' ComputeResultTy='vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector' lvalue ParmVar {{.*}} 'A' 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector' lvalue ParmVar {{.*}} 'B' 'vector' +export vector wierdocompound(vector A, vector B) { + // expected-warning@#wierdocompound{{implicit conversion changes signedness: 'vector' (vector of 4 'long long' values) to 'vector' (vector of 4 'unsigned long' values)}} + A += B; // #wierdocompound + return A; +} + +//----------------------------------------------------------------------------// +// Case 11: Compound assignment of scalar with vector argument. +// +// Because the LHS of a compound assignment cannot change type, the RHS must be +// implicitly convertable to the LHS type. 
+//----------------------------------------------------------------------------// + +// CHECK-LABEL: FunctionDecl {{.*}} used ffi2compound 'float (float, int2)' +// CHECK: CompoundAssignOperator {{.*}} 'float' lvalue '+=' ComputeLHSTy='float' ComputeResultTy='float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'A' 'float' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'B' 'int2':'vector' +export float ffi2compound(float A, int2 B) { + A += B; // expected-warning {{implicit conversion turns vector to scalar: 'int2' (aka 'vector') to 'float'}} + return A; +} + +// CHECK-LABEL: FunctionDecl {{.*}} used iif2compound 'int (int, float2)' +// CHECK: CompoundAssignOperator {{.*}} 'int' lvalue '+=' ComputeLHSTy='int' ComputeResultTy='int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'A' 'int' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: mplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: ImplicitCastExpr{{.*}} 'float2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector' lvalue ParmVar {{.*}} 'B' 'float2':'vector' +export int iif2compound(int A, float2 B) { + A += B; // expected-warning{{implicit conversion turns vector to scalar: 'float2' (aka 'vector') to 'int'}} + return A; +} + + +//----------------------------------------------------------------------------// +// Case 12: Compound assignment of vector of larger size than the argument. +// +// Because the LHS of a compound assignment cannot change type, the RHS must be +// implicitly convertable to the LHS type. This fails since the RHS type can't +// be vector-extended implicitly. 
+//----------------------------------------------------------------------------// + +#ifdef ERRORS +// The only cases that are really illegal here are when the RHS is a vector that +// is larger than the LHS or when the LHS is a scalar. + +export float2 f2f4i2compound(float4 A, int2 B) { + A += B; // expected-error{{left hand operand of type 'float4' (aka 'vector') to compound assignment cannot be truncated when used with right hand operand of type 'int2' (aka 'vector')}} + return A.xy; +} + +#endif + +//----------------------------------------------------------------------------// +// Case 13: Comparison operators for mismatched arguments follow the same rules. +// +// Compare operators convert each argument following the usual arithmetic +// conversions. +//----------------------------------------------------------------------------// + +// Note: these cases work and generate correct code, but the way they get there +// may change with https://github.com/llvm/llvm-project/issues/91639, because +// representing boolean vectors as 32-bit integer vectors will allow more +// efficient code generation. 
+ +// CHECK-LABEL: FunctionDecl {{.*}} used b4f4i4Compare 'bool4 (float4, int4)' +// CHECK: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: BinaryOperator {{.*}} 'vector' '<' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector' lvalue ParmVar {{.*}} 'A' 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export bool4 b4f4i4Compare(float4 A, int4 B) { + return A < B; // expected-warning{{implicit conversion from 'int4' (aka 'vector') to 'float4' (aka 'vector') may lose precision}} +} + + +// CHECK-LABEL: FunctionDecl {{.*}} used b2f2i4Compare 'bool2 (float2, int4)' +// CHECK: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: BinaryOperator {{.*}} 'vector' '<=' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector' lvalue ParmVar {{.*}} 'A' 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' + +export bool2 b2f2i4Compare(float2 A, int4 B) { + // expected-warning@#b2f2i4Compare{{implicit conversion truncates vector: 'int4' (aka 'vector') to 'float2' (aka 'vector')}} + // expected-warning@#b2f2i4Compare{{implicit conversion from 'int4' (aka 'vector') to 'float2' (aka 'vector') may lose precision}} + return A <= B; // #b2f2i4Compare +} + +// CHECK-LABEL: FunctionDecl {{.*}} used b4fi4Compare 'bool4 (float, int4)' +// CHECK: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: BinaryOperator {{.*}} 'vector' '>' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'A' 
'float' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector' lvalue ParmVar {{.*}} 'B' 'int4':'vector' +export bool4 b4fi4Compare(float A, int4 B) { + return A > B; // expected-warning{{implicit conversion from 'int4' (aka 'vector') to 'vector' (vector of 4 'float' values) may lose precision}} +} + +//----------------------------------------------------------------------------// +// Case 14: Logical operators on vectors are disallowed in HLSL 2021+ +//----------------------------------------------------------------------------// + +#ifdef ERRORS + +#if __HLSL_VERSION >= 2021 +// expected-error@#b4f4i4Logical{{invalid operands to binary expression ('float4' (aka 'vector') and 'int4' (aka 'vector'))}} +// expected-note@#b4f4i4Logical{{did you mean or?}} +#else +// expected-warning@#b4f4i4Logical{{implicit conversion from 'int4' (aka 'vector') to 'float4' (aka 'vector') may lose precision}} +#endif + +export bool4 b4f4i4Logical(float4 A, int4 B) { + return A || B; // #b4f4i4Logical +} + +#if __HLSL_VERSION >= 2021 +// expected-error@#b2f2i4Logical{{invalid operands to binary expression ('float2' (aka 'vector') and 'int4' (aka 'vector'))}} +// expected-note@#b2f2i4Logical{{did you mean and?}} +#else +// expected-warning@#b2f2i4Logical{{implicit conversion truncates vector: 'int4' (aka 'vector') to 'float2' (aka 'vector')}} +// expected-warning@#b2f2i4Logical{{implicit conversion from 'int4' (aka 'vector') to 'float2' (aka 'vector') may lose precision}} +#endif + +export bool2 b2f2i4Logical(float2 A, int4 B) { + return A && B; // #b2f2i4Logical +} + +#if __HLSL_VERSION >= 2021 +// expected-error@#b2b2b2Logical{{invalid operands to binary expression ('bool2' (aka 'vector') and 'bool2')}} +// expected-note@#b2b2b2Logical{{did you mean and?}} +#endif + +export bool2 b2b2b2Logical(bool2 A, bool2 B) { + return A && B; // #b2b2b2Logical +} + +#endif diff --git 
a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl index 92cba1dcd4bdf..8c0f8d6f271db 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl @@ -76,3 +76,6 @@ template struct SimpleTemplate { }; _Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); _Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); + +_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp index 8ec1f722fa8a1..b4b376fe0d114 100644 --- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp +++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp @@ -236,9 +236,8 @@ void printCommands(ArrayRef CmdArgs) { if (CmdArgs.empty()) return; - llvm::errs() << " \"" << CmdArgs.front() << "\" "; - llvm::errs() << llvm::join(std::next(CmdArgs.begin()), CmdArgs.end(), " ") - << "\n"; + errs() << " \"" << CmdArgs.front() << "\" "; + errs() << join(std::next(CmdArgs.begin()), CmdArgs.end(), " ") << "\n"; } /// A minimum symbol interface that provides the necessary information to @@ -329,12 +328,12 @@ Expected> createLTO(const ArgList &Args) { lto::ThinBackend Backend; unsigned Jobs = 0; if (auto *Arg = Args.getLastArg(OPT_jobs)) - if (!llvm::to_integer(Arg->getValue(), Jobs) || Jobs == 0) + if (!to_integer(Arg->getValue(), Jobs) || Jobs == 0) reportError(createStringError("%s: expected a positive integer, got '%s'", Arg->getSpelling().data(), Arg->getValue())); - Backend = lto::createInProcessThinBackend( - llvm::heavyweight_hardware_concurrency(Jobs)); + Backend = + lto::createInProcessThinBackend(heavyweight_hardware_concurrency(Jobs)); Conf.CPU = Args.getLastArgValue(OPT_arch); Conf.Options = 
codegen::InitTargetOptionsFromCodeGenFlags(Triple); @@ -378,7 +377,7 @@ Expected> createLTO(const ArgList &Args) { unsigned Partitions = 1; if (auto *Arg = Args.getLastArg(OPT_lto_partitions)) - if (!llvm::to_integer(Arg->getValue(), Partitions) || Partitions == 0) + if (!to_integer(Arg->getValue(), Partitions) || Partitions == 0) reportError(createStringError("%s: expected a positive integer, got '%s'", Arg->getSpelling().data(), Arg->getValue())); @@ -510,7 +509,7 @@ Expected> getInput(const ArgList &Args) { InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false); break; case file_magic::archive: { - Expected> LibFile = + Expected> LibFile = object::Archive::create(Buffer); if (!LibFile) return LibFile.takeError(); @@ -563,7 +562,7 @@ Expected> getInput(const ArgList &Args) { for (auto &Input : LinkerInput) if (identify_magic(Input->getBuffer()) == file_magic::bitcode) BitcodeFiles.emplace_back(std::move(Input)); - llvm::erase_if(LinkerInput, [](const auto &F) { return !F; }); + erase_if(LinkerInput, [](const auto &F) { return !F; }); // Run the LTO pipeline on the extracted inputs. SmallVector Files; @@ -574,7 +573,7 @@ Expected> getInput(const ArgList &Args) { lto::LTO <OBackend = **LTOBackendOrErr; for (auto &BitcodeFile : BitcodeFiles) { Expected> BitcodeFileOrErr = - llvm::lto::InputFile::create(*BitcodeFile); + lto::InputFile::create(*BitcodeFile); if (!BitcodeFileOrErr) return BitcodeFileOrErr.takeError(); @@ -638,7 +637,7 @@ Expected> getInput(const ArgList &Args) { if (std::error_code EC = sys::fs::openFileForWrite(TempFile, FD)) reportError(errorCodeToError(EC)); return std::make_unique( - std::make_unique(FD, true)); + std::make_unique(FD, true)); }; if (Error Err = LTOBackend.run(AddStream)) @@ -655,11 +654,11 @@ Expected> getInput(const ArgList &Args) { } } - // Create a link for each file to a new file ending in `.cubin`. The 'nvlink' + // Create a copy for each file to a new file ending in `.cubin`. 
The 'nvlink' // linker requires all NVPTX inputs to have this extension for some reason. - // Windows cannot create symbolic links so we just copy the whole file. + // We don't use a symbolic link because it's not supported on Windows and some + // of this input files could be extracted from an archive. for (auto &Input : LinkerInput) { -#ifdef _WIN32 auto TempFileOrErr = createTempFile( Args, sys::path::stem(Input->getBufferIdentifier()), "cubin"); if (!TempFileOrErr) @@ -669,22 +668,10 @@ Expected> getInput(const ArgList &Args) { if (!OutputOrErr) return OutputOrErr.takeError(); std::unique_ptr Output = std::move(*OutputOrErr); - llvm::copy(Input->getBuffer(), Output->getBufferStart()); + copy(Input->getBuffer(), Output->getBufferStart()); if (Error E = Output->commit()) return E; Files.emplace_back(Args.MakeArgString(*TempFileOrErr)); -#else - SmallString<128> TempFile; - if (std::error_code EC = sys::fs::getPotentiallyUniqueTempFileName( - sys::path::stem(Input->getBufferIdentifier()), "cubin", TempFile)) - reportError(createFileError(TempFile, EC)); - if (std::error_code EC = - sys::fs::create_link(Input->getBufferIdentifier(), TempFile)) { - reportError(createFileError(TempFile, EC)); - } - Files.emplace_back(Args.MakeArgString(TempFile)); - TempFiles.emplace_back(std::move(TempFile)); -#endif } return Files; @@ -718,8 +705,8 @@ Error runNVLink(ArrayRef Files, const ArgList &Args) { Arg->render(Args, NewLinkerArgs); } - llvm::transform(Files, std::back_inserter(NewLinkerArgs), - [&](StringRef Arg) { return Args.MakeArgString(Arg); }); + transform(Files, std::back_inserter(NewLinkerArgs), + [&](StringRef Arg) { return Args.MakeArgString(Arg); }); SmallVector LinkerArgs({*NVLinkPath}); if (!Args.hasArg(OPT_o)) diff --git a/compiler-rt/lib/orc/elfnix_platform.cpp b/compiler-rt/lib/orc/elfnix_platform.cpp index bd76e3ed0ece7..57673f088f77c 100644 --- a/compiler-rt/lib/orc/elfnix_platform.cpp +++ b/compiler-rt/lib/orc/elfnix_platform.cpp @@ -15,6 +15,7 @@ #include 
"compiler.h" #include "error.h" #include "jit_dispatch.h" +#include "record_section_tracker.h" #include "wrapper_function_utils.h" #include @@ -29,8 +30,7 @@ using namespace orc_rt; using namespace orc_rt::elfnix; // Declare function tags for functions in the JIT process. -ORC_RT_JIT_DISPATCH_TAG(__orc_rt_elfnix_get_initializers_tag) -ORC_RT_JIT_DISPATCH_TAG(__orc_rt_elfnix_get_deinitializers_tag) +ORC_RT_JIT_DISPATCH_TAG(__orc_rt_elfnix_push_initializers_tag) ORC_RT_JIT_DISPATCH_TAG(__orc_rt_elfnix_symbol_lookup_tag) // eh-frame registration functions, made available via aliases @@ -45,33 +45,6 @@ __unw_remove_dynamic_eh_frame_section(const void *) ORC_RT_WEAK_IMPORT; namespace { -Error validatePointerSectionExtent(const char *SectionName, - const ExecutorAddrRange &SE) { - if (SE.size() % sizeof(uintptr_t)) { - std::ostringstream ErrMsg; - ErrMsg << std::hex << "Size of " << SectionName << " 0x" - << SE.Start.getValue() << " -- 0x" << SE.End.getValue() - << " is not a pointer multiple"; - return make_error(ErrMsg.str()); - } - return Error::success(); -} - -Error runInitArray(const std::vector &InitArraySections, - const ELFNixJITDylibInitializers &MOJDIs) { - - for (const auto &ModInits : InitArraySections) { - if (auto Err = validatePointerSectionExtent(".init_array", ModInits)) - return Err; - - using InitFunc = void (*)(); - for (auto *Init : ModInits.toSpan()) - (*Init)(); - } - - return Error::success(); -} - struct TLSInfoEntry { unsigned long Key = 0; unsigned long DataAddress = 0; @@ -92,10 +65,18 @@ class ELFNixPlatformRuntimeState { using AtExitsVector = std::vector; struct PerJITDylibState { + std::string Name; void *Header = nullptr; size_t RefCount = 0; + size_t LinkedAgainstRefCount = 0; bool AllowReinitialization = false; AtExitsVector AtExits; + std::vector Deps; + RecordSectionsTracker RecordedInits; + + bool referenced() const { + return LinkedAgainstRefCount != 0 || RefCount != 0; + } }; public: @@ -113,6 +94,12 @@ class 
ELFNixPlatformRuntimeState { ELFNixPlatformRuntimeState &operator=(ELFNixPlatformRuntimeState &&) = delete; Error registerObjectSections(ELFNixPerObjectSectionsToRegister POSR); + Error registerJITDylib(std::string &Name, void *Handle); + Error deregisterJITDylib(void *Handle); + Error registerInits(ExecutorAddr HeaderAddr, + std::vector Inits); + Error deregisterInits(ExecutorAddr HeaderAddr, + std::vector Inits); Error deregisterObjectSections(ELFNixPerObjectSectionsToRegister POSR); const char *dlerror(); @@ -122,6 +109,8 @@ class ELFNixPlatformRuntimeState { int registerAtExit(void (*F)(void *), void *Arg, void *DSOHandle); void runAtExits(void *DSOHandle); + void runAtExits(std::unique_lock &JDStateLock, + PerJITDylibState &JDS); /// Returns the base address of the section containing ThreadData. Expected> @@ -132,18 +121,23 @@ class ELFNixPlatformRuntimeState { private: PerJITDylibState *getJITDylibStateByHeaderAddr(void *DSOHandle); PerJITDylibState *getJITDylibStateByName(std::string_view Path); - PerJITDylibState & - getOrCreateJITDylibState(ELFNixJITDylibInitializers &MOJDIs); Error registerThreadDataSection(span ThreadDataSection); Expected lookupSymbolInJITDylib(void *DSOHandle, std::string_view Symbol); - Expected - getJITDylibInitializersByName(std::string_view Path); - Expected dlopenInitialize(std::string_view Path, int Mode); - Error initializeJITDylib(ELFNixJITDylibInitializers &MOJDIs); + Error runInits(std::unique_lock &JDStatesLock, + PerJITDylibState &JDS); + Expected dlopenImpl(std::string_view Path, int Mode); + Error dlopenFull(std::unique_lock &JDStatesLock, + PerJITDylibState &JDS); + Error dlopenInitialize(std::unique_lock &JDStatesLock, + PerJITDylibState &JDS, + ELFNixJITDylibDepInfoMap &DepInfo); + Error dlcloseImpl(void *DSOHandle); + Error dlcloseInitialize(std::unique_lock &JDStatesLock, + PerJITDylibState &JDS); static ELFNixPlatformRuntimeState *MOPS; @@ -215,31 +209,110 @@ Error 
ELFNixPlatformRuntimeState::deregisterObjectSections( return Error::success(); } -const char *ELFNixPlatformRuntimeState::dlerror() { return DLFcnError.c_str(); } +Error ELFNixPlatformRuntimeState::registerJITDylib(std::string &Name, + void *Handle) { + std::lock_guard Lock(JDStatesMutex); -void *ELFNixPlatformRuntimeState::dlopen(std::string_view Path, int Mode) { + if (JDStates.count(Handle)) { + std::ostringstream ErrStream; + ErrStream << "Duplicate JITDylib registration for header " << Handle + << " (name = " << Name << ")"; + return make_error(ErrStream.str()); + } + + if (JDNameToHeader.count(Name)) { + std::ostringstream ErrStream; + ErrStream << "Duplicate JITDylib registration for header " << Handle + << " (header = " << Handle << ")"; + return make_error(ErrStream.str()); + } + + auto &JD = JDStates[Handle]; + JD.Header = Handle; + JD.Name = std::move(Name); + JDNameToHeader[JD.Name] = Handle; + return Error::success(); +} + +Error ELFNixPlatformRuntimeState::deregisterJITDylib(void *Handle) { std::lock_guard Lock(JDStatesMutex); - // Use fast path if all JITDylibs are already loaded and don't require - // re-running initializers. 
- if (auto *JDS = getJITDylibStateByName(Path)) { - if (!JDS->AllowReinitialization) { - ++JDS->RefCount; - return JDS->Header; - } + auto I = JDStates.find(Handle); + if (I == JDStates.end()) { + std::ostringstream ErrStream; + ErrStream << "Attempted to deregister unrecognized header " << Handle; + return make_error(ErrStream.str()); } - auto H = dlopenInitialize(Path, Mode); - if (!H) { + auto J = JDNameToHeader.find( + std::string(I->second.Name.data(), I->second.Name.size())); + assert(J != JDNameToHeader.end() && + "Missing JDNameToHeader entry for JITDylib"); + JDNameToHeader.erase(J); + JDStates.erase(I); + return Error::success(); +} + +Error ELFNixPlatformRuntimeState::registerInits( + ExecutorAddr HeaderAddr, std::vector Inits) { + std::lock_guard Lock(JDStatesMutex); + PerJITDylibState *JDS = + getJITDylibStateByHeaderAddr(HeaderAddr.toPtr()); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "Could not register object platform sections for " + "unrecognized header " + << HeaderAddr.toPtr(); + return make_error(ErrStream.str()); + } + + for (auto &I : Inits) { + JDS->RecordedInits.add(I.toSpan()); + } + + return Error::success(); +} + +Error ELFNixPlatformRuntimeState::deregisterInits( + ExecutorAddr HeaderAddr, std::vector Inits) { + + std::lock_guard Lock(JDStatesMutex); + PerJITDylibState *JDS = + getJITDylibStateByHeaderAddr(HeaderAddr.toPtr()); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "Could not register object platform sections for unrecognized " + "header " + << HeaderAddr.toPtr(); + return make_error(ErrStream.str()); + } + + for (auto &I : Inits) { + JDS->RecordedInits.removeIfPresent(I); + } + + return Error::success(); +} + +const char *ELFNixPlatformRuntimeState::dlerror() { return DLFcnError.c_str(); } + +void *ELFNixPlatformRuntimeState::dlopen(std::string_view Path, int Mode) { + if (auto H = dlopenImpl(Path, Mode)) + return *H; + else { + // FIXME: Make dlerror thread safe. 
DLFcnError = toString(H.takeError()); return nullptr; } - - return *H; } int ELFNixPlatformRuntimeState::dlclose(void *DSOHandle) { - runAtExits(DSOHandle); + if (auto Err = dlcloseImpl(DSOHandle)) { + DLFcnError = toString(std::move(Err)); + return -1; + } return 0; } @@ -265,15 +338,17 @@ int ELFNixPlatformRuntimeState::registerAtExit(void (*F)(void *), void *Arg, } void ELFNixPlatformRuntimeState::runAtExits(void *DSOHandle) { - // FIXME: Should atexits be allowed to run concurrently with access to - // JDState? - AtExitsVector V; - { - std::lock_guard Lock(JDStatesMutex); - auto *JDS = getJITDylibStateByHeaderAddr(DSOHandle); - assert(JDS && "JITDlybi state not initialized"); - std::swap(V, JDS->AtExits); - } + std::unique_lock Lock(JDStatesMutex); + PerJITDylibState *JDS = getJITDylibStateByHeaderAddr(DSOHandle); + + if (JDS) + runAtExits(Lock, *JDS); +} + +void ELFNixPlatformRuntimeState::runAtExits( + std::unique_lock &JDStateLock, + PerJITDylibState &JDS) { + AtExitsVector V = std::move(JDS.AtExits); while (!V.empty()) { auto &AE = V.back(); @@ -300,6 +375,7 @@ ELFNixPlatformRuntimeState::getJITDylibStateByHeaderAddr(void *DSOHandle) { auto I = JDStates.find(DSOHandle); if (I == JDStates.end()) return nullptr; + return &I->second; } @@ -316,24 +392,6 @@ ELFNixPlatformRuntimeState::getJITDylibStateByName(std::string_view Name) { return &J->second; } -ELFNixPlatformRuntimeState::PerJITDylibState & -ELFNixPlatformRuntimeState::getOrCreateJITDylibState( - ELFNixJITDylibInitializers &MOJDIs) { - void *Header = MOJDIs.DSOHandleAddress.toPtr(); - - auto &JDS = JDStates[Header]; - - // If this entry hasn't been created yet. 
- if (!JDS.Header) { - assert(!JDNameToHeader.count(MOJDIs.Name) && - "JITDylib has header map entry but no name map entry"); - JDNameToHeader[MOJDIs.Name] = Header; - JDS.Header = Header; - } - - return JDS; -} - Error ELFNixPlatformRuntimeState::registerThreadDataSection( span ThreadDataSection) { std::lock_guard Lock(ThreadDataSectionsMutex); @@ -360,74 +418,142 @@ ELFNixPlatformRuntimeState::lookupSymbolInJITDylib(void *DSOHandle, return Result; } -Expected -ELFNixPlatformRuntimeState::getJITDylibInitializersByName( - std::string_view Path) { - Expected Result( - (ELFNixJITDylibInitializerSequence())); - std::string PathStr(Path.data(), Path.size()); - if (auto Err = - WrapperFunction( - SPSString)>:: - call(JITDispatch(&__orc_rt_elfnix_get_initializers_tag), Result, - Path)) - return std::move(Err); - return Result; +Error ELFNixPlatformRuntimeState::runInits( + std::unique_lock &JDStatesLock, + PerJITDylibState &JDS) { + std::vector> InitSections; + InitSections.reserve(JDS.RecordedInits.numNewSections()); + + JDS.RecordedInits.processNewSections( + [&](span Inits) { InitSections.push_back(Inits); }); + + JDStatesLock.unlock(); + for (auto Sec : InitSections) + for (auto *Init : Sec) + Init(); + + JDStatesLock.lock(); + + return Error::success(); } -Expected -ELFNixPlatformRuntimeState::dlopenInitialize(std::string_view Path, int Mode) { - // Either our JITDylib wasn't loaded, or it or one of its dependencies allows - // reinitialization. We need to call in to the JIT to see if there's any new - // work pending. - auto InitSeq = getJITDylibInitializersByName(Path); - if (!InitSeq) - return InitSeq.takeError(); - - // Init sequences should be non-empty. - if (InitSeq->empty()) - return make_error( - "__orc_rt_elfnix_get_initializers returned an " - "empty init sequence"); - - // Otherwise register and run initializers for each JITDylib. 
- for (auto &MOJDIs : *InitSeq) - if (auto Err = initializeJITDylib(MOJDIs)) - return std::move(Err); - - // Return the header for the last item in the list. - auto *JDS = getJITDylibStateByHeaderAddr( - InitSeq->back().DSOHandleAddress.toPtr()); - assert(JDS && "Missing state entry for JD"); +Expected ELFNixPlatformRuntimeState::dlopenImpl(std::string_view Path, + int Mode) { + std::unique_lock Lock(JDStatesMutex); + PerJITDylibState *JDS = getJITDylibStateByName(Path); + + if (!JDS) + return make_error("No registered JTIDylib for path " + + std::string(Path.data(), Path.size())); + + if (auto Err = dlopenFull(Lock, *JDS)) + return std::move(Err); + + ++JDS->RefCount; + return JDS->Header; } -long getPriority(const std::string &name) { - auto pos = name.find_last_not_of("0123456789"); - if (pos == name.size() - 1) - return 65535; - else - return std::strtol(name.c_str() + pos + 1, nullptr, 10); +Error ELFNixPlatformRuntimeState::dlopenFull( + std::unique_lock &JDStateLock, + PerJITDylibState &JDS) { + Expected DepInfo((ELFNixJITDylibDepInfoMap())); + JDStateLock.unlock(); + if (auto Err = WrapperFunction( + SPSExecutorAddr)>:: + call(JITDispatch(&__orc_rt_elfnix_push_initializers_tag), DepInfo, + ExecutorAddr::fromPtr(JDS.Header))) + return Err; + JDStateLock.lock(); + + if (!DepInfo) + return DepInfo.takeError(); + + if (auto Err = dlopenInitialize(JDStateLock, JDS, *DepInfo)) + return Err; + + if (!DepInfo->empty()) { + std::ostringstream ErrStream; + ErrStream << "Encountered unrecognized dep-info key headers " + "while processing dlopen of " + << JDS.Name; + return make_error(ErrStream.str()); + } + + return Error::success(); } -Error ELFNixPlatformRuntimeState::initializeJITDylib( - ELFNixJITDylibInitializers &MOJDIs) { +Error ELFNixPlatformRuntimeState::dlopenInitialize( + std::unique_lock &JDStatesLock, PerJITDylibState &JDS, + ELFNixJITDylibDepInfoMap &DepInfo) { - auto &JDS = getOrCreateJITDylibState(MOJDIs); - ++JDS.RefCount; + auto I = 
DepInfo.find(ExecutorAddr::fromPtr(JDS.Header)); + if (I == DepInfo.end()) + return Error::success(); - using SectionList = std::vector; - std::sort(MOJDIs.InitSections.begin(), MOJDIs.InitSections.end(), - [](const std::pair &LHS, - const std::pair &RHS) -> bool { - return getPriority(LHS.first) < getPriority(RHS.first); - }); - for (auto &Entry : MOJDIs.InitSections) - if (auto Err = runInitArray(Entry.second, MOJDIs)) + auto Deps = std::move(I->second); + DepInfo.erase(I); + + std::vector OldDeps; + std::swap(JDS.Deps, OldDeps); + JDS.Deps.reserve(Deps.size()); + for (auto H : Deps) { + PerJITDylibState *DepJDS = getJITDylibStateByHeaderAddr(H.toPtr()); + if (!DepJDS) { + std::ostringstream ErrStream; + ErrStream << "Encountered unrecognized dep header " << H.toPtr() + << " while initializing " << JDS.Name; + return make_error(ErrStream.str()); + } + ++DepJDS->LinkedAgainstRefCount; + if (auto Err = dlopenInitialize(JDStatesLock, *DepJDS, DepInfo)) return Err; + } + + if (auto Err = runInits(JDStatesLock, JDS)) + return Err; + + for (auto *DepJDS : OldDeps) { + --DepJDS->LinkedAgainstRefCount; + if (!DepJDS->referenced()) + if (auto Err = dlcloseInitialize(JDStatesLock, *DepJDS)) + return Err; + } + return Error::success(); +} + +Error ELFNixPlatformRuntimeState::dlcloseImpl(void *DSOHandle) { + + std::unique_lock Lock(JDStatesMutex); + PerJITDylibState *JDS = getJITDylibStateByHeaderAddr(DSOHandle); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "No registered JITDylib for " << DSOHandle; + return make_error(ErrStream.str()); + } + + --JDS->RefCount; + + if (!JDS->referenced()) + return dlcloseInitialize(Lock, *JDS); + + return Error::success(); +} + +Error ELFNixPlatformRuntimeState::dlcloseInitialize( + std::unique_lock &JDStatesLock, + PerJITDylibState &JDS) { + runAtExits(JDStatesLock, JDS); + JDS.RecordedInits.reset(); + for (auto *DepJDS : JDS.Deps) + if (!JDS.referenced()) + if (auto Err = dlcloseInitialize(JDStatesLock, *DepJDS)) + 
return Err; return Error::success(); } + class ELFNixPlatformRuntimeTLVManager { public: void *getInstance(const char *ThreadData); @@ -473,19 +599,73 @@ void destroyELFNixTLVMgr(void *ELFNixTLVMgr) { ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult __orc_rt_elfnix_platform_bootstrap(char *ArgData, size_t ArgSize) { - return WrapperFunction::handle( + return WrapperFunction::handle( ArgData, ArgSize, - [](uint64_t &DSOHandle) { + [](ExecutorAddr DSOHandle) { ELFNixPlatformRuntimeState::initialize( - reinterpret_cast(DSOHandle)); + DSOHandle.toPtr()); + return Error::success(); }) .release(); } ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult __orc_rt_elfnix_platform_shutdown(char *ArgData, size_t ArgSize) { - ELFNixPlatformRuntimeState::destroy(); - return WrapperFunctionResult().release(); + return WrapperFunction::handle( + ArgData, ArgSize, + []() { + ELFNixPlatformRuntimeState::destroy(); + return Error::success(); + }) + .release(); +} + +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_elfnix_register_jitdylib(char *ArgData, size_t ArgSize) { + return WrapperFunction::handle( + ArgData, ArgSize, + [](std::string &JDName, ExecutorAddr HeaderAddr) { + return ELFNixPlatformRuntimeState::get().registerJITDylib( + JDName, HeaderAddr.toPtr()); + }) + .release(); +} + +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_elfnix_deregister_jitdylib(char *ArgData, size_t ArgSize) { + return WrapperFunction::handle( + ArgData, ArgSize, + [](ExecutorAddr HeaderAddr) { + return ELFNixPlatformRuntimeState::get().deregisterJITDylib( + HeaderAddr.toPtr()); + }) + .release(); +} + +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_elfnix_register_init_sections(char *ArgData, size_t ArgSize) { + return WrapperFunction)>:: + handle(ArgData, ArgSize, + [](ExecutorAddr HeaderAddr, + std::vector &Inits) { + return ELFNixPlatformRuntimeState::get().registerInits( + HeaderAddr, std::move(Inits)); + }) + .release(); +} + +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult 
+__orc_rt_elfnix_deregister_init_sections(char *ArgData, size_t ArgSize) { + return WrapperFunction)>:: + handle(ArgData, ArgSize, + [](ExecutorAddr HeaderAddr, + std::vector &Inits) { + return ELFNixPlatformRuntimeState::get().deregisterInits( + HeaderAddr, std::move(Inits)); + }) + .release(); } /// Wrapper function for registering metadata on a per-object basis. diff --git a/compiler-rt/lib/orc/elfnix_platform.h b/compiler-rt/lib/orc/elfnix_platform.h index 3efac4b2327f3..5ecbdf0cb9c86 100644 --- a/compiler-rt/lib/orc/elfnix_platform.h +++ b/compiler-rt/lib/orc/elfnix_platform.h @@ -37,26 +37,10 @@ struct ELFNixPerObjectSectionsToRegister { ExecutorAddrRange ThreadDataSection; }; -struct ELFNixJITDylibInitializers { - using SectionList = std::vector; +using ELFNixJITDylibDepInfo = std::vector; - ELFNixJITDylibInitializers() = default; - ELFNixJITDylibInitializers(std::string Name, ExecutorAddr DSOHandleAddress) - : Name(std::move(Name)), DSOHandleAddress(std::move(DSOHandleAddress)) {} - - std::string Name; - ExecutorAddr DSOHandleAddress; - - std::vector> InitSections; -}; - -class ELFNixJITDylibDeinitializers {}; - -using ELFNixJITDylibInitializerSequence = - std::vector; - -using ELFNixJITDylibDeinitializerSequence = - std::vector; +using ELFNixJITDylibDepInfoMap = + std::unordered_map; enum dlopen_mode : int { ORC_RT_RTLD_LAZY = 0x1, @@ -94,37 +78,9 @@ class SPSSerializationTraits>; - -using SPSELFNixJITDylibInitializers = - SPSTuple; - -using SPSELFNixJITDylibInitializerSequence = - SPSSequence; - -/// Serialization traits for ELFNixJITDylibInitializers. 
-template <> -class SPSSerializationTraits { -public: - static size_t size(const elfnix::ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::size( - MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } - - static bool serialize(SPSOutputBuffer &OB, - const elfnix::ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::serialize( - OB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } - - static bool deserialize(SPSInputBuffer &IB, - elfnix::ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::deserialize( - IB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } -}; +using SPSELFNixJITDylibDepInfo = SPSSequence; +using SPSELFNixJITDylibDepInfoMap = + SPSSequence>; } // namespace orc_rt diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp index 8cc3594b5d0cf..afd90c791ae13 100644 --- a/compiler-rt/lib/orc/macho_platform.cpp +++ b/compiler-rt/lib/orc/macho_platform.cpp @@ -17,6 +17,7 @@ #include "error.h" #include "interval_map.h" #include "jit_dispatch.h" +#include "record_section_tracker.h" #include "wrapper_function_utils.h" #include @@ -168,93 +169,6 @@ class MachOPlatformRuntimeState { using AtExitsVector = std::vector; - /// Used to manage sections of fixed-sized metadata records (e.g. pointer - /// sections, selector refs, etc.) - template class RecordSectionsTracker { - public: - /// Add a section to the "new" list. - void add(span Sec) { New.push_back(std::move(Sec)); } - - /// Returns true if there are new sections to process. - bool hasNewSections() const { return !New.empty(); } - - /// Returns the number of new sections to process. - size_t numNewSections() const { return New.size(); } - - /// Process all new sections. 
- template - std::enable_if_t>>> - processNewSections(ProcessSectionFunc &&ProcessSection) { - for (auto &Sec : New) - ProcessSection(Sec); - moveNewToProcessed(); - } - - /// Proces all new sections with a fallible handler. - /// - /// Successfully handled sections will be moved to the Processed - /// list. - template - std::enable_if_t< - std::is_same_v>>, - Error> - processNewSections(ProcessSectionFunc &&ProcessSection) { - for (size_t I = 0; I != New.size(); ++I) { - if (auto Err = ProcessSection(New[I])) { - for (size_t J = 0; J != I; ++J) - Processed.push_back(New[J]); - New.erase(New.begin(), New.begin() + I); - return Err; - } - } - moveNewToProcessed(); - return Error::success(); - } - - /// Move all sections back to New for reprocessing. - void reset() { - moveNewToProcessed(); - New = std::move(Processed); - } - - /// Remove the section with the given range. - bool removeIfPresent(ExecutorAddrRange R) { - if (removeIfPresent(New, R)) - return true; - return removeIfPresent(Processed, R); - } - - private: - void moveNewToProcessed() { - if (Processed.empty()) - Processed = std::move(New); - else { - Processed.reserve(Processed.size() + New.size()); - std::copy(New.begin(), New.end(), std::back_inserter(Processed)); - New.clear(); - } - } - - bool removeIfPresent(std::vector> &V, - ExecutorAddrRange R) { - auto RI = std::find_if( - V.rbegin(), V.rend(), - [RS = R.toSpan()](const span &E) { - return E.data() == RS.data(); - }); - if (RI != V.rend()) { - V.erase(std::next(RI).base()); - return true; - } - return false; - } - - std::vector> Processed; - std::vector> New; - }; - struct UnwindSections { UnwindSections(const UnwindSectionInfo &USI) : DwarfSection(USI.DwarfSection.toSpan()), diff --git a/compiler-rt/lib/orc/record_section_tracker.h b/compiler-rt/lib/orc/record_section_tracker.h new file mode 100644 index 0000000000000..1ac729a953919 --- /dev/null +++ b/compiler-rt/lib/orc/record_section_tracker.h @@ -0,0 +1,113 @@ +//===- 
record_section_tracker.h -- for fixed-sized record sects -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RecordSectionsTracker: Responsible for managing sections of metadata records +// with fixed sizes. +// +//===----------------------------------------------------------------------===// + +#ifndef ORC_RT_RECORD_SECTION_TRACKER_H +#define ORC_RT_RECORD_SECTION_TRACKER_H + +#include "error.h" +#include "executor_address.h" +#include +#include + +namespace orc_rt { + +/// Used to manage sections of fixed-sized metadata records (e.g. pointer +/// sections, selector refs, etc.) +template class RecordSectionsTracker { +public: + /// Add a section to the "new" list. + void add(span Sec) { New.push_back(std::move(Sec)); } + + /// Returns true if there are new sections to process. + bool hasNewSections() const { return !New.empty(); } + + /// Returns the number of new sections to process. + size_t numNewSections() const { return New.size(); } + + /// Process all new sections. + template + std::enable_if_t>>> + processNewSections(ProcessSectionFunc &&ProcessSection) { + for (auto &Sec : New) + ProcessSection(Sec); + moveNewToProcessed(); + } + + /// Proces all new sections with a fallible handler. + /// + /// Successfully handled sections will be moved to the Processed + /// list. 
+ template + std::enable_if_t< + std::is_same_v< + Error, std::invoke_result_t>>, + Error> + processNewSections(ProcessSectionFunc &&ProcessSection) { + for (size_t I = 0; I != New.size(); ++I) { + if (auto Err = ProcessSection(New[I])) { + for (size_t J = 0; J != I; ++J) + Processed.push_back(New[J]); + New.erase(New.begin(), New.begin() + I); + return Err; + } + } + moveNewToProcessed(); + return Error::success(); + } + + /// Move all sections back to New for reprocessing. + void reset() { + moveNewToProcessed(); + New = std::move(Processed); + } + + /// Remove the section with the given range. + bool removeIfPresent(ExecutorAddrRange R) { + if (removeIfPresent(New, R)) + return true; + return removeIfPresent(Processed, R); + } + +private: + void moveNewToProcessed() { + if (Processed.empty()) + Processed = std::move(New); + else { + Processed.reserve(Processed.size() + New.size()); + std::copy(New.begin(), New.end(), std::back_inserter(Processed)); + New.clear(); + } + } + + bool removeIfPresent(std::vector> &V, + ExecutorAddrRange R) { + auto RI = std::find_if( + V.rbegin(), V.rend(), + [RS = R.toSpan()](const span &E) { + return E.data() == RS.data(); + }); + if (RI != V.rend()) { + V.erase(std::next(RI).base()); + return true; + } + return false; + } + + std::vector> Processed; + std::vector> New; +}; + +} // namespace orc_rt + +#endif // ORC_RT_RECORD_SECTION_TRACKER_H diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 64ed8b62e9eba..bad4cc71801ec 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -196,7 +196,7 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { } return 0; } -#elif defined(__ELF__) || defined(_WIN32) +#elif defined(__ELF__) || defined(_WIN32) || defined(_AIX) #define INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR \ INSTR_PROF_CONCAT(INSTR_PROF_PROFILE_COUNTER_BIAS_VAR, _default) @@ -862,7 +862,7 @@ 
static int parseFilenamePattern(const char *FilenamePat, __llvm_profile_disable_continuous_mode(); return -1; } -#if defined(__APPLE__) || defined(__ELF__) || defined(_WIN32) +#if defined(__APPLE__) || defined(__ELF__) || defined(_WIN32) || defined(_AIX) __llvm_profile_set_page_size(getpagesize()); __llvm_profile_enable_continuous_mode(); #else diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp index 9cc7214aef85c..63b0ca28a1f40 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp @@ -157,22 +157,38 @@ INTERCEPTOR(int, creat64, const char *path, mode_t mode) { INTERCEPTOR(int, fcntl, int filedes, int cmd, ...) { __rtsan_notify_intercepted_call("fcntl"); - va_list args; - va_start(args, cmd); - // Following precedent here. The linux source (fcntl.c, do_fcntl) accepts the // final argument in a variable that will hold the largest of the possible - // argument types (pointers and ints are typical in fcntl) It is then assumed - // that the implementation of fcntl will cast it properly depending on cmd. + // argument types. It is then assumed that the implementation of fcntl will + // cast it properly depending on cmd. // - // This is also similar to what is done in - // sanitizer_common/sanitizer_common_syscalls.inc - const unsigned long arg = va_arg(args, unsigned long); - int result = REAL(fcntl)(filedes, cmd, arg); + // The two types we expect for possible args are `struct flock*` and `int` + // we will cast to `intptr_t` which should hold both comfortably. + // Why `intptr_t`? It should fit both types, and it follows the freeBSD + // approach linked below. + using arg_type = intptr_t; + static_assert(sizeof(arg_type) >= sizeof(struct flock *)); + static_assert(sizeof(arg_type) >= sizeof(int)); + + // Some cmds will not actually have an argument passed in this va_list. 
+ // Calling va_arg when no arg exists is UB, however all currently + // supported architectures will give us a result in all three cases + // (no arg/int arg/struct flock* arg) + // va_arg() will generally read the next argument register or the + // stack. If we ever support an arch like CHERI with bounds checking, we + // may have to re-evaluate this approach. + // + // More discussion, and other examples following this approach + // https://discourse.llvm.org/t/how-to-write-an-interceptor-for-fcntl/81203 + // https://reviews.freebsd.org/D46403 + // https://github.com/bminor/glibc/blob/c444cc1d8335243c5c4e636d6a26c472df85522c/sysdeps/unix/sysv/linux/fcntl64.c#L37-L46 + va_list args; + va_start(args, cmd); + const arg_type arg = va_arg(args, arg_type); va_end(args); - return result; + return REAL(fcntl)(filedes, cmd, arg); } #if SANITIZER_INTERCEPT_FCNTL64 diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 7898af4a335e3..139c80b4f4a53 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -41,6 +41,7 @@ #include "sanitizer_errno.h" #include "sanitizer_placement_new.h" #include "sanitizer_platform_interceptors.h" +#include "sanitizer_platform_limits_posix.h" #include "sanitizer_symbolizer.h" #include "sanitizer_tls_get_addr.h" @@ -3473,23 +3474,27 @@ INTERCEPTOR(uptr, ptrace, int request, int pid, void *addr, void *data) { COMMON_INTERCEPTOR_ENTER(ctx, ptrace, request, pid, addr, data); __sanitizer_iovec local_iovec; - if (data) { + void *data_arg = ptrace_data_arg(request, addr, data); + if (data_arg) { if (request == ptrace_setregs) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, data, struct_user_regs_struct_sz); + COMMON_INTERCEPTOR_READ_RANGE(ctx, data_arg, struct_user_regs_struct_sz); } else if (request == ptrace_setfpregs) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, 
data, struct_user_fpregs_struct_sz); + COMMON_INTERCEPTOR_READ_RANGE(ctx, data_arg, + struct_user_fpregs_struct_sz); } else if (request == ptrace_setfpxregs) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, data, struct_user_fpxregs_struct_sz); + COMMON_INTERCEPTOR_READ_RANGE(ctx, data_arg, + struct_user_fpxregs_struct_sz); } else if (request == ptrace_setvfpregs) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, data, struct_user_vfpregs_struct_sz); + COMMON_INTERCEPTOR_READ_RANGE(ctx, data_arg, + struct_user_vfpregs_struct_sz); } else if (request == ptrace_setsiginfo) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, data, siginfo_t_sz); + COMMON_INTERCEPTOR_READ_RANGE(ctx, data_arg, siginfo_t_sz); - // Some kernel might zero the iovec::iov_base in case of invalid - // write access. In this case copy the invalid address for further - // inspection. + // Some kernel might zero the iovec::iov_base in case of invalid + // write access. In this case copy the invalid address for further + // inspection. } else if (request == ptrace_setregset || request == ptrace_getregset) { - __sanitizer_iovec *iovec = (__sanitizer_iovec*)data; + __sanitizer_iovec *iovec = (__sanitizer_iovec *)data_arg; COMMON_INTERCEPTOR_READ_RANGE(ctx, iovec, sizeof(*iovec)); local_iovec = *iovec; if (request == ptrace_setregset) @@ -3502,23 +3507,26 @@ INTERCEPTOR(uptr, ptrace, int request, int pid, void *addr, void *data) { // https://github.com/google/sanitizers/issues/321. uptr res = REAL(ptrace)(request, pid, addr, data); - if (!res && data) { + if (!res && data_arg) { // Note that PEEK* requests assign different meaning to the return value. // This function does not handle them (nor does it need to). 
if (request == ptrace_getregs) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, struct_user_regs_struct_sz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, struct_user_regs_struct_sz); } else if (request == ptrace_getfpregs) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, struct_user_fpregs_struct_sz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, + struct_user_fpregs_struct_sz); } else if (request == ptrace_getfpxregs) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, struct_user_fpxregs_struct_sz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, + struct_user_fpxregs_struct_sz); } else if (request == ptrace_getvfpregs) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, struct_user_vfpregs_struct_sz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, + struct_user_vfpregs_struct_sz); } else if (request == ptrace_getsiginfo) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, siginfo_t_sz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, siginfo_t_sz); } else if (request == ptrace_geteventmsg) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data, sizeof(unsigned long)); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, data_arg, sizeof(unsigned long)); } else if (request == ptrace_getregset) { - __sanitizer_iovec *iovec = (__sanitizer_iovec*)data; + __sanitizer_iovec *iovec = (__sanitizer_iovec *)data_arg; COMMON_INTERCEPTOR_WRITE_RANGE(ctx, iovec, sizeof(*iovec)); COMMON_INTERCEPTOR_WRITE_RANGE(ctx, local_iovec.iov_base, local_iovec.iov_len); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_syscalls.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_syscalls.inc index 14615f9668dea..29fe4721ba40d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_syscalls.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_syscalls.inc @@ -48,6 +48,7 @@ #if SANITIZER_LINUX # include "sanitizer_libc.h" +# include "sanitizer_platform_limits_posix.h" # define PRE_SYSCALL(name) \ SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_syscall_pre_impl_##name @@ -2530,18 +2531,19 @@ 
PRE_SYSCALL(ptrace)(long request, long pid, long addr, long data) { # if !SANITIZER_ANDROID && \ (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ defined(__powerpc64__) || defined(__aarch64__) || defined(__s390__) || \ - defined(__loongarch__) || SANITIZER_RISCV64) - if (data) { + defined(__loongarch__) || SANITIZER_RISCV64 || defined(__sparc__)) + long data_arg = ptrace_data_arg(request, addr, data); + if (data_arg) { if (request == ptrace_setregs) { - PRE_READ((void *)data, struct_user_regs_struct_sz); + PRE_READ((void *)data_arg, struct_user_regs_struct_sz); } else if (request == ptrace_setfpregs) { - PRE_READ((void *)data, struct_user_fpregs_struct_sz); + PRE_READ((void *)data_arg, struct_user_fpregs_struct_sz); } else if (request == ptrace_setfpxregs) { - PRE_READ((void *)data, struct_user_fpxregs_struct_sz); + PRE_READ((void *)data_arg, struct_user_fpxregs_struct_sz); } else if (request == ptrace_setsiginfo) { - PRE_READ((void *)data, siginfo_t_sz); + PRE_READ((void *)data_arg, siginfo_t_sz); } else if (request == ptrace_setregset) { - __sanitizer_iovec *iov = (__sanitizer_iovec *)data; + __sanitizer_iovec *iov = (__sanitizer_iovec *)data_arg; PRE_READ(iov->iov_base, iov->iov_len); } } @@ -2552,25 +2554,26 @@ POST_SYSCALL(ptrace)(long res, long request, long pid, long addr, long data) { # if !SANITIZER_ANDROID && \ (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ defined(__powerpc64__) || defined(__aarch64__) || defined(__s390__) || \ - defined(__loongarch__) || SANITIZER_RISCV64) - if (res >= 0 && data) { + defined(__loongarch__) || SANITIZER_RISCV64 || defined(__sparc__)) + long data_arg = ptrace_data_arg(request, addr, data); + if (res >= 0 && data_arg) { // Note that this is different from the interceptor in // sanitizer_common_interceptors.inc. // PEEK* requests return resulting values through data pointer. 
if (request == ptrace_getregs) { - POST_WRITE((void *)data, struct_user_regs_struct_sz); + POST_WRITE((void *)data_arg, struct_user_regs_struct_sz); } else if (request == ptrace_getfpregs) { - POST_WRITE((void *)data, struct_user_fpregs_struct_sz); + POST_WRITE((void *)data_arg, struct_user_fpregs_struct_sz); } else if (request == ptrace_getfpxregs) { - POST_WRITE((void *)data, struct_user_fpxregs_struct_sz); + POST_WRITE((void *)data_arg, struct_user_fpxregs_struct_sz); } else if (request == ptrace_getsiginfo) { - POST_WRITE((void *)data, siginfo_t_sz); + POST_WRITE((void *)data_arg, siginfo_t_sz); } else if (request == ptrace_getregset) { - __sanitizer_iovec *iov = (__sanitizer_iovec *)data; + __sanitizer_iovec *iov = (__sanitizer_iovec *)data_arg; POST_WRITE(iov->iov_base, iov->iov_len); } else if (request == ptrace_peekdata || request == ptrace_peektext || request == ptrace_peekuser) { - POST_WRITE((void *)data, sizeof(void *)); + POST_WRITE((void *)data_arg, sizeof(void *)); } } # endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index d4cc380f641b8..272e4a02e6a7f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -279,8 +279,9 @@ #if SI_LINUX_NOT_ANDROID && \ (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \ - defined(__s390__) || defined(__loongarch__) || SANITIZER_RISCV64) -#define SANITIZER_INTERCEPT_PTRACE 1 + defined(__s390__) || defined(__loongarch__) || SANITIZER_RISCV64 || \ + defined(__sparc__)) +# define SANITIZER_INTERCEPT_PTRACE 1 #else #define SANITIZER_INTERCEPT_PTRACE 0 #endif @@ -606,13 +607,13 @@ // FIXME: also available from musl 1.2.5 #define SANITIZER_INTERCEPT_PREADV2 (SI_LINUX && __GLIBC_PREREQ(2, 26)) #define SANITIZER_INTERCEPT_PWRITEV2 
(SI_LINUX && __GLIBC_PREREQ(2, 26)) -#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130000 -# define SI_MAC_DEPLOYMENT_BELOW_13_00 1 +#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ + __MAC_OS_X_VERSION_MIN_REQUIRED >= 130000 +# define SI_MAC_OS_DEPLOYMENT_MIN_13_00 1 #else -# define SI_MAC_DEPLOYMENT_BELOW_13_00 0 +# define SI_MAC_OS_DEPLOYMENT_MIN_13_00 0 #endif -#define SANITIZER_INTERCEPT_FREADLINK (SI_MAC && !SI_MAC_DEPLOYMENT_BELOW_13_00) +#define SANITIZER_INTERCEPT_FREADLINK (SI_MAC && SI_MAC_OS_DEPLOYMENT_MIN_13_00) // This macro gives a way for downstream users to override the above // interceptor macros irrespective of the platform they are on. They have diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp index 5eeb2a89efa8c..c87d5ef42c924 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -94,8 +94,9 @@ #if SANITIZER_LINUX # include # include -# if defined(__mips64) || defined(__aarch64__) || defined(__arm__) || \ - defined(__hexagon__) || defined(__loongarch__) ||SANITIZER_RISCV64 +# if defined(__mips64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__hexagon__) || defined(__loongarch__) || SANITIZER_RISCV64 || \ + defined(__sparc__) # include # ifdef __arm__ typedef struct user_fpregs elf_fpregset_t; @@ -359,11 +360,12 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); const int wordexp_wrde_dooffs = WRDE_DOOFFS; # endif // !SANITIZER_ANDROID -#if SANITIZER_LINUX && !SANITIZER_ANDROID && \ - (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ - defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \ - defined(__s390__) || defined(__loongarch__)|| SANITIZER_RISCV64) -#if defined(__mips64) || defined(__powerpc64__) || defined(__arm__) +# 
if SANITIZER_LINUX && !SANITIZER_ANDROID && \ + (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ + defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \ + defined(__s390__) || defined(__loongarch__) || SANITIZER_RISCV64 || \ + defined(__sparc__)) +# if defined(__mips64) || defined(__powerpc64__) || defined(__arm__) unsigned struct_user_regs_struct_sz = sizeof(struct pt_regs); unsigned struct_user_fpregs_struct_sz = sizeof(elf_fpregset_t); #elif SANITIZER_RISCV64 @@ -378,19 +380,22 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); #elif defined(__s390__) unsigned struct_user_regs_struct_sz = sizeof(struct _user_regs_struct); unsigned struct_user_fpregs_struct_sz = sizeof(struct _user_fpregs_struct); -#else +# elif defined(__sparc__) + unsigned struct_user_regs_struct_sz = sizeof(struct sunos_regs); + unsigned struct_user_fpregs_struct_sz = sizeof(struct sunos_fp); +# else unsigned struct_user_regs_struct_sz = sizeof(struct user_regs_struct); unsigned struct_user_fpregs_struct_sz = sizeof(struct user_fpregs_struct); -#endif // __mips64 || __powerpc64__ || __aarch64__ || __loongarch__ -#if defined(__x86_64) || defined(__mips64) || defined(__powerpc64__) || \ - defined(__aarch64__) || defined(__arm__) || defined(__s390__) || \ - defined(__loongarch__) || SANITIZER_RISCV64 +# endif // __mips64 || __powerpc64__ || __aarch64__ || __loongarch__ +# if defined(__x86_64) || defined(__mips64) || defined(__powerpc64__) || \ + defined(__aarch64__) || defined(__arm__) || defined(__s390__) || \ + defined(__loongarch__) || SANITIZER_RISCV64 || defined(__sparc__) unsigned struct_user_fpxregs_struct_sz = 0; #else unsigned struct_user_fpxregs_struct_sz = sizeof(struct user_fpxregs_struct); #endif // __x86_64 || __mips64 || __powerpc64__ || __aarch64__ || __arm__ -// || __s390__ || __loongarch__ -#ifdef __arm__ + // || __s390__ || __loongarch__ || SANITIZER_RISCV64 || __sparc__ +# ifdef __arm__ unsigned struct_user_vfpregs_struct_sz = 
ARM_VFPREGS_SIZE; #else unsigned struct_user_vfpregs_struct_sz = 0; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index ca03841ccc198..e8c81aa8e2816 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -855,10 +855,11 @@ typedef void __sanitizer_FILE; # define SANITIZER_HAS_STRUCT_FILE 0 #endif -#if SANITIZER_LINUX && !SANITIZER_ANDROID && \ - (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ - defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \ - defined(__s390__) || defined(__loongarch__) || SANITIZER_RISCV64) +# if SANITIZER_LINUX && !SANITIZER_ANDROID && \ + (defined(__i386) || defined(__x86_64) || defined(__mips64) || \ + defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \ + defined(__s390__) || defined(__loongarch__) || SANITIZER_RISCV64 || \ + defined(__sparc__)) extern unsigned struct_user_regs_struct_sz; extern unsigned struct_user_fpregs_struct_sz; extern unsigned struct_user_fpxregs_struct_sz; @@ -880,9 +881,24 @@ extern int ptrace_setsiginfo; extern int ptrace_getregset; extern int ptrace_setregset; extern int ptrace_geteventmsg; -#endif -#if SANITIZER_LINUX && !SANITIZER_ANDROID +// Helper for the ptrace interceptor. +template +inline T ptrace_data_arg(int request, T addr, T data) { +# if SANITIZER_LINUX && SANITIZER_SPARC + // As described in ptrace(2), the meanings of addr and data are reversed + // for the PTRACE_GETREGS, PTRACE_GETFPREGS, PTRACE_SETREGS, and + // PTRACE_SETFPREGS requests on Linux/sparc64.
+ if (request == ptrace_getregs || request == ptrace_getfpregs || + request == ptrace_setregs || request == ptrace_setfpregs) + return addr; + else +# endif + return data; +} +# endif + +# if SANITIZER_LINUX && !SANITIZER_ANDROID extern unsigned struct_shminfo_sz; extern unsigned struct_shm_info_sz; extern int shmctl_ipc_stat; diff --git a/compiler-rt/test/asan/TestCases/Linux/ptrace.cpp b/compiler-rt/test/asan/TestCases/Linux/ptrace.cpp index e01021ff344c3..edff30e5a4753 100644 --- a/compiler-rt/test/asan/TestCases/Linux/ptrace.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/ptrace.cpp @@ -81,6 +81,13 @@ typedef __riscv_q_ext_state fpregs_struct; #define PRINT_REG_PC(__regs) printf("%lx\n", (unsigned long)(__regs.pc)) #define PRINT_REG_FP(__fpregs) printf("%lx\n", (unsigned long)(__fpregs.fcsr)) #define ARCH_IOVEC_FOR_GETREGSET + +#elif defined(__sparc__) +typedef sunos_regs regs_struct; +typedef sunos_fp fpregs_struct; +# define PRINT_REG_PC(__regs) printf("%x\n", (unsigned)(__regs.pc)) +# define PRINT_REG_FP(__fpregs) printf("%x\n", (unsigned)(__fpregs.fsr)) +# define __PTRACE_FPREQUEST PTRACE_GETFPREGS #endif @@ -110,7 +117,13 @@ int main(void) { regset_io.iov_len = sizeof(regs_struct); #else # define __PTRACE_REQUEST PTRACE_GETREGS -# define __PTRACE_ARGS NULL, pregs +# ifdef __sparc__ + // The meanings of addr and data are reversed for a few requests on + // Linux/sparc64. +# define __PTRACE_ARGS pregs, NULL +# else +# define __PTRACE_ARGS NULL, pregs +# endif #endif res = ptrace((enum __ptrace_request)__PTRACE_REQUEST, pid, __PTRACE_ARGS); // CHECK: AddressSanitizer: stack-buffer-overflow @@ -127,7 +140,13 @@ int main(void) { res = ptrace((enum __ptrace_request)PTRACE_GETREGSET, pid, (void*)NT_FPREGSET, (void*)®set_io); #else -# define __PTRACE_FPARGS NULL, &fpregs + // The meanings of addr and data are reversed for a few requests on + // Linux/sparc64. 
+# ifdef __sparc__ +# define __PTRACE_FPARGS &fpregs, NULL +# else +# define __PTRACE_FPARGS NULL, &fpregs +# endif #endif res = ptrace((enum __ptrace_request)__PTRACE_FPREQUEST, pid, __PTRACE_FPARGS); assert(!res); diff --git a/compiler-rt/test/profile/ContinuousSyncMode/runtime-counter-relocation.c b/compiler-rt/test/profile/ContinuousSyncMode/runtime-counter-relocation.c index 19a7aae70cb0d..6ec9077f4d614 100644 --- a/compiler-rt/test/profile/ContinuousSyncMode/runtime-counter-relocation.c +++ b/compiler-rt/test/profile/ContinuousSyncMode/runtime-counter-relocation.c @@ -1,11 +1,11 @@ -// REQUIRES: target={{.*(linux|solaris|windows-msvc).*}} +// REQUIRES: target={{.*(linux|solaris|windows-msvc|aix).*}} // RUN: %clang -fprofile-instr-generate -fcoverage-mapping -mllvm -runtime-counter-relocation=true -o %t.exe %s // RUN: echo "garbage" > %t.profraw // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe // RUN: llvm-profdata show --counts --all-functions %t.profraw | FileCheck %s -check-prefix=CHECK-COUNTS // RUN: llvm-profdata merge -o %t.profdata %t.profraw -// RUN: llvm-cov report %t.exe -instr-profile %t.profdata | FileCheck %s -check-prefix=CHECK-COVERAGE +// RUN: %if !target={{.*aix.*}} %{ llvm-cov report %t.exe -instr-profile %t.profdata | FileCheck %s -check-prefix=CHECK-COVERAGE %} // CHECK-COUNTS: Counters: // CHECK-COUNTS-NEXT: main: diff --git a/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c b/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c index 53609f5838f75..c7eb27057a923 100644 --- a/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c +++ b/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c @@ -1,4 +1,4 @@ -// REQUIRES: target={{.*(darwin|linux|solaris).*}} +// REQUIRES: target={{.*(darwin|linux|solaris|aix).*}} // Test using __llvm_profile_set_file_object in continuous mode (%c). // Create & cd into a temporary directory. 
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 87716731ead85..e288fdeec6cd2 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -765,7 +765,7 @@ This phase currently supports all the intrinsic procedures listed above but the | Coarray intrinsic functions | COSHAPE | | Object characteristic inquiry functions | ALLOCATED, ASSOCIATED, EXTENDS_TYPE_OF, IS_CONTIGUOUS, PRESENT, RANK, SAME_TYPE, STORAGE_SIZE | | Type inquiry intrinsic functions | BIT_SIZE, DIGITS, EPSILON, HUGE, KIND, MAXEXPONENT, MINEXPONENT, NEW_LINE, PRECISION, RADIX, RANGE, TINY| -| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC | +| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC, GETUID, GETGID | | Intrinsic subroutines |MVBITS (elemental), CPU_TIME, DATE_AND_TIME, EVENT_QUERY, EXECUTE_COMMAND_LINE, GET_COMMAND, GET_COMMAND_ARGUMENT, GET_ENVIRONMENT_VARIABLE, MOVE_ALLOC, RANDOM_INIT, RANDOM_NUMBER, RANDOM_SEED, SIGNAL, SLEEP, SYSTEM, SYSTEM_CLOCK | | Atomic intrinsic subroutines | ATOMIC_ADD | | Collective intrinsic subroutines | CO_REDUCE | diff --git a/flang/include/flang/Evaluate/target.h b/flang/include/flang/Evaluate/target.h index d076fcbf08307..b347c549e012d 100644 --- 
a/flang/include/flang/Evaluate/target.h +++ b/flang/include/flang/Evaluate/target.h @@ -102,6 +102,11 @@ class TargetCharacteristics { bool isPPC() const { return isPPC_; } void set_isPPC(bool isPPC = false); + bool isOSWindows() const { return isOSWindows_; } + void set_isOSWindows(bool isOSWindows = false) { + isOSWindows_ = isOSWindows; + }; + IeeeFeatures &ieeeFeatures() { return ieeeFeatures_; } const IeeeFeatures &ieeeFeatures() const { return ieeeFeatures_; } @@ -111,6 +116,7 @@ class TargetCharacteristics { std::uint8_t align_[common::TypeCategory_enumSize][maxKind]{}; bool isBigEndian_{false}; bool isPPC_{false}; + bool isOSWindows_{false}; bool areSubnormalsFlushedToZero_{false}; Rounding roundingMode_{defaultRounding}; std::size_t procedurePointerByteSize_{8}; diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 78bb82b17d405..b2da6138fc9d8 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -256,6 +256,10 @@ struct IntrinsicLibrary { llvm::ArrayRef args); void genGetCommandArgument(mlir::ArrayRef args); void genGetEnvironmentVariable(llvm::ArrayRef); + mlir::Value genGetGID(mlir::Type resultType, + llvm::ArrayRef args); + mlir::Value genGetUID(mlir::Type resultType, + llvm::ArrayRef args); fir::ExtendedValue genIall(mlir::Type, llvm::ArrayRef); mlir::Value genIand(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genIany(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h index 240de5a899d37..49d8249d6bcbc 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -47,6 +47,9 @@ void genDateAndTime(fir::FirOpBuilder &, mlir::Location, void genEtime(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value values, 
mlir::Value time); +mlir::Value genGetUID(fir::FirOpBuilder &, mlir::Location); +mlir::Value genGetGID(fir::FirOpBuilder &, mlir::Location); + void genRandomInit(fir::FirOpBuilder &, mlir::Location, mlir::Value repeatable, mlir::Value imageDistinct); void genRandomNumber(fir::FirOpBuilder &, mlir::Location, mlir::Value harvest); diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index fef651f3b2eed..6a842bafc155f 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -20,6 +20,14 @@ #include #include +#ifdef _WIN32 +// UID and GID don't exist on Windows, these exist to avoid errors. +typedef std::uint32_t uid_t; +typedef std::uint32_t gid_t; +#else +#include "sys/types.h" //pid_t +#endif + extern "C" { // CALL FLUSH(n) antedates the Fortran 2003 FLUSH statement. @@ -35,6 +43,12 @@ std::int32_t FORTRAN_PROCEDURE_NAME(iargc)(); void FORTRAN_PROCEDURE_NAME(getarg)( std::int32_t &n, char *arg, std::int64_t length); +// Calls getgid() +gid_t RTNAME(GetGID)(); + +// Calls getuid() +uid_t RTNAME(GetUID)(); + // GNU extension subroutine GETLOG(C). void FORTRAN_PROCEDURE_NAME(getlog)(char *name, std::int64_t length); diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index c8d32e8e87cf1..f52b5ddaa8d49 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -59,6 +59,9 @@ namespace Fortran::tools { if (targetTriple.isPPC()) targetCharacteristics.set_isPPC(true); + if (targetTriple.isOSWindows()) + targetCharacteristics.set_isOSWindows(true); + // TODO: use target machine data layout to set-up the target characteristics // type size and alignment info. 
} diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 17a09c080e72c..2b11b40e27ad1 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -523,7 +523,9 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {{"c", DefaultChar, Rank::scalar, Optionality::required, common::Intent::Out}}, TypePattern{IntType, KindCode::greaterOrEqualToKind, 4}}, + {"getgid", {}, DefaultInt}, {"getpid", {}, DefaultInt}, + {"getuid", {}, DefaultInt}, {"huge", {{"x", SameIntOrReal, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, {ArgFlag::canBeMoldNull}}}, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 4e6d92213c124..6c5bd3b9417e8 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -293,7 +293,9 @@ static constexpr IntrinsicHandler handlers[]{ &I::genGetCwd, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, + {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, + {"getuid", &I::genGetUID}, {"iachar", &I::genIchar}, {"iall", &I::genIall, @@ -3650,6 +3652,14 @@ void IntrinsicLibrary::genGetCommand(llvm::ArrayRef args) { } } +// GETGID +mlir::Value IntrinsicLibrary::genGetGID(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0 && "getgid takes no input"); + return builder.createConvert(loc, resultType, + fir::runtime::genGetGID(builder, loc)); +} + // GETPID mlir::Value IntrinsicLibrary::genGetPID(mlir::Type resultType, llvm::ArrayRef args) { @@ -3658,6 +3668,14 @@ mlir::Value IntrinsicLibrary::genGetPID(mlir::Type resultType, fir::runtime::genGetPID(builder, loc)); } +// GETUID +mlir::Value IntrinsicLibrary::genGetUID(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0 && "getgid takes no input"); + return builder.createConvert(loc, resultType, + fir::runtime::genGetUID(builder, loc)); +} 
+ // GET_COMMAND_ARGUMENT void IntrinsicLibrary::genGetCommandArgument( llvm::ArrayRef args) { diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index aff3cadc3c300..6bdc7d8c6bc82 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -120,6 +120,22 @@ void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, runtimeFunc, args); } +mlir::Value fir::runtime::genGetGID(fir::FirOpBuilder &builder, + mlir::Location loc) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + + return builder.create(loc, runtimeFunc).getResult(0); +} + +mlir::Value fir::runtime::genGetUID(fir::FirOpBuilder &builder, + mlir::Location loc) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + + return builder.create(loc, runtimeFunc).getResult(0); +} + void fir::runtime::genRandomInit(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value repeatable, mlir::Value imageDistinct) { diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 4aa14ca2c2bdd..e52812fb320cb 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -14,7 +14,6 @@ #include "DebugTypeGenerator.h" #include "flang/Optimizer/CodeGen/DescriptorModel.h" -#include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Support/InternalNames.h" #include "mlir/Pass/Pass.h" #include "llvm/ADT/ScopeExit.h" @@ -48,7 +47,7 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m, mlir::SymbolTable *symbolTable_, const mlir::DataLayout &dl) : module(m), symbolTable(symbolTable_), dataLayout{&dl}, - kindMapping(getKindMapping(m)) { + kindMapping(getKindMapping(m)), llvmTypeConverter(m, false, false, dl) { LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n"); 
mlir::MLIRContext *context = module.getContext(); @@ -160,10 +159,91 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( dataLocation, /*rank=*/nullptr, allocated, associated); } +// If the type is a pointer or array type then gets its underlying type. +static mlir::LLVM::DITypeAttr getUnderlyingType(mlir::LLVM::DITypeAttr Ty) { + if (auto ptrTy = + mlir::dyn_cast_if_present(Ty)) { + if (ptrTy.getTag() == llvm::dwarf::DW_TAG_pointer_type) + Ty = getUnderlyingType(ptrTy.getBaseType()); + } + if (auto comTy = + mlir::dyn_cast_if_present(Ty)) { + if (comTy.getTag() == llvm::dwarf::DW_TAG_array_type) + Ty = getUnderlyingType(comTy.getBaseType()); + } + return Ty; +} + +// Currently, the handling of recursive debug type in mlir has some limitations. +// Those limitations were discussed at the end of the thread for following PR. +// https://github.com/llvm/llvm-project/pull/106571 +// +// Problem could be explained with the following example code: +// type t2 +// type(t1), pointer :: p1 +// end type +// type t1 +// type(t2), pointer :: p2 +// end type +// In the description below, type_self means a temporary type that is generated +// as a place holder while the members of that type are being processed. +// +// If we process t1 first then we will have the following structure after it has +// been processed. +// t1 -> t2 -> t1_self +// This is because when we started processing t2, we did not have the complete +// t1 but its place holder t1_self. +// Now if some entity requires t2, we will already have that in cache and will +// return it. But this t2 refers to t1_self and not to t1. In mlir handling, +// only those types are allowed to have _self reference which are wrapped by +// entity whose reference it is. So t1 -> t2 -> t1_self is ok because the +// t1_self reference can be resolved by the outer t1. But standalone t2 is not +// because there will be no way to resolve it. Until this is fixed in mlir, we +// avoid caching such types. 
Please see DebugTranslation::translateRecursive for +// details on how mlir handles recursive types. +static bool canCacheThisType(mlir::LLVM::DICompositeTypeAttr comTy) { + for (auto el : comTy.getElements()) { + if (auto mem = + mlir::dyn_cast_if_present(el)) { + mlir::LLVM::DITypeAttr memTy = getUnderlyingType(mem.getBaseType()); + if (auto baseTy = + mlir::dyn_cast_if_present( + memTy)) { + // We will not cache a type if one of its member meets the following + // conditions: + // 1. It is a structure type + // 2. It is a place holder type (getIsRecSelf() is true) + // 3. It is not a self reference. It is ok to have t1_self in t1. + if (baseTy.getTag() == llvm::dwarf::DW_TAG_structure_type && + baseTy.getIsRecSelf() && (comTy.getRecId() != baseTy.getRecId())) + return false; + } + } + } + return true; +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( fir::RecordType Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { + // Check if this type has already been converted. + auto iter = typeCache.find(Ty); + if (iter != typeCache.end()) + return iter->second; + + llvm::SmallVector elements; mlir::MLIRContext *context = module.getContext(); + auto recId = mlir::DistinctAttr::create(mlir::UnitAttr::get(context)); + // Generate a place holder TypeAttr which will be used if a member + // references the parent type. 
+ auto comAttr = mlir::LLVM::DICompositeTypeAttr::get( + context, recId, /*isRecSelf=*/true, llvm::dwarf::DW_TAG_structure_type, + mlir::StringAttr::get(context, ""), fileAttr, /*line=*/0, scope, + /*baseType=*/nullptr, mlir::LLVM::DIFlags::Zero, /*sizeInBits=*/0, + /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, + /*allocated=*/nullptr, /*associated=*/nullptr); + typeCache[Ty] = comAttr; + auto result = fir::NameUniquer::deconstruct(Ty.getName()); if (result.first != fir::NameUniquer::NameKind::DERIVED_TYPE) return genPlaceholderType(context); @@ -171,18 +251,18 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( fir::TypeInfoOp tiOp = symbolTable->lookup(Ty.getName()); unsigned line = (tiOp) ? getLineFromLoc(tiOp.getLoc()) : 1; - llvm::SmallVector elements; std::uint64_t offset = 0; for (auto [fieldName, fieldTy] : Ty.getTypeList()) { - auto result = fir::getTypeSizeAndAlignment(module.getLoc(), fieldTy, - *dataLayout, kindMapping); - // If we get a type whose size we can't determine, we will break the loop - // and generate the derived type with whatever components we have - // assembled thus far. 
- if (!result) - break; - auto [byteSize, byteAlign] = *result; + mlir::Type llvmTy; + if (auto boxTy = mlir::dyn_cast_or_null(fieldTy)) + llvmTy = + llvmTypeConverter.convertBoxTypeAsStruct(boxTy, getBoxRank(boxTy)); + else + llvmTy = llvmTypeConverter.convertType(fieldTy); + // FIXME: Handle non defaults array bound in derived types + uint64_t byteSize = dataLayout->getTypeSize(llvmTy); + unsigned short byteAlign = dataLayout->getTypeABIAlignment(llvmTy); mlir::LLVM::DITypeAttr elemTy = convertType(fieldTy, fileAttr, scope, /*declOp=*/nullptr); offset = llvm::alignTo(offset, byteAlign); @@ -195,12 +275,20 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( offset += llvm::alignTo(byteSize, byteAlign); } - return mlir::LLVM::DICompositeTypeAttr::get( - context, llvm::dwarf::DW_TAG_structure_type, + auto finalAttr = mlir::LLVM::DICompositeTypeAttr::get( + context, recId, /*isRecSelf=*/false, llvm::dwarf::DW_TAG_structure_type, mlir::StringAttr::get(context, result.second.name), fileAttr, line, scope, /*baseType=*/nullptr, mlir::LLVM::DIFlags::Zero, offset * 8, /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, /*allocated=*/nullptr, /*associated=*/nullptr); + if (canCacheThisType(finalAttr)) { + typeCache[Ty] = finalAttr; + } else { + auto iter = typeCache.find(Ty); + if (iter != typeCache.end()) + typeCache.erase(iter); + } + return finalAttr; } mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index e3220f18958df..b8a068e5ba148 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -14,6 +14,7 @@ #define FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H #include "flang/Optimizer/CodeGen/CGOps.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/FIRType.h" #include 
"flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" @@ -68,10 +69,12 @@ class DebugTypeGenerator { mlir::SymbolTable *symbolTable; const mlir::DataLayout *dataLayout; KindMapping kindMapping; + fir::LLVMTypeConverter llvmTypeConverter; std::uint64_t dimsSize; std::uint64_t dimsOffset; std::uint64_t ptrSize; std::uint64_t lenOffset; + llvm::DenseMap typeCache; }; } // namespace fir diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 71d1c083c3127..7c8427733e1db 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -2028,6 +2028,22 @@ bool CheckPPCIntrinsic(const Symbol &generic, const Symbol &specific, return false; } +bool CheckWindowsIntrinsic( + const Symbol &intrinsic, evaluate::FoldingContext &foldingContext) { + parser::ContextualMessages &messages{foldingContext.messages()}; + // TODO: there are other intrinsics that are unsupported on Windows that + // should be added here. + if (intrinsic.name() == "getuid") { + messages.Say( + "User IDs do not exist on Windows. This function will always return 1"_warn_en_US); + } + if (intrinsic.name() == "getgid") { + messages.Say( + "Group IDs do not exist on Windows. 
This function will always return 1"_warn_en_US); + } + return true; +} + bool CheckArguments(const characteristics::Procedure &proc, evaluate::ActualArguments &actuals, SemanticsContext &context, const Scope &scope, bool treatingExternalAsImplicit, diff --git a/flang/lib/Semantics/check-call.h b/flang/lib/Semantics/check-call.h index 8553f3a31efb5..46bc61a601bd3 100644 --- a/flang/lib/Semantics/check-call.h +++ b/flang/lib/Semantics/check-call.h @@ -41,6 +41,8 @@ bool CheckArguments(const evaluate::characteristics::Procedure &, bool CheckPPCIntrinsic(const Symbol &generic, const Symbol &specific, const evaluate::ActualArguments &actuals, evaluate::FoldingContext &context); +bool CheckWindowsIntrinsic( + const Symbol &intrinsic, evaluate::FoldingContext &context); bool CheckArgumentIsConstantExprInRange( const evaluate::ActualArguments &actuals, int index, int lowerBound, int upperBound, parser::ContextualMessages &messages); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 2943ee5dd7552..51341b3faf3a4 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1837,9 +1837,23 @@ inline void OmpStructureChecker::ErrIfLHSAndRHSSymbolsMatch( const Symbol &varSymbol = vSyms.front(); for (const Symbol &symbol : evaluate::GetSymbolVector(*e)) { if (varSymbol == symbol) { - context_.Say(expr.source, - "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, - var.GetSource().ToString()); + const Fortran::common::Indirection + *designator = std::get_if< + Fortran::common::Indirection>( + &expr.u); + if (designator) { + auto *z{var.typedExpr.get()}; + auto *c{expr.typedExpr.get()}; + if (z->v == c->v) { + context_.Say(expr.source, + "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, + var.GetSource()); + } + } else { + context_.Say(expr.source, + "RHS expression on atomic assignment statement cannot access 
'%s'"_err_en_US, + var.GetSource()); + } } } } diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 182ea5d441956..364f99d73f5cc 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2916,6 +2916,9 @@ auto ExpressionAnalyzer::GetCalleeAndArguments(const parser::Name &name, } else { resolution = symbol; } + if (resolution && context_.targetCharacteristics().isOSWindows()) { + semantics::CheckWindowsIntrinsic(*resolution, GetFoldingContext()); + } if (!resolution || resolution->attrs().test(semantics::Attr::INTRINSIC)) { auto name{resolution ? resolution->name() : ultimate.name()}; if (std::optional specificCall{context_.intrinsics().Probe( diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp index be3833db88b07..f2823ca770bc5 100644 --- a/flang/runtime/extensions.cpp +++ b/flang/runtime/extensions.cpp @@ -58,6 +58,24 @@ extern "C" { namespace Fortran::runtime { +gid_t RTNAME(GetGID)() { +#ifdef _WIN32 + // Group IDs don't exist on Windows, return 1 to avoid errors + return 1; +#else + return getgid(); +#endif +} + +uid_t RTNAME(GetUID)() { +#ifdef _WIN32 + // User IDs don't exist on Windows, return 1 to avoid errors + return 1; +#else + return getuid(); +#endif +} + void GetUsernameEnvVar(const char *envName, char *arg, std::int64_t length) { Descriptor name{*Descriptor::Create( 1, std::strlen(envName) + 1, const_cast(envName), 0)}; @@ -66,6 +84,7 @@ void GetUsernameEnvVar(const char *envName, char *arg, std::int64_t length) { RTNAME(GetEnvVariable) (name, &value, nullptr, false, nullptr, __FILE__, __LINE__); } + namespace io { // SUBROUTINE FLUSH(N) // FLUSH N diff --git a/flang/test/Integration/debug-cyclic-derived-type-2.f90 b/flang/test/Integration/debug-cyclic-derived-type-2.f90 new file mode 100644 index 0000000000000..c49c9d00957e8 --- /dev/null +++ b/flang/test/Integration/debug-cyclic-derived-type-2.f90 @@ -0,0 +1,22 @@ +! 
RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +! mainly test that this program does not cause an assertion failure +module m + type t2 + type(t1), pointer :: p1 + end type + type t1 + type(t2), pointer :: p2 + integer abc + end type + type(t1) :: tee1 +end module + +program test + use m + type(t2) :: lc2 + print *, lc2%p1%abc +end program test + +! CHECK-DAG: DICompositeType(tag: DW_TAG_structure_type, name: "t1"{{.*}}) +! CHECK-DAG: DICompositeType(tag: DW_TAG_structure_type, name: "t2"{{.*}}) diff --git a/flang/test/Integration/debug-cyclic-derived-type.f90 b/flang/test/Integration/debug-cyclic-derived-type.f90 index 03e06336a6e08..a26ffd19ef6b1 100644 --- a/flang/test/Integration/debug-cyclic-derived-type.f90 +++ b/flang/test/Integration/debug-cyclic-derived-type.f90 @@ -11,5 +11,11 @@ module m type(t2) :: v3 end module -! CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "t1"{{.*}}) -! CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "t2"{{.*}}) +! CHECK-DAG: ![[T1:[0-9]+]] = {{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "t1"{{.*}}elements: ![[T1_ELEMS:[0-9]+]]) +! CHECK-DAG: ![[T1_ELEMS]] = !{![[T1_ELEM1:[0-9]+]]} +! CHECK-DAG: ![[T1_ELEM1]] = !DIDerivedType(tag: DW_TAG_member, name: "p", baseType: ![[T2P:[0-9]+]]{{.*}}) +! CHECK-DAG: ![[T2P]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[T2:[0-9]+]]{{.*}}) + +! CHECK-DAG: ![[T2]] = {{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "t2"{{.*}}elements: ![[T2_ELEMS:[0-9]+]]) +! CHECK-DAG: ![[T2_ELEMS]] = !{![[T2_ELEM1:[0-9]+]]} +! CHECK-DAG: ![[T2_ELEM1]] = !DIDerivedType(tag: DW_TAG_member, name: "v1", baseType: ![[T1]]{{.*}}) diff --git a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt-read.f90 b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt-read.f90 new file mode 100644 index 0000000000000..6469b1bfb7847 --- /dev/null +++ b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt-read.f90 @@ -0,0 +1,41 @@ +! 
RUN: %flang_fc1 -fopenmp %s -o - + +integer :: x, vv(2), xx(2) +type t1 + integer :: v,y,yy(2) +end type t1 +type(t1)::t,tt(2) +x=1 +xx=1 +vv=1 +t%y=1 +t%yy=1 +tt(1)%y=1 +tt(1)%yy=1 +tt(2)%v=1 +tt(2)%y=1 +tt(2)%yy=1 + +!$omp atomic read + vv(1) = vv(2) +!$omp atomic read + t%v = t%y +!$omp atomic read + t%v = t%yy(1) +!$omp atomic read + tt(1)%v = tt(1)%y +!$omp atomic read + tt(1)%v = tt(2)%v +!$omp atomic read + tt(1)%v = tt(1)%yy(1) +!$omp atomic read + t%yy(2) = t%y +!$omp atomic read + t%yy(2) = t%yy(1) +!$omp atomic read + tt(1)%yy(2) = tt(1)%y +!$omp atomic read + tt(1)%yy(2) = tt(1)%yy(1) +!$omp atomic read + tt(1)%yy(2) = tt(2)%yy(2) +end diff --git a/flang/test/Semantics/windows.f90 b/flang/test/Semantics/windows.f90 new file mode 100644 index 0000000000000..8f9d1aa606c0a --- /dev/null +++ b/flang/test/Semantics/windows.f90 @@ -0,0 +1,12 @@ +! RUN: %python %S/test_errors.py %s %flang --target=x86_64-pc-windows-msvc -Werror +! RUN: %python %S/test_errors.py %s %flang --target=aarch64-pc-windows-msvc -Werror + +subroutine uid + !WARNING: User IDs do not exist on Windows. This function will always return 1 + i = getuid() +end subroutine uid + +subroutine gid + !WARNING: Group IDs do not exist on Windows. 
This function will always return 1 + i = getgid() +end subroutine gid diff --git a/flang/test/Transforms/debug-derived-type-1.fir b/flang/test/Transforms/debug-derived-type-1.fir index e453db6ae6fbb..26f7017f5f5a3 100644 --- a/flang/test/Transforms/debug-derived-type-1.fir +++ b/flang/test/Transforms/debug-derived-type-1.fir @@ -12,12 +12,18 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d %0 = fir.zero_bits !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> fir.has_value %0 : !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> } loc(#loc6) + fir.global @_QMtest_1Exyz : !fir.type<_QMtest_1Tsometype{m_array:!fir.array<3xi32>,m_vt1:!fir.box,height:f32}>>>,v2:i32,m_alloc:!fir.box>>,v3:i32,m_first:!fir.box>>,v4:i32,m_p1:!fir.box>,v5:i32,m_p2:!fir.box>,v6:i32,m_p3:!fir.box>>,v7:i32}> { + %0 = fir.zero_bits !fir.type<_QMtest_1Tsometype{m_array:!fir.array<3xi32>,m_vt1:!fir.box,height:f32}>>>,v2:i32,m_alloc:!fir.box>>,v3:i32,m_first:!fir.box>>,v4:i32,m_p1:!fir.box>,v5:i32,m_p2:!fir.box>,v6:i32,m_p3:!fir.box>>,v7:i32}> + fir.has_value %0 : !fir.type<_QMtest_1Tsometype{m_array:!fir.array<3xi32>,m_vt1:!fir.box,height:f32}>>>,v2:i32,m_alloc:!fir.box>>,v3:i32,m_first:!fir.box>>,v4:i32,m_p1:!fir.box>,v5:i32,m_p2:!fir.box>,v6:i32,m_p3:!fir.box>>,v7:i32}> + } loc(#loc12) fir.type_info @_QMt1Tt_t1 noinit nodestroy nofinal : !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> loc(#loc7) fir.type_info @_QMm_employeeTt_address noinit nodestroy nofinal : !fir.type<_QMm_employeeTt_address{house_number:i32}> loc(#loc1) fir.type_info @_QMm_employeeTt_person noinit nodestroy nofinal extends !fir.type<_QMm_employeeTt_address{house_number:i32}> : !fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}> 
loc(#loc2) fir.type_info @_QMm_employeeTt_date noinit nodestroy nofinal : !fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}> loc(#loc3) fir.type_info @_QMm_employeeTt_employee noinit nodestroy nofinal extends !fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}> : !fir.type<_QMm_employeeTt_employee{t_person:!fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}>,hired_date:!fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}>,monthly_salary:f32}> loc(#loc4) fir.type_info @_QFTt_pair noinit nodestroy nofinal : !fir.type<_QFTt_pair{i:i64,x:f64}> loc(#loc8) + fir.type_info @_QMtest_1Tt1 noinit nodestroy nofinal : !fir.type<_QMtest_1Tt1{name:!fir.char<1,20>,height:f32}> loc(#loc11) + fir.type_info @_QMtest_1Tsometype nofinal : !fir.type<_QMtest_1Tsometype{m_array:!fir.array<3xi32>,m_vt1:!fir.box,height:f32}>>>,v2:i32,m_alloc:!fir.box>>,v3:i32,m_first:!fir.box>>,v4:i32,m_p1:!fir.box>,v5:i32,m_p2:!fir.box>,v6:i32,m_p3:!fir.box>>,v7:i32}> loc(#loc12) func.func @_QQmain() attributes {fir.bindc_name = "test"} { %1 = fir.alloca !fir.type<_QFTt_pair{i:i64,x:f64}> {bindc_name = "pair", uniq_name = "_QFEpair"} %2 = fircg.ext_declare %1 {uniq_name = "_QFEpair"} : (!fir.ref>) -> !fir.ref> loc(#loc9) @@ -34,6 +40,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d #loc8 = loc("derived1.f90":85:3) #loc9 = loc("derived1.f90":77:3) #loc10 = loc("derived1.f90":75:3) +#loc11 = loc("derived1.f90":95:3) +#loc12 = loc("derived1.f90":105:3) // CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type @@ -47,27 +55,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}name = "m_employee"{{.*}}> // CHECK-DAG: #[[MOD1:.*]] = #llvm.di_module<{{.*}}name = "t1"{{.*}}> // CHECK-DAG: #[[ELMA1:.*]] = #llvm.di_derived_type -// CHECK-DAG: #[[ADDR:.*]] = #llvm.di_composite_type +// 
CHECK-DAG: #[[ADDR:.*]] = #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_address"{{.*}}line = 24, scope = #[[MOD]], sizeInBits = 32, elements = #[[ELMA1]]> // CHECK-DAG: #[[ELMD1:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELMD2:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELMD3:.*]] = #llvm.di_derived_type -// CHECK-DAG: #[[DATE:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[DATE:.*]] = #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_date", file = #di_file, line = 17, scope = #[[MOD]], sizeInBits = 96, elements = #[[ELMD1]], #[[ELMD2]], #[[ELMD3]]> // CHECK-DAG: #[[ELMP1:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELMP2:.*]] = #llvm.di_derived_type -// CHECK-DAG: #[[PERS:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[PERS:.*]] = #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_person"{{.*}}line = 35, scope = #[[MOD]], sizeInBits = 192, elements = #[[ELMP1]], #[[ELMP2]]> // CHECK-DAG: #[[ELME1:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELME2:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELME3:.*]] = #llvm.di_derived_type -// CHECK-DAG: #[[EMP:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[EMP:.*]] = #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_employee"{{.*}}line = 46, scope = #[[MOD]], sizeInBits = 320, elements = #[[ELME1]], #[[ELME2]], #[[ELME3]]> // CHECK-DAG: #[[ELM1:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELM2:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELM3:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELM4:.*]] = #llvm.di_derived_type // CHECK-DAG: #[[ELM5:.*]] = #llvm.di_derived_type -// CHECK-DAG: #llvm.di_composite_type +// CHECK-DAG: #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_t1"{{.*}}, line = 70, scope = #[[MOD1]], sizeInBits = 512, elements = #[[ELM1]], #[[ELM2]], #[[ELM3]], #[[ELM4]], #[[ELM5]]> // CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram // CHECK-DAG: #[[ELML1:.*]] = #llvm.di_derived_type // CHECK-DAG: 
#[[ELML2:.*]] = #llvm.di_derived_type -// CHECK-DAG: #llvm.di_composite_type +// CHECK-DAG: #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "t_pair"{{.*}}line = 85, scope = #di_subprogram, sizeInBits = 128, elements = #[[ELML1]], #[[ELML2]]> + +// CHECK-DAG: #[[E1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E3:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E4:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E5:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E6:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E7:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E8:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E9:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E10:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E11:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E12:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[E13:.*]] = #llvm.di_derived_type +// CHECK-DAG: #llvm.di_composite_type<{{.*}}tag = DW_TAG_structure_type, name = "sometype"{{.*}}sizeInBits = 2144, elements = #[[E1]], #[[E2]], #[[E3]], #[[E4]], #[[E5]], #[[E6]], #[[E7]], #[[E8]], #[[E9]], #[[E10]], #[[E11]], #[[E12]], #[[E13]]> diff --git a/flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp b/flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp index 58a151447d5b4..8bc1e87814a98 100644 --- a/flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp +++ b/flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp @@ -50,4 +50,4 @@ TEST_F(RuntimeCallTest, genGetPID) { mlir::Value result = fir::runtime::genGetPID(*firBuilder, loc); checkCallOp(result.getDefiningOp(), "_FortranAGetPID", /*nbArgs=*/0, /*addLocArgs=*/false); -} \ No newline at end of file +} diff --git a/flang/unittests/Optimizer/Builder/Runtime/IntrinsicsTest.cpp b/flang/unittests/Optimizer/Builder/Runtime/IntrinsicsTest.cpp new file mode 100644 index 0000000000000..1440a5fd01c2b --- /dev/null +++ b/flang/unittests/Optimizer/Builder/Runtime/IntrinsicsTest.cpp @@ -0,0 +1,17 
@@ +#include "flang/Optimizer/Builder/Runtime/Intrinsics.h" +#include "RuntimeCallTestBase.h" +#include "gtest/gtest.h" + +TEST_F(RuntimeCallTest, genGetGID) { + mlir::Location loc = firBuilder->getUnknownLoc(); + mlir::Value result = fir::runtime::genGetGID(*firBuilder, loc); + checkCallOp(result.getDefiningOp(), "_FortranAGetGID", /*nbArgs=*/0, + /*addLocArgs=*/false); +} + +TEST_F(RuntimeCallTest, genGetUID) { + mlir::Location loc = firBuilder->getUnknownLoc(); + mlir::Value result = fir::runtime::genGetUID(*firBuilder, loc); + checkCallOp(result.getDefiningOp(), "_FortranAGetUID", /*nbArgs=*/0, + /*addLocArgs=*/false); +} diff --git a/flang/unittests/Optimizer/CMakeLists.txt b/flang/unittests/Optimizer/CMakeLists.txt index 7299e3ee0529a..c58fb226a175c 100644 --- a/flang/unittests/Optimizer/CMakeLists.txt +++ b/flang/unittests/Optimizer/CMakeLists.txt @@ -25,6 +25,7 @@ add_flang_unittest(FlangOptimizerTests Builder/Runtime/CommandTest.cpp Builder/Runtime/CharacterTest.cpp Builder/Runtime/DerivedTest.cpp + Builder/Runtime/IntrinsicsTest.cpp Builder/Runtime/NumericTest.cpp Builder/Runtime/RaggedTest.cpp Builder/Runtime/ReductionTest.cpp diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp index 9ed9d99a60724..ca13414519d4c 100644 --- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp @@ -281,6 +281,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, // Initialize the packet header and set the doorbell signal to begin execution // by the HSA runtime. 
uint16_t header = + 1u << HSA_PACKET_HEADER_BARRIER | (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); @@ -540,11 +541,11 @@ int load(int argc, const char **argv, const char **envp, void *image, } } - // Obtain a queue with the minimum (power of two) size, used to send commands + // Obtain a queue with the maximum (power of two) size, used to send commands // to the HSA runtime and launch execution on the device. uint64_t queue_size; if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size)) + dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size)) handle_error(err); hsa_queue_t *queue = nullptr; if (hsa_status_t err = diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 602b266f9dacb..6951c5ae147df 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -42,8 +42,19 @@ static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer), static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT, "Incorrect maximum port count"); +namespace { +struct TempStorage { + char *alloc(size_t size) { + storage.emplace_back(std::make_unique(size)); + return storage.back().get(); + } + + std::vector> storage; +}; +} // namespace + template -void handle_printf(rpc::Server::Port &port) { +static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) { FILE *files[lane_size] = {nullptr}; // Get the appropriate output stream to use. if (port.get_opcode() == RPC_PRINTF_TO_STREAM || @@ -65,7 +76,7 @@ void handle_printf(rpc::Server::Port &port) { // Recieve the format string and arguments from the client. 
port.recv_n(format, format_sizes, - [&](uint64_t size) { return new char[size]; }); + [&](uint64_t size) { return temp_storage.alloc(size); }); // Parse the format string to get the expected size of the buffer. for (uint32_t lane = 0; lane < lane_size; ++lane) { @@ -88,7 +99,8 @@ void handle_printf(rpc::Server::Port &port) { port.send([&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = args_sizes[id]; }); - port.recv_n(args, args_sizes, [&](uint64_t size) { return new char[size]; }); + port.recv_n(args, args_sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); // Identify any arguments that are actually pointers to strings on the client. // Additionally we want to determine how much buffer space we need to print. @@ -137,7 +149,8 @@ void handle_printf(rpc::Server::Port &port) { }); uint64_t str_sizes[lane_size] = {0}; void *strs[lane_size] = {nullptr}; - port.recv_n(strs, str_sizes, [](uint64_t size) { return new char[size]; }); + port.recv_n(strs, str_sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); for (uint32_t lane = 0; lane < lane_size; ++lane) { if (!strs[lane]) continue; @@ -149,13 +162,12 @@ void handle_printf(rpc::Server::Port &port) { // Perform the final formatting and printing using the LLVM C library printf. 
int results[lane_size] = {0}; - std::vector to_be_deleted; for (uint32_t lane = 0; lane < lane_size; ++lane) { if (!format[lane]) continue; - std::unique_ptr buffer(new char[buffer_size[lane]]); - WriteBuffer wb(buffer.get(), buffer_size[lane]); + char *buffer = temp_storage.alloc(buffer_size[lane]); + WriteBuffer wb(buffer, buffer_size[lane]); Writer writer(&wb); internal::StructArgList printf_args(args[lane], args_sizes[lane]); @@ -173,7 +185,6 @@ void handle_printf(rpc::Server::Port &port) { if (cur_section.has_conv && cur_section.conv_name == 's') { if (!copied_strs[lane].empty()) { cur_section.conv_val_ptr = copied_strs[lane].back(); - to_be_deleted.push_back(copied_strs[lane].back()); copied_strs[lane].pop_back(); } else { cur_section.conv_val_ptr = nullptr; @@ -188,8 +199,7 @@ void handle_printf(rpc::Server::Port &port) { } } - results[lane] = - fwrite(buffer.get(), 1, writer.get_chars_written(), files[lane]); + results[lane] = fwrite(buffer, 1, writer.get_chars_written(), files[lane]); if (results[lane] != writer.get_chars_written() || ret == -1) results[lane] = -1; } @@ -199,11 +209,7 @@ void handle_printf(rpc::Server::Port &port) { port.send([&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = static_cast(results[id]); buffer->data[1] = reinterpret_cast(nullptr); - delete[] reinterpret_cast(format[id]); - delete[] reinterpret_cast(args[id]); }); - for (void *ptr : to_be_deleted) - delete[] reinterpret_cast(ptr); } template @@ -216,6 +222,8 @@ rpc_status_t handle_server_impl( if (!port) return RPC_STATUS_SUCCESS; + TempStorage temp_storage; + switch (port->get_opcode()) { case RPC_WRITE_TO_STREAM: case RPC_WRITE_TO_STDERR: @@ -234,7 +242,8 @@ rpc_status_t handle_server_impl( std::fill(files, files + lane_size, stdout); } - port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; }); + port->recv_n(strs, sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); port->send([&](rpc::Buffer *buffer, uint32_t id) { 
flockfile(files[id]); buffer->data[0] = fwrite_unlocked(strs[id], 1, sizes[id], files[id]); @@ -242,7 +251,6 @@ rpc_status_t handle_server_impl( buffer->data[0] == sizes[id]) buffer->data[0] += fwrite_unlocked("\n", 1, 1, files[id]); funlockfile(files[id]); - delete[] reinterpret_cast(strs[id]); }); break; } @@ -250,13 +258,12 @@ rpc_status_t handle_server_impl( uint64_t sizes[lane_size] = {0}; void *data[lane_size] = {nullptr}; port->recv([&](rpc::Buffer *buffer, uint32_t id) { - data[id] = new char[buffer->data[0]]; + data[id] = temp_storage.alloc(buffer->data[0]); sizes[id] = fread(data[id], 1, buffer->data[0], file::to_stream(buffer->data[1])); }); port->send_n(data, sizes); port->send([&](rpc::Buffer *buffer, uint32_t id) { - delete[] reinterpret_cast(data[id]); std::memcpy(buffer->data, &sizes[id], sizeof(uint64_t)); }); break; @@ -265,27 +272,24 @@ rpc_status_t handle_server_impl( uint64_t sizes[lane_size] = {0}; void *data[lane_size] = {nullptr}; port->recv([&](rpc::Buffer *buffer, uint32_t id) { - data[id] = new char[buffer->data[0]]; + data[id] = temp_storage.alloc(buffer->data[0]); const char *str = fgets(reinterpret_cast(data[id]), buffer->data[0], file::to_stream(buffer->data[1])); sizes[id] = !str ? 
0 : std::strlen(str) + 1; }); port->send_n(data, sizes); - for (uint32_t id = 0; id < lane_size; ++id) - if (data[id]) - delete[] reinterpret_cast(data[id]); break; } case RPC_OPEN_FILE: { uint64_t sizes[lane_size] = {0}; void *paths[lane_size] = {nullptr}; - port->recv_n(paths, sizes, [&](uint64_t size) { return new char[size]; }); + port->recv_n(paths, sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) { FILE *file = fopen(reinterpret_cast(paths[id]), reinterpret_cast(buffer->data)); buffer->data[0] = reinterpret_cast(file); - delete[] reinterpret_cast(paths[id]); }); break; } @@ -316,13 +320,12 @@ rpc_status_t handle_server_impl( case RPC_HOST_CALL: { uint64_t sizes[lane_size] = {0}; void *args[lane_size] = {nullptr}; - port->recv_n(args, sizes, [&](uint64_t size) { return new char[size]; }); + port->recv_n(args, sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); port->recv([&](rpc::Buffer *buffer, uint32_t id) { reinterpret_cast(buffer->data[0])(args[id]); }); - port->send([&](rpc::Buffer *, uint32_t id) { - delete[] reinterpret_cast(args[id]); - }); + port->send([&](rpc::Buffer *, uint32_t id) {}); break; } case RPC_FEOF: { @@ -373,23 +376,23 @@ rpc_status_t handle_server_impl( case RPC_PRINTF_TO_STREAM_PACKED: case RPC_PRINTF_TO_STDOUT_PACKED: case RPC_PRINTF_TO_STDERR_PACKED: { - handle_printf(*port); + handle_printf(*port, temp_storage); break; } case RPC_PRINTF_TO_STREAM: case RPC_PRINTF_TO_STDOUT: case RPC_PRINTF_TO_STDERR: { - handle_printf(*port); + handle_printf(*port, temp_storage); break; } case RPC_REMOVE: { uint64_t sizes[lane_size] = {0}; void *args[lane_size] = {nullptr}; - port->recv_n(args, sizes, [&](uint64_t size) { return new char[size]; }); + port->recv_n(args, sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); port->send([&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = static_cast( remove(reinterpret_cast(args[id]))); - 
delete[] reinterpret_cast(args[id]); }); break; } @@ -399,26 +402,24 @@ rpc_status_t handle_server_impl( void *oldpath[lane_size] = {nullptr}; void *newpath[lane_size] = {nullptr}; port->recv_n(oldpath, oldsizes, - [&](uint64_t size) { return new char[size]; }); + [&](uint64_t size) { return temp_storage.alloc(size); }); port->recv_n(newpath, newsizes, - [&](uint64_t size) { return new char[size]; }); + [&](uint64_t size) { return temp_storage.alloc(size); }); port->send([&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = static_cast( rename(reinterpret_cast(oldpath[id]), reinterpret_cast(newpath[id]))); - delete[] reinterpret_cast(oldpath[id]); - delete[] reinterpret_cast(newpath[id]); }); break; } case RPC_SYSTEM: { uint64_t sizes[lane_size] = {0}; void *args[lane_size] = {nullptr}; - port->recv_n(args, sizes, [&](uint64_t size) { return new char[size]; }); + port->recv_n(args, sizes, + [&](uint64_t size) { return temp_storage.alloc(size); }); port->send([&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = static_cast( system(reinterpret_cast(args[id]))); - delete[] reinterpret_cast(args[id]); }); break; } diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 1bf7eb2ca7ed7..260e4d433a1d4 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -221,8 +221,10 @@ if( ENABLE_RUNTIME_SUBNORMAL ) TARGET ${file} INPUTS ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/${file}.ll ) - install( FILES $ ARCHIVE - DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" ) + install( + FILES $ + DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" + ) endforeach() endif() @@ -426,9 +428,9 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) add_custom_target( ${builtins_opt_lib_tgt} ALL DEPENDS ${builtins_opt_lib_tgt}.bc ) - set_target_properties( ${builtins_opt_lib_tgt} - PROPERTIES TARGET_FILE ${builtins_opt_lib_tgt}.bc - FOLDER "libclc/Device IR/Opt" + set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES + TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${builtins_opt_lib_tgt}.bc + 
FOLDER "libclc/Device IR/Opt" ) set( builtins_opt_lib $ ) diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 839815d8cc6ff..f2032660ba99b 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -113,7 +113,7 @@ function(link_bc) add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc ) set_target_properties( ${ARG_TARGET} PROPERTIES - TARGET_FILE ${ARG_TARGET}.bc + TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${ARG_TARGET}.bc FOLDER "libclc/Device IR/Linking" ) endfunction() diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index bde203ea12f14..a62c4992020a0 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -50,7 +50,7 @@ "`LWG4013 `__","``lazy_split_view::outer-iterator::value_type`` should not provide default constructor","2024-03 (Tokyo)","","","" "`LWG4016 `__","container-insertable checks do not match what container-inserter does","2024-03 (Tokyo)","","","" "`LWG4023 `__","Preconditions of ``std::basic_streambuf::setg/setp``","2024-03 (Tokyo)","|Complete|","19.0","" -"`LWG4025 `__","Move assignment operator of ``std::expected`` should not be conditionally deleted","2024-03 (Tokyo)","","","" +"`LWG4025 `__","Move assignment operator of ``std::expected`` should not be conditionally deleted","2024-03 (Tokyo)","|Complete|","20.0","" "`LWG4030 `__","Clarify whether arithmetic expressions in ``[numeric.sat.func]`` are mathematical or C++","2024-03 (Tokyo)","|Nothing To Do|","","" "`LWG4031 `__","``bad_expected_access`` member functions should be ``noexcept``","2024-03 (Tokyo)","|Complete|","16.0","" "`LWG4035 `__","``single_view`` should provide ``empty``","2024-03 (Tokyo)","|Complete|","19.0","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 8a63280053340..0be6c1ae59182 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -688,6 +688,7 @@ 
set(files __ranges/views.h __ranges/zip_view.h __split_buffer + __std_clang_module __std_mbstate_t.h __stop_token/atomic_unique_lock.h __stop_token/intrusive_list_view.h diff --git a/libcxx/include/__config b/libcxx/include/__config index f90c966aad695..0d71264611ff6 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -649,6 +649,10 @@ typedef __char32_t char32_t; __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) # define _LIBCPP_HAS_NO_C11_ALIGNED_ALLOC # endif +# if (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) +# define _LIBCPP_HAS_NO_C11_ALIGNED_ALLOC +# endif # elif defined(__ANDROID__) && __ANDROID_API__ < 28 // Android only provides aligned_alloc when targeting API 28 or higher. # define _LIBCPP_HAS_NO_C11_ALIGNED_ALLOC diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index 6b39e20d858be..b10f29590a2c9 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -141,7 +141,9 @@ # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150000) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 180000) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 180000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 110000) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 110000) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 90000) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 240000) # define _LIBCPP_INTRODUCED_IN_LLVM_18 0 # else 
# define _LIBCPP_INTRODUCED_IN_LLVM_18 1 @@ -150,13 +152,17 @@ __attribute__((availability(macos, strict, introduced = 15.0))) \ __attribute__((availability(ios, strict, introduced = 18.0))) \ __attribute__((availability(tvos, strict, introduced = 18.0))) \ - __attribute__((availability(watchos, strict, introduced = 11.0))) + __attribute__((availability(watchos, strict, introduced = 11.0))) \ + __attribute__((availability(bridgeos, strict, introduced = 9.0))) \ + __attribute__((availability(driverkit, strict, introduced = 24.0))) // LLVM 17 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140400) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170400) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170400) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100400) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100400) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 80400) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 230400) # define _LIBCPP_INTRODUCED_IN_LLVM_17 0 # else # define _LIBCPP_INTRODUCED_IN_LLVM_17 1 @@ -165,13 +171,17 @@ __attribute__((availability(macos, strict, introduced = 14.4))) \ __attribute__((availability(ios, strict, introduced = 17.4))) \ __attribute__((availability(tvos, strict, introduced = 17.4))) \ - __attribute__((availability(watchos, strict, introduced = 10.4))) + __attribute__((availability(watchos, strict, introduced = 10.4))) \ + __attribute__((availability(bridgeos, strict, introduced = 8.4))) \ + __attribute__((availability(driverkit, strict, introduced = 23.4))) // LLVM 16 # if 
(defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140000) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170000) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 80000) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 230000) # define _LIBCPP_INTRODUCED_IN_LLVM_16 0 # else # define _LIBCPP_INTRODUCED_IN_LLVM_16 1 @@ -180,13 +190,17 @@ __attribute__((availability(macos, strict, introduced = 14.0))) \ __attribute__((availability(ios, strict, introduced = 17.0))) \ __attribute__((availability(tvos, strict, introduced = 17.0))) \ - __attribute__((availability(watchos, strict, introduced = 10.0))) + __attribute__((availability(watchos, strict, introduced = 10.0))) \ + __attribute__((availability(bridgeos, strict, introduced = 8.0))) \ + __attribute__((availability(driverkit, strict, introduced = 23.0))) // LLVM 15 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130400) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160500) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160500) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && 
__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 70500) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 220400) # define _LIBCPP_INTRODUCED_IN_LLVM_15 0 # else # define _LIBCPP_INTRODUCED_IN_LLVM_15 1 @@ -195,7 +209,9 @@ __attribute__((availability(macos, strict, introduced = 13.4))) \ __attribute__((availability(ios, strict, introduced = 16.5))) \ __attribute__((availability(tvos, strict, introduced = 16.5))) \ - __attribute__((availability(watchos, strict, introduced = 9.5))) + __attribute__((availability(watchos, strict, introduced = 9.5))) \ + __attribute__((availability(bridgeos, strict, introduced = 7.5))) \ + __attribute__((availability(driverkit, strict, introduced = 22.4))) // LLVM 14 # define _LIBCPP_INTRODUCED_IN_LLVM_14 _LIBCPP_INTRODUCED_IN_LLVM_15 @@ -205,7 +221,9 @@ # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130000) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160000) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90000) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90000) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 70000) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 220000) # define _LIBCPP_INTRODUCED_IN_LLVM_13 0 # else # define _LIBCPP_INTRODUCED_IN_LLVM_13 1 @@ -214,13 +232,17 @@ __attribute__((availability(macos, strict, introduced = 13.0))) \ 
__attribute__((availability(ios, strict, introduced = 16.0))) \ __attribute__((availability(tvos, strict, introduced = 16.0))) \ - __attribute__((availability(watchos, strict, introduced = 9.0))) + __attribute__((availability(watchos, strict, introduced = 9.0))) \ + __attribute__((availability(bridgeos, strict, introduced = 7.0))) \ + __attribute__((availability(driverkit, strict, introduced = 22.0))) // LLVM 12 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120300) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150300) || \ (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150300) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80300) + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80300) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 60000) || \ + (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 210300) # define _LIBCPP_INTRODUCED_IN_LLVM_12 0 # else # define _LIBCPP_INTRODUCED_IN_LLVM_12 1 @@ -229,7 +251,9 @@ __attribute__((availability(macos, strict, introduced = 12.3))) \ __attribute__((availability(ios, strict, introduced = 15.3))) \ __attribute__((availability(tvos, strict, introduced = 15.3))) \ - __attribute__((availability(watchos, strict, introduced = 8.3))) + __attribute__((availability(watchos, strict, introduced = 8.3))) \ + __attribute__((availability(bridgeos, strict, introduced = 6.0))) \ + __attribute__((availability(driverkit, strict, introduced = 21.3))) // LLVM 11 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) || \ diff --git 
a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index f4ad455a19ea5..e04e17d1ebe69 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -1493,8 +1493,6 @@ class expected<_Tp, _Err> : private __expected_void_base<_Err> { return *this; } - _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(expected&&) = delete; - _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(expected&& __rhs) noexcept(is_nothrow_move_assignable_v<_Err> && is_nothrow_move_constructible_v<_Err>) requires(is_move_assignable_v<_Err> && is_move_constructible_v<_Err>) diff --git a/libcxx/include/__format/formatter_integral.h b/libcxx/include/__format/formatter_integral.h index 0c04cce855a08..beed3ab8d93df 100644 --- a/libcxx/include/__format/formatter_integral.h +++ b/libcxx/include/__format/formatter_integral.h @@ -27,7 +27,6 @@ #include <__type_traits/make_unsigned.h> #include <__utility/unreachable.h> #include -#include #include #include #include diff --git a/libcxx/include/__iterator/next.h b/libcxx/include/__iterator/next.h index fb6c8ea6d7550..1f68a5bec8f39 100644 --- a/libcxx/include/__iterator/next.h +++ b/libcxx/include/__iterator/next.h @@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter next(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = 1) { // Calling `advance` with a negative value on a non-bidirectional iterator is a no-op in the current implementation. // Note that this check duplicates the similar check in `std::advance`. 
@@ -43,25 +43,26 @@ next(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = namespace ranges { struct __next { template - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const { ++__x; return __x; } template - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n) const { ranges::advance(__x, __n); return __x; } template _Sp> - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, _Sp __bound_sentinel) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, _Sp __bound_sentinel) const { ranges::advance(__x, __bound_sentinel); return __x; } template _Sp> - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n, _Sp __bound_sentinel) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip + operator()(_Ip __x, iter_difference_t<_Ip> __n, _Sp __bound_sentinel) const { ranges::advance(__x, __n, __bound_sentinel); return __x; } diff --git a/libcxx/include/__iterator/prev.h b/libcxx/include/__iterator/prev.h index e950d8dc41471..7e97203836eb9 100644 --- a/libcxx/include/__iterator/prev.h +++ b/libcxx/include/__iterator/prev.h @@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = 1) { // Calling `advance` with a negative value on a non-bidirectional iterator is a no-op in the current implementation. // Note that this check duplicates the similar check in `std::advance`. 
@@ -42,19 +42,20 @@ prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = namespace ranges { struct __prev { template - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x) const { --__x; return __x; } template - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n) const { ranges::advance(__x, -__n); return __x; } template - _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __x, iter_difference_t<_Ip> __n, _Ip __bound_iter) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Ip + operator()(_Ip __x, iter_difference_t<_Ip> __n, _Ip __bound_iter) const { ranges::advance(__x, -__n, __bound_iter); return __x; } diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module new file mode 100644 index 0000000000000..a21ed26addfe8 --- /dev/null +++ b/libcxx/include/__std_clang_module @@ -0,0 +1,193 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// WARNING, this entire header is generated by +// utils/generate_std_clang_module_header.py +// DO NOT MODIFY! + +// This header should not be directly included, it's exclusively to import all +// of the libc++ public clang modules for the `std` clang module to export. In +// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` +// in Swift to expose all of the libc++ interfaces. 
This is generally not +// recommended, however there are some clients that need to import all of libc++ +// without knowing what "all" is. +#if !__building_module(std) +# error "Do not include this header directly, include individual headers instead" +#endif + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#if 
!defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index aa05bde939f6c..f8a44c4ab4217 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1,2235 +1,2125 @@ -// This module contains headers related to the configuration of the library. These headers -// are free of any dependency on the rest of libc++. -module std_config [system] { - textual header "__config" - textual header "__configuration/abi.h" - textual header "__configuration/availability.h" - textual header "__configuration/compiler.h" - textual header "__configuration/language.h" - textual header "__configuration/platform.h" - textual header "version" +// Main C++ standard library interfaces +module std_algorithm [system] { + header "algorithm" + export * } - -module std_core [system] { - module cstddef { - module byte { header "__cstddef/byte.h" } - module max_align_t { header "__cstddef/max_align_t.h" } - module nullptr_t { header "__cstddef/nullptr_t.h" } - module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } - module size_t { header "__cstddef/size_t.h" } - } - - module cstdint { - header "cstdint" - export * - } - - module fwd { - module byte { header "__fwd/byte.h" } - module functional { header "__fwd/functional.h" } - module pair { header "__fwd/pair.h" } - module tuple { header "__fwd/tuple.h" } - } - - module limits { - header "limits" - export * - } - - module 
math { - module abs { header "__math/abs.h" } - module copysign { header "__math/copysign.h" } - module error_functions { header "__math/error_functions.h" } - module exponential_functions { header "__math/exponential_functions.h" } - module fdim { header "__math/fdim.h" } - module fma { header "__math/fma.h" } - module gamma { header "__math/gamma.h" } - module hyperbolic_functions { header "__math/hyperbolic_functions.h" } - module hypot { header "__math/hypot.h" } - module inverse_hyperbolic_functions { header "__math/inverse_hyperbolic_functions.h" } - module inverse_trigonometric_functions { header "__math/inverse_trigonometric_functions.h" } - module logarithms { header "__math/logarithms.h" } - module min_max { header "__math/min_max.h" } - module modulo { header "__math/modulo.h" } - module remainder { header "__math/remainder.h" } - module roots { header "__math/roots.h" } - module rounding_functions { header "__math/rounding_functions.h" } - module special_functions { header "__math/special_functions.h" } - module traits { header "__math/traits.h" } - module trigonometric_functions { header "__math/trigonometric_functions.h" } - } - - module type_traits { - module add_const { header "__type_traits/add_const.h" } - module add_cv { header "__type_traits/add_cv.h" } - module add_lvalue_reference { header "__type_traits/add_lvalue_reference.h" } - module add_pointer { header "__type_traits/add_pointer.h" } - module add_rvalue_reference { header "__type_traits/add_rvalue_reference.h" } - module add_volatile { header "__type_traits/add_volatile.h" } - module aligned_storage { header "__type_traits/aligned_storage.h" } - module aligned_union { header "__type_traits/aligned_union.h" } - module alignment_of { header "__type_traits/alignment_of.h" } - module can_extract_key { header "__type_traits/can_extract_key.h" } - module common_reference { header "__type_traits/common_reference.h" } - module common_type { header "__type_traits/common_type.h" } - module 
conditional { header "__type_traits/conditional.h" } - module conjunction { header "__type_traits/conjunction.h" } - module copy_cv { header "__type_traits/copy_cv.h" } - module copy_cvref { header "__type_traits/copy_cvref.h" } - module datasizeof { header "__type_traits/datasizeof.h" } - module decay { header "__type_traits/decay.h" } - module dependent_type { header "__type_traits/dependent_type.h" } - module desugars_to { header "__type_traits/desugars_to.h" } - module disjunction { header "__type_traits/disjunction.h" } - module enable_if { header "__type_traits/enable_if.h" } - module extent { header "__type_traits/extent.h" } - module has_unique_object_representation { header "__type_traits/has_unique_object_representation.h" } - module has_virtual_destructor { header "__type_traits/has_virtual_destructor.h" } - module integral_constant { header "__type_traits/integral_constant.h" } - module invoke { header "__type_traits/invoke.h" } - module is_abstract { - header "__type_traits/is_abstract.h" - export std_core.type_traits.integral_constant - } - module is_aggregate { - header "__type_traits/is_aggregate.h" - export std_core.type_traits.integral_constant - } - module is_allocator { - header "__type_traits/is_allocator.h" - export std_core.type_traits.integral_constant - } - module is_always_bitcastable { - header "__type_traits/is_always_bitcastable.h" - export std_core.type_traits.integral_constant - } - module is_arithmetic { - header "__type_traits/is_arithmetic.h" - export std_core.type_traits.integral_constant - } - module is_array { - header "__type_traits/is_array.h" - export std_core.type_traits.integral_constant - } - module is_assignable { - header "__type_traits/is_assignable.h" - export std_core.type_traits.integral_constant - } - module is_base_of { - header "__type_traits/is_base_of.h" - export std_core.type_traits.integral_constant - } - module is_bounded_array { - header "__type_traits/is_bounded_array.h" - export 
std_core.type_traits.integral_constant - } - module is_callable { - header "__type_traits/is_callable.h" - export std_core.type_traits.integral_constant - } - module is_char_like_type { - header "__type_traits/is_char_like_type.h" - export std_core.type_traits.integral_constant - } - module is_class { - header "__type_traits/is_class.h" - export std_core.type_traits.integral_constant - } - module is_compound { - header "__type_traits/is_compound.h" - export std_core.type_traits.integral_constant - } - module is_const { - header "__type_traits/is_const.h" - export std_core.type_traits.integral_constant - } - module is_constant_evaluated { - header "__type_traits/is_constant_evaluated.h" - export std_core.type_traits.integral_constant - } - module is_constructible { - header "__type_traits/is_constructible.h" - export std_core.type_traits.integral_constant - } - module is_convertible { - header "__type_traits/is_convertible.h" - export std_core.type_traits.integral_constant - } - module is_core_convertible { - header "__type_traits/is_core_convertible.h" - export std_core.type_traits.integral_constant - } - module is_destructible { - header "__type_traits/is_destructible.h" - export std_core.type_traits.integral_constant - } - module is_empty { - header "__type_traits/is_empty.h" - export std_core.type_traits.integral_constant - } - module is_enum { - header "__type_traits/is_enum.h" - export std_core.type_traits.integral_constant - } - module is_equality_comparable { - header "__type_traits/is_equality_comparable.h" - export std_core.type_traits.integral_constant - } - module is_execution_policy { - header "__type_traits/is_execution_policy.h" - export std_core.type_traits.integral_constant - } - module is_final { - header "__type_traits/is_final.h" - export std_core.type_traits.integral_constant - } - module is_floating_point { - header "__type_traits/is_floating_point.h" - export std_core.type_traits.integral_constant - } - module is_function { - header 
"__type_traits/is_function.h" - export std_core.type_traits.integral_constant - } - module is_fundamental { - header "__type_traits/is_fundamental.h" - export std_core.type_traits.integral_constant - } - module is_implicitly_default_constructible { - header "__type_traits/is_implicitly_default_constructible.h" - export std_core.type_traits.integral_constant - } - module is_integral { - header "__type_traits/is_integral.h" - export std_core.type_traits.integral_constant - } - module is_literal_type { - header "__type_traits/is_literal_type.h" - export std_core.type_traits.integral_constant - } - module is_member_pointer { - header "__type_traits/is_member_pointer.h" - export std_core.type_traits.integral_constant - } - module is_nothrow_assignable { - header "__type_traits/is_nothrow_assignable.h" - export std_core.type_traits.integral_constant - } - module is_nothrow_constructible { - header "__type_traits/is_nothrow_constructible.h" - export std_core.type_traits.integral_constant - } - module is_nothrow_convertible { - header "__type_traits/is_nothrow_convertible.h" - export std_core.type_traits.integral_constant - } - module is_nothrow_destructible { - header "__type_traits/is_nothrow_destructible.h" - export std_core.type_traits.integral_constant - } - module is_null_pointer { - header "__type_traits/is_null_pointer.h" - export std_core.type_traits.integral_constant - } - module is_object { - header "__type_traits/is_object.h" - export std_core.type_traits.integral_constant - } - module is_pod { - header "__type_traits/is_pod.h" - export std_core.type_traits.integral_constant - } - module is_pointer { - header "__type_traits/is_pointer.h" - export std_core.type_traits.integral_constant - } - module is_polymorphic { - header "__type_traits/is_polymorphic.h" - export std_core.type_traits.integral_constant - } - module is_primary_template { - header "__type_traits/is_primary_template.h" - export std_core.type_traits.integral_constant - } - module 
is_reference_wrapper { - header "__type_traits/is_reference_wrapper.h" - export std_core.type_traits.integral_constant - } - module is_reference { - header "__type_traits/is_reference.h" - export std_core.type_traits.integral_constant - } - module is_referenceable { - header "__type_traits/is_referenceable.h" - export std_core.type_traits.integral_constant - } - module is_same { - header "__type_traits/is_same.h" - export std_core.type_traits.integral_constant - } - module is_scalar { - header "__type_traits/is_scalar.h" - export std_core.type_traits.integral_constant - } - module is_signed_integer { - header "__type_traits/is_signed_integer.h" - export std_core.type_traits.integral_constant - } - module is_signed { - header "__type_traits/is_signed.h" - export std_core.type_traits.integral_constant - } - module is_specialization { - header "__type_traits/is_specialization.h" - export std_core.type_traits.integral_constant - } - module is_standard_layout { - header "__type_traits/is_standard_layout.h" - export std_core.type_traits.integral_constant - } - module is_swappable { - header "__type_traits/is_swappable.h" - export std_core.type_traits.integral_constant - } - module is_trivial { - header "__type_traits/is_trivial.h" - export std_core.type_traits.integral_constant - } - module is_trivially_assignable { - header "__type_traits/is_trivially_assignable.h" - export std_core.type_traits.integral_constant - } - module is_trivially_constructible { - header "__type_traits/is_trivially_constructible.h" - export std_core.type_traits.integral_constant - } - module is_trivially_copyable { - header "__type_traits/is_trivially_copyable.h" - export std_core.type_traits.integral_constant - } - module is_trivially_destructible { - header "__type_traits/is_trivially_destructible.h" - export std_core.type_traits.integral_constant - } - module is_trivially_lexicographically_comparable { - header "__type_traits/is_trivially_lexicographically_comparable.h" - export 
std_core.type_traits.integral_constant - } - module is_trivially_relocatable { - header "__type_traits/is_trivially_relocatable.h" - export std_core.type_traits.integral_constant - } - module is_unbounded_array { - header "__type_traits/is_unbounded_array.h" - export std_core.type_traits.integral_constant - } - module is_union { - header "__type_traits/is_union.h" - export std_core.type_traits.integral_constant - } - module is_unsigned_integer { - header "__type_traits/is_unsigned_integer.h" - export std_core.type_traits.integral_constant - } - module is_unsigned { - header "__type_traits/is_unsigned.h" - export std_core.type_traits.integral_constant - } - module is_valid_expansion { - header "__type_traits/is_valid_expansion.h" - export std_core.type_traits.integral_constant - } - module is_void { - header "__type_traits/is_void.h" - export std_core.type_traits.integral_constant - } - module is_volatile { - header "__type_traits/is_volatile.h" - export std_core.type_traits.integral_constant - } - module lazy { header "__type_traits/lazy.h" } - module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" } - module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" } - module make_signed { header "__type_traits/make_signed.h" } - module make_unsigned { header "__type_traits/make_unsigned.h" } - module maybe_const { header "__type_traits/maybe_const.h" } - module nat { header "__type_traits/nat.h" } - module negation { header "__type_traits/negation.h" } - module promote { header "__type_traits/promote.h" } - module rank { header "__type_traits/rank.h" } - module remove_all_extents { header "__type_traits/remove_all_extents.h" } - module remove_const_ref { header "__type_traits/remove_const_ref.h" } - module remove_const { header "__type_traits/remove_const.h" } - module remove_cv { header "__type_traits/remove_cv.h" } - module remove_cvref { header "__type_traits/remove_cvref.h" } - module remove_extent { header 
"__type_traits/remove_extent.h" } - module remove_pointer { header "__type_traits/remove_pointer.h" } - module remove_reference { header "__type_traits/remove_reference.h" } - module remove_volatile { header "__type_traits/remove_volatile.h" } - module result_of { header "__type_traits/result_of.h" } - module strip_signature { header "__type_traits/strip_signature.h" } - module type_identity { header "__type_traits/type_identity.h" } - module type_list { header "__type_traits/type_list.h" } - module underlying_type { header "__type_traits/underlying_type.h" } - module unwrap_ref { header "__type_traits/unwrap_ref.h" } - module void_t { header "__type_traits/void_t.h" } - - header "type_traits" - export * - } // module type_traits - - // Only the truly dependency-free parts of __utility are here - module utility_core { - module declval { header "__utility/declval.h" } - module empty { header "__utility/empty.h" } - module forward { header "__utility/forward.h" } - } -} // module std_core - -module std [system] { - module algorithm { - module adjacent_find { header "__algorithm/adjacent_find.h" } - module all_of { header "__algorithm/all_of.h" } - module any_of { header "__algorithm/any_of.h" } - module binary_search { header "__algorithm/binary_search.h" } - module clamp { header "__algorithm/clamp.h" } - module comp_ref_type { header "__algorithm/comp_ref_type.h" } - module comp { header "__algorithm/comp.h" } - module copy_backward { header "__algorithm/copy_backward.h" } - module copy_if { header "__algorithm/copy_if.h" } - module copy_move_common { header "__algorithm/copy_move_common.h" } - module copy_n { header "__algorithm/copy_n.h" } - module copy { header "__algorithm/copy.h" } - module count_if { header "__algorithm/count_if.h" } - module count { header "__algorithm/count.h" } - module equal_range { header "__algorithm/equal_range.h" } - module equal { header "__algorithm/equal.h" } - module fill_n { header "__algorithm/fill_n.h" } - module fill { header 
"__algorithm/fill.h" } - module find_end { header "__algorithm/find_end.h" } - module find_first_of { header "__algorithm/find_first_of.h" } - module find_if_not { header "__algorithm/find_if_not.h" } - module find_if { header "__algorithm/find_if.h" } - module find_segment_if { header "__algorithm/find_segment_if.h" } - module find { header "__algorithm/find.h" } - module for_each_n { header "__algorithm/for_each_n.h" } - module for_each_segment { header "__algorithm/for_each_segment.h" } - module for_each { header "__algorithm/for_each.h" } - module generate_n { header "__algorithm/generate_n.h" } - module generate { header "__algorithm/generate.h" } - module half_positive { header "__algorithm/half_positive.h" } - module in_found_result { header "__algorithm/in_found_result.h" } - module in_fun_result { header "__algorithm/in_fun_result.h" } - module in_in_out_result { header "__algorithm/in_in_out_result.h" } - module in_in_result { header "__algorithm/in_in_result.h" } - module in_out_out_result { header "__algorithm/in_out_out_result.h" } - module in_out_result { header "__algorithm/in_out_result.h" } - module includes { header "__algorithm/includes.h" } - module inplace_merge { header "__algorithm/inplace_merge.h" } - module is_heap_until { header "__algorithm/is_heap_until.h" } - module is_heap { header "__algorithm/is_heap.h" } - module is_partitioned { header "__algorithm/is_partitioned.h" } - module is_permutation { header "__algorithm/is_permutation.h" } - module is_sorted_until { header "__algorithm/is_sorted_until.h" } - module is_sorted { header "__algorithm/is_sorted.h" } - module iter_swap { header "__algorithm/iter_swap.h" } - module iterator_operations { - header "__algorithm/iterator_operations.h" - export std.iterator.advance - export std.iterator.distance - export std.iterator.iter_move - export std.iterator.iter_swap - export std.iterator.next - export std.iterator.prev - } - module lexicographical_compare_three_way { header 
"__algorithm/lexicographical_compare_three_way.h" } - module lexicographical_compare { header "__algorithm/lexicographical_compare.h" } - module lower_bound { header "__algorithm/lower_bound.h" } - module make_heap { header "__algorithm/make_heap.h" } - module make_projected { header "__algorithm/make_projected.h" } - module max_element { header "__algorithm/max_element.h" } - module max { header "__algorithm/max.h" } - module merge { header "__algorithm/merge.h" } - module min_element { header "__algorithm/min_element.h" } - module min_max_result { header "__algorithm/min_max_result.h" } - module min { header "__algorithm/min.h" } - module minmax_element { header "__algorithm/minmax_element.h" } - module minmax { - header "__algorithm/minmax.h" - export std.utility.pair // return type - } - module mismatch { - header "__algorithm/mismatch.h" - export std.utility.pair // return type - } - module move_backward { header "__algorithm/move_backward.h" } - module move { header "__algorithm/move.h" } - module next_permutation { header "__algorithm/next_permutation.h" } - module none_of { header "__algorithm/none_of.h" } - module nth_element { header "__algorithm/nth_element.h" } - module partial_sort_copy { header "__algorithm/partial_sort_copy.h" } - module partial_sort { header "__algorithm/partial_sort.h" } - module partition_copy { header "__algorithm/partition_copy.h" } - module partition_point { header "__algorithm/partition_point.h" } - module partition { header "__algorithm/partition.h" } - module pop_heap { header "__algorithm/pop_heap.h" } - module prev_permutation { header "__algorithm/prev_permutation.h" } - module pstl { header "__algorithm/pstl.h" } - module push_heap { header "__algorithm/push_heap.h" } - module ranges_adjacent_find { header "__algorithm/ranges_adjacent_find.h" } - module ranges_all_of { header "__algorithm/ranges_all_of.h" } - module ranges_any_of { header "__algorithm/ranges_any_of.h" } - module ranges_binary_search { - header 
"__algorithm/ranges_binary_search.h" - export std.functional.ranges_operations - } - module ranges_clamp { - header "__algorithm/ranges_clamp.h" - export std.functional.ranges_operations - } - module ranges_contains_subrange { - header "__algorithm/ranges_contains_subrange.h" - } - module ranges_contains { - header "__algorithm/ranges_contains.h" - } - module ranges_copy_backward { - header "__algorithm/ranges_copy_backward.h" - export std.algorithm.in_out_result - } - module ranges_copy_if { - header "__algorithm/ranges_copy_if.h" - export std.algorithm.in_out_result - } - module ranges_copy_n { - header "__algorithm/ranges_copy_n.h" - export std.algorithm.in_out_result - } - module ranges_copy { - header "__algorithm/ranges_copy.h" - export std.algorithm.in_out_result - } - module ranges_count_if { header "__algorithm/ranges_count_if.h" } - module ranges_count { header "__algorithm/ranges_count.h" } - module ranges_ends_with { header "__algorithm/ranges_ends_with.h" } - module ranges_equal_range { - header "__algorithm/ranges_equal_range.h" - export std.functional.ranges_operations - } - module ranges_equal { - header "__algorithm/ranges_equal.h" - export std.functional.identity - } - module ranges_fill_n { header "__algorithm/ranges_fill_n.h" } - module ranges_fill { header "__algorithm/ranges_fill.h" } - module ranges_find_end { header "__algorithm/ranges_find_end.h" } - module ranges_find_first_of { header "__algorithm/ranges_find_first_of.h" } - module ranges_find_if_not { header "__algorithm/ranges_find_if_not.h" } - module ranges_find_if { header "__algorithm/ranges_find_if.h" } - module ranges_find_last { header "__algorithm/ranges_find_last.h" } - module ranges_find { header "__algorithm/ranges_find.h" } - module ranges_fold { header "__algorithm/ranges_fold.h" } - module ranges_for_each_n { - header "__algorithm/ranges_for_each_n.h" - export std.algorithm.in_fun_result - } - module ranges_for_each { - header "__algorithm/ranges_for_each.h" - export 
std.algorithm.in_fun_result - } - module ranges_generate_n { - header "__algorithm/ranges_generate_n.h" - } - module ranges_generate { - header "__algorithm/ranges_generate.h" - } - module ranges_includes { - header "__algorithm/ranges_includes.h" - export std.functional.ranges_operations - } - module ranges_inplace_merge { - header "__algorithm/ranges_inplace_merge.h" - export std.functional.ranges_operations - } - module ranges_is_heap_until { - header "__algorithm/ranges_is_heap_until.h" - export std.functional.ranges_operations - } - module ranges_is_heap { - header "__algorithm/ranges_is_heap.h" - export std.functional.ranges_operations - } - module ranges_is_partitioned { - header "__algorithm/ranges_is_partitioned.h" - } - module ranges_is_permutation { - header "__algorithm/ranges_is_permutation.h" - } - module ranges_is_sorted_until { - header "__algorithm/ranges_is_sorted_until.h" - export std.functional.ranges_operations - } - module ranges_is_sorted { - header "__algorithm/ranges_is_sorted.h" - export std.functional.ranges_operations - } - module ranges_iterator_concept { - header "__algorithm/ranges_iterator_concept.h" - } - module ranges_lexicographical_compare { - header "__algorithm/ranges_lexicographical_compare.h" - export std.functional.ranges_operations - } - module ranges_lower_bound { - header "__algorithm/ranges_lower_bound.h" - export std.functional.ranges_operations - } - module ranges_make_heap { - header "__algorithm/ranges_make_heap.h" - export std.functional.ranges_operations - } - module ranges_max_element { - header "__algorithm/ranges_max_element.h" - export std.functional.ranges_operations - } - module ranges_max { - header "__algorithm/ranges_max.h" - export std.functional.ranges_operations - } - module ranges_merge { - header "__algorithm/ranges_merge.h" - export std.functional.ranges_operations - export std.algorithm.in_in_out_result - } - module ranges_min_element { - header "__algorithm/ranges_min_element.h" - export 
std.functional.ranges_operations - } - module ranges_min { - header "__algorithm/ranges_min.h" - export std.functional.ranges_operations - } - module ranges_minmax_element { - header "__algorithm/ranges_minmax_element.h" - export std.functional.ranges_operations - export std.algorithm.min_max_result - } - module ranges_minmax { - header "__algorithm/ranges_minmax.h" - export std.functional.ranges_operations - export std.algorithm.min_max_result - } - module ranges_mismatch { - header "__algorithm/ranges_mismatch.h" - export std.algorithm.in_in_result - } - module ranges_move_backward { - header "__algorithm/ranges_move_backward.h" - export std.algorithm.in_out_result - } - module ranges_move { - header "__algorithm/ranges_move.h" - export std.algorithm.in_out_result - } - module ranges_next_permutation { - header "__algorithm/ranges_next_permutation.h" - export std.functional.ranges_operations - export std.algorithm.in_found_result - } - module ranges_none_of { - header "__algorithm/ranges_none_of.h" - } - module ranges_nth_element { - header "__algorithm/ranges_nth_element.h" - export std.functional.ranges_operations - } - module ranges_partial_sort_copy { - header "__algorithm/ranges_partial_sort_copy.h" - export std.functional.ranges_operations - } - module ranges_partial_sort { - header "__algorithm/ranges_partial_sort.h" - export std.functional.ranges_operations - } - module ranges_partition_copy { - header "__algorithm/ranges_partition_copy.h" - export std.algorithm.in_out_out_result - } - module ranges_partition_point { - header "__algorithm/ranges_partition_point.h" - } - module ranges_partition { - header "__algorithm/ranges_partition.h" - } - module ranges_pop_heap { - header "__algorithm/ranges_pop_heap.h" - export std.functional.ranges_operations - } - module ranges_prev_permutation { - header "__algorithm/ranges_prev_permutation.h" - export std.functional.ranges_operations - export std.algorithm.in_found_result - } - module ranges_push_heap { - header 
"__algorithm/ranges_push_heap.h" - export std.functional.ranges_operations - } - module ranges_remove_copy_if { - header "__algorithm/ranges_remove_copy_if.h" - export std.algorithm.in_out_result - } - module ranges_remove_copy { - header "__algorithm/ranges_remove_copy.h" - export std.algorithm.in_out_result - } - module ranges_remove_if { - header "__algorithm/ranges_remove_if.h" - } - module ranges_remove { - header "__algorithm/ranges_remove.h" - } - module ranges_replace_copy_if { - header "__algorithm/ranges_replace_copy_if.h" - export std.algorithm.in_out_result - } - module ranges_replace_copy { - header "__algorithm/ranges_replace_copy.h" - export std.algorithm.in_out_result - } - module ranges_replace_if { - header "__algorithm/ranges_replace_if.h" - } - module ranges_replace { - header "__algorithm/ranges_replace.h" - } - module ranges_reverse_copy { - header "__algorithm/ranges_reverse_copy.h" - export std.algorithm.in_out_result - } - module ranges_reverse { - header "__algorithm/ranges_reverse.h" - } - module ranges_rotate_copy { - header "__algorithm/ranges_rotate_copy.h" - export std.algorithm.in_out_result - } - module ranges_rotate { header "__algorithm/ranges_rotate.h" } - module ranges_sample { header "__algorithm/ranges_sample.h" } - module ranges_search_n { header "__algorithm/ranges_search_n.h" } - module ranges_search { header "__algorithm/ranges_search.h" } - module ranges_set_difference { - header "__algorithm/ranges_set_difference.h" - export std.functional.ranges_operations - export std.algorithm.in_out_result - } - module ranges_set_intersection { - header "__algorithm/ranges_set_intersection.h" - export std.functional.ranges_operations - export std.algorithm.in_in_out_result - } - module ranges_set_symmetric_difference { - header "__algorithm/ranges_set_symmetric_difference.h" - export std.functional.ranges_operations - export std.algorithm.in_in_out_result - } - module ranges_set_union { - header "__algorithm/ranges_set_union.h" - 
export std.functional.ranges_operations - export std.algorithm.in_in_out_result - } - module ranges_shuffle { - header "__algorithm/ranges_shuffle.h" - } - module ranges_sort_heap { - header "__algorithm/ranges_sort_heap.h" - export std.functional.ranges_operations - } - module ranges_sort { - header "__algorithm/ranges_sort.h" - export std.functional.ranges_operations - } - module ranges_stable_partition { - header "__algorithm/ranges_stable_partition.h" - } - module ranges_stable_sort { - header "__algorithm/ranges_stable_sort.h" - export std.functional.ranges_operations - } - module ranges_starts_with { - header "__algorithm/ranges_starts_with.h" - } - module ranges_swap_ranges { - header "__algorithm/ranges_swap_ranges.h" - export std.algorithm.in_in_result - } - module ranges_transform { - header "__algorithm/ranges_transform.h" - export std.algorithm.in_out_result - export std.algorithm.in_in_out_result - } - module ranges_unique_copy { - header "__algorithm/ranges_unique_copy.h" - } - module ranges_unique { - header "__algorithm/ranges_unique.h" - } - module ranges_upper_bound { - header "__algorithm/ranges_upper_bound.h" - export std.functional.ranges_operations - } - module remove_copy_if { header "__algorithm/remove_copy_if.h" } - module remove_copy { header "__algorithm/remove_copy.h" } - module remove_if { header "__algorithm/remove_if.h" } - module remove { header "__algorithm/remove.h" } - module replace_copy_if { header "__algorithm/replace_copy_if.h" } - module replace_copy { header "__algorithm/replace_copy.h" } - module replace_if { header "__algorithm/replace_if.h" } - module replace { header "__algorithm/replace.h" } - module reverse_copy { header "__algorithm/reverse_copy.h" } - module reverse { header "__algorithm/reverse.h" } - module rotate_copy { header "__algorithm/rotate_copy.h" } - module rotate { header "__algorithm/rotate.h" } - module sample { header "__algorithm/sample.h" } - module search_n { header "__algorithm/search_n.h" } - 
module search { header "__algorithm/search.h" } - module set_difference { header "__algorithm/set_difference.h" } - module set_intersection { header "__algorithm/set_intersection.h" } - module set_symmetric_difference { header "__algorithm/set_symmetric_difference.h" } - module set_union { header "__algorithm/set_union.h" } - module shift_left { header "__algorithm/shift_left.h" } - module shift_right { header "__algorithm/shift_right.h" } - module shuffle { header "__algorithm/shuffle.h" } - module sift_down { header "__algorithm/sift_down.h" } - module simd_utils { header "__algorithm/simd_utils.h" } - module sort_heap { header "__algorithm/sort_heap.h" } - module sort { header "__algorithm/sort.h" } - module stable_partition { header "__algorithm/stable_partition.h" } - module stable_sort { header "__algorithm/stable_sort.h" } - module swap_ranges { header "__algorithm/swap_ranges.h" } - module three_way_comp_ref_type { header "__algorithm/three_way_comp_ref_type.h" } - module transform { header "__algorithm/transform.h" } - module uniform_random_bit_generator_adaptor { header "__algorithm/uniform_random_bit_generator_adaptor.h" } - module unique_copy { header "__algorithm/unique_copy.h" } - module unique { header "__algorithm/unique.h" } - module unwrap_iter { header "__algorithm/unwrap_iter.h" } - module unwrap_range { header "__algorithm/unwrap_range.h" } - module upper_bound { header "__algorithm/upper_bound.h" } - - header "algorithm" - export * - } // module algorithm - - module any { - header "any" - export * - } - - module array { - module fwd { header "__fwd/array.h" } - - header "array" - export * - } - - module atomic { - module aliases { header "__atomic/aliases.h" } - module atomic_base { header "__atomic/atomic_base.h" } - module atomic_flag { header "__atomic/atomic_flag.h" } - module atomic_init { header "__atomic/atomic_init.h" } - module atomic_lock_free { header "__atomic/atomic_lock_free.h" } - module atomic_ref { header 
"__atomic/atomic_ref.h" } - module atomic_sync { header "__atomic/atomic_sync.h" } - module atomic { - header "__atomic/atomic.h" - export std.atomic.atomic_base // most of std::atomic methods are defined there - } - module check_memory_order { header "__atomic/check_memory_order.h" } - module contention_t { header "__atomic/contention_t.h" } - module cxx_atomic_impl { header "__atomic/cxx_atomic_impl.h" } - module fence { header "__atomic/fence.h" } - module is_always_lock_free { header "__atomic/is_always_lock_free.h" } - module kill_dependency { header "__atomic/kill_dependency.h" } - module memory_order { header "__atomic/memory_order.h" } - module to_gcc_order { header "__atomic/to_gcc_order.h" } - - header "atomic" - export * - } - - module barrier { - header "barrier" - export * - } - - module bit { - module bit_cast { header "__bit/bit_cast.h" } - module bit_ceil { header "__bit/bit_ceil.h" } - module bit_floor { header "__bit/bit_floor.h" } - module bit_log2 { header "__bit/bit_log2.h" } - module bit_width { header "__bit/bit_width.h" } - module blsr { header "__bit/blsr.h" } - module byteswap { header "__bit/byteswap.h" } - module countl { header "__bit/countl.h" } - module countr { header "__bit/countr.h" } - module endian { header "__bit/endian.h" } - module has_single_bit { header "__bit/has_single_bit.h" } - module invert_if { header "__bit/invert_if.h" } - module popcount { header "__bit/popcount.h" } - module rotate { header "__bit/rotate.h" } - - header "bit" - export * - } - - module bitset { - header "bitset" - export * - } - - module charconv { - module chars_format { header "__charconv/chars_format.h" } - module from_chars_integral { header "__charconv/from_chars_integral.h" } - module from_chars_result { header "__charconv/from_chars_result.h" } - module tables { header "__charconv/tables.h" } - module to_chars { header "__charconv/to_chars.h" } - module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } - module 
to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } - module to_chars_integral { header "__charconv/to_chars_integral.h" } - module to_chars_result { header "__charconv/to_chars_result.h" } - module traits { header "__charconv/traits.h" } - - header "charconv" - export * - } - - module chrono { - module calendar { header "__chrono/calendar.h" } - module concepts { header "__chrono/concepts.h" } - module convert_to_timespec { header "__chrono/convert_to_timespec.h" } - module convert_to_tm { header "__chrono/convert_to_tm.h" } - module day { header "__chrono/day.h" } - module duration { header "__chrono/duration.h" } - module exception { header "__chrono/exception.h" } - module file_clock { header "__chrono/file_clock.h" } - module formatter { header "__chrono/formatter.h" } - module hh_mm_ss { header "__chrono/hh_mm_ss.h" } - module high_resolution_clock { - header "__chrono/high_resolution_clock.h" - export * - } - module leap_second { - header "__chrono/leap_second.h" - } - module literals { - header "__chrono/literals.h" - } - module local_info { - header "__chrono/local_info.h" - export std.chrono.sys_info - } - module month_weekday { header "__chrono/month_weekday.h" } - module month { header "__chrono/month.h" } - module monthday { header "__chrono/monthday.h" } - module ostream { header "__chrono/ostream.h" } - module parser_std_format_spec { header "__chrono/parser_std_format_spec.h" } - module statically_widen { header "__chrono/statically_widen.h" } - module steady_clock { - header "__chrono/steady_clock.h" - export std.chrono.time_point - } - module sys_info { - header "__chrono/sys_info.h" - } - module system_clock { - header "__chrono/system_clock.h" - export std.chrono.time_point - } - module time_point { header "__chrono/time_point.h" } - module time_zone_link { header "__chrono/time_zone_link.h" } - module time_zone { header "__chrono/time_zone.h" } - module tzdb_list { - header "__chrono/tzdb_list.h" - export std.forward_list 
// forward_list iterators are used to implement this API - export std.string_view // by-value argument of type std::string_view - } - module tzdb { - header "__chrono/tzdb.h" - export std.string // public data member of type std::string - export std.vector // public data members of type std::vector - } - module weekday { header "__chrono/weekday.h" } - module year_month_day { header "__chrono/year_month_day.h" } - module year_month_weekday { header "__chrono/year_month_weekday.h" } - module year_month { header "__chrono/year_month.h" } - module year { header "__chrono/year.h" } - module zoned_time { header "__chrono/zoned_time.h" } - - header "chrono" - export * - } // module chrono - - module codecvt { - header "codecvt" - export * - } - - module compare { - module common_comparison_category { header "__compare/common_comparison_category.h" } - module compare_partial_order_fallback { header "__compare/compare_partial_order_fallback.h" } - module compare_strong_order_fallback { header "__compare/compare_strong_order_fallback.h" } - module compare_three_way { header "__compare/compare_three_way.h" } - module compare_three_way_result { header "__compare/compare_three_way_result.h" } - module compare_weak_order_fallback { header "__compare/compare_weak_order_fallback.h" } - module is_eq { header "__compare/is_eq.h" } - module ordering { header "__compare/ordering.h" } - module partial_order { header "__compare/partial_order.h" } - module strong_order { header "__compare/strong_order.h" } - module synth_three_way { header "__compare/synth_three_way.h" } - module three_way_comparable { header "__compare/three_way_comparable.h" } - module weak_order { header "__compare/weak_order.h" } - - header "compare" - export * - } - - module complex { - module fwd { header "__fwd/complex.h" } - - header "complex" - export * - } - - module concepts { - module arithmetic { header "__concepts/arithmetic.h" } - module assignable { header "__concepts/assignable.h" } - module 
boolean_testable { header "__concepts/boolean_testable.h" } - module class_or_enum { header "__concepts/class_or_enum.h" } - module common_reference_with { header "__concepts/common_reference_with.h" } - module common_with { header "__concepts/common_with.h" } - module constructible { header "__concepts/constructible.h" } - module convertible_to { header "__concepts/convertible_to.h" } - module copyable { header "__concepts/copyable.h" } - module derived_from { header "__concepts/derived_from.h" } - module destructible { header "__concepts/destructible.h" } - module different_from { header "__concepts/different_from.h" } - module equality_comparable { header "__concepts/equality_comparable.h" } - module invocable { header "__concepts/invocable.h" } - module movable { header "__concepts/movable.h" } - module predicate { header "__concepts/predicate.h" } - module regular { header "__concepts/regular.h" } - module relation { header "__concepts/relation.h" } - module same_as { header "__concepts/same_as.h" } - module semiregular { header "__concepts/semiregular.h" } - module swappable { header "__concepts/swappable.h" } - module totally_ordered { header "__concepts/totally_ordered.h" } - - header "concepts" - export * - } - - module condition_variable { - module condition_variable { header "__condition_variable/condition_variable.h" } - - header "condition_variable" - export * - } - - module cassert { - textual header "cassert" // NDEBUG requires textual inclusion - } - - module ccomplex { - header "ccomplex" - export * - } - - module cctype { - header "cctype" - export * - } - - module cerrno { - header "cerrno" - export * - } - - module cfenv { - header "cfenv" - export * - } - - module cfloat { - header "cfloat" - export * - } - - module cinttypes { - header "cinttypes" - export * - } - - module ciso646 { - header "ciso646" - export * - } - - module climits { - header "climits" - export * - } - - module clocale { - header "clocale" - export * - } - - module cmath { 
- header "cmath" - export * - } - - // TODO: Make non-textual. This seems to cause problems when compiling against Glibc. - module csetjmp { - textual header "csetjmp" - } - - module csignal { - header "csignal" - export * - } - - module cstdarg { - header "cstdarg" - export * - } - - module cstdbool { - header "cstdbool" - export * - } - - module cstddef { - header "cstddef" - export * - } - - module cstdio { - header "cstdio" - export * - } - - module cstdlib { - header "cstdlib" - export * - } - - module cstring { - header "cstring" - export * - } - - module ctgmath { - header "ctgmath" - export * - } - - module ctime { - header "ctime" - export * - } - - module cuchar { - header "cuchar" - export * - } - - module cwchar { - header "cwchar" - export * - } - - module cwctype { - header "cwctype" - export * - } - - module deque { - module fwd { header "__fwd/deque.h" } - - header "deque" - export * - } - - module exception { - module exception { header "__exception/exception.h" } - module exception_ptr { header "__exception/exception_ptr.h" } - module nested_exception { header "__exception/nested_exception.h" } - module operations { header "__exception/operations.h" } - module terminate { header "__exception/terminate.h" } - - header "exception" - export * - } - - module execution { - header "execution" - export * - } - - module expected { - module bad_expected_access { header "__expected/bad_expected_access.h" } - module expected { header "__expected/expected.h" } - module unexpect { header "__expected/unexpect.h" } - module unexpected { header "__expected/unexpected.h" } - - header "expected" - export * - } - - module filesystem { - module copy_options { header "__filesystem/copy_options.h" } - module directory_entry { header "__filesystem/directory_entry.h" } - module directory_iterator { header "__filesystem/directory_iterator.h" } - module directory_options { header "__filesystem/directory_options.h" } - module file_status { header 
"__filesystem/file_status.h" } - module file_time_type { header "__filesystem/file_time_type.h" } - module file_type { header "__filesystem/file_type.h" } - module filesystem_error { header "__filesystem/filesystem_error.h" } - module operations { header "__filesystem/operations.h" } - module path_iterator { header "__filesystem/path_iterator.h" } - module path { - header "__filesystem/path.h" - export std.string // returned by various methods of filesystem::path - } - module perm_options { header "__filesystem/perm_options.h" } - module perms { header "__filesystem/perms.h" } - module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } - module space_info { header "__filesystem/space_info.h" } - module u8path { header "__filesystem/u8path.h" } - - header "filesystem" - export * - } - - module format { - module buffer { header "__format/buffer.h" } - module concepts { header "__format/concepts.h" } - module container_adaptor { header "__format/container_adaptor.h" } - module enable_insertable { header "__format/enable_insertable.h" } - module escaped_output_table { header "__format/escaped_output_table.h" } - module extended_grapheme_cluster_table { header "__format/extended_grapheme_cluster_table.h" } - module format_arg { header "__format/format_arg.h" } - module format_arg_store { header "__format/format_arg_store.h" } - module format_args { header "__format/format_args.h" } - module format_context { - header "__format/format_context.h" - export std.optional // default argument for __format_context_create - } - module format_error { - header "__format/format_error.h" - } - module format_functions { - header "__format/format_functions.h" - export std.string // returned by the functions in that header - } - module format_parse_context { header "__format/format_parse_context.h" } - module format_string { header "__format/format_string.h" } - module format_to_n_result { header "__format/format_to_n_result.h" } - module formatter { 
header "__format/formatter.h" } - module formatter_bool { header "__format/formatter_bool.h" } - module formatter_char { header "__format/formatter_char.h" } - module formatter_floating_point { header "__format/formatter_floating_point.h" } - module formatter_integer { header "__format/formatter_integer.h" } - module formatter_integral { header "__format/formatter_integral.h" } - module formatter_output { header "__format/formatter_output.h" } - module formatter_pointer { header "__format/formatter_pointer.h" } - module formatter_string { header "__format/formatter_string.h" } - module formatter_tuple { header "__format/formatter_tuple.h" } - module fwd { header "__fwd/format.h" } - module indic_conjunct_break_table { header "__format/indic_conjunct_break_table.h" } - module parser_std_format_spec { header "__format/parser_std_format_spec.h" } - module range_default_formatter { header "__format/range_default_formatter.h" } - module range_formatter { header "__format/range_formatter.h" } - module unicode { header "__format/unicode.h" } - module width_estimation_table { header "__format/width_estimation_table.h" } - module write_escaped { header "__format/write_escaped.h" } - - header "format" - export * - } // module format - - module forward_list { - header "forward_list" - export * - } - - module fstream { - module fwd { header "__fwd/fstream.h" } - - header "fstream" - export * - } - - module functional { - module binary_function { header "__functional/binary_function.h" } - module binary_negate { header "__functional/binary_negate.h" } - module bind_back { - header "__functional/bind_back.h" - export std.functional.perfect_forward // inherited from and using its operators - } - module bind_front { - header "__functional/bind_front.h" - export std.functional.perfect_forward // inherited from and using its operators - } - module bind { header "__functional/bind.h" } - module binder1st { header "__functional/binder1st.h" } - module binder2nd { header 
"__functional/binder2nd.h" } - module boyer_moore_searcher { - header "__functional/boyer_moore_searcher.h" - export std.memory.shared_ptr - } - module compose { - header "__functional/compose.h" - export std.functional.perfect_forward // inherited from and using its operators - } - module default_searcher { header "__functional/default_searcher.h" } - module function { header "__functional/function.h" } - module hash { header "__functional/hash.h" } - module identity { header "__functional/identity.h" } - module invoke { header "__functional/invoke.h" } - module is_transparent { header "__functional/is_transparent.h" } - module mem_fn { header "__functional/mem_fn.h" } - module mem_fun_ref { header "__functional/mem_fun_ref.h" } - module not_fn { - header "__functional/not_fn.h" - export std.functional.perfect_forward // inherited from and using its operators - } - module operations { header "__functional/operations.h" } - module perfect_forward { - header "__functional/perfect_forward.h" - export std.tuple - } - module pointer_to_binary_function { header "__functional/pointer_to_binary_function.h" } - module pointer_to_unary_function { header "__functional/pointer_to_unary_function.h" } - module ranges_operations { header "__functional/ranges_operations.h" } - module reference_wrapper { header "__functional/reference_wrapper.h" } - module unary_function { header "__functional/unary_function.h" } - module unary_negate { header "__functional/unary_negate.h" } - module weak_result_type { header "__functional/weak_result_type.h" } - - header "functional" - export * - } // module functional - - module future { - header "future" - export * - } - - module initializer_list { - header "initializer_list" - export * - } - - module iomanip { - header "iomanip" - export * - } - - module ios { - module fwd { header "__fwd/ios.h" } - module fpos { header "__ios/fpos.h" } - - header "ios" - export * - } - - module iosfwd { - header "iosfwd" - export * - } - - module iostream { - 
header "iostream" - export * - } - - module istream { - module fwd { header "__fwd/istream.h" } - - header "istream" - export std.ios // base class - } - - module iterator { - module access { header "__iterator/access.h" } - module advance { header "__iterator/advance.h" } - module aliasing_iterator { header "__iterator/aliasing_iterator.h" } - module back_insert_iterator { header "__iterator/back_insert_iterator.h" } - module bounded_iter { header "__iterator/bounded_iter.h" } - module common_iterator { header "__iterator/common_iterator.h" } - module concepts { - header "__iterator/concepts.h" - export std_core.type_traits.common_reference - } - module counted_iterator { header "__iterator/counted_iterator.h" } - module cpp17_iterator_concepts { header "__iterator/cpp17_iterator_concepts.h" } - module data { header "__iterator/data.h" } - module default_sentinel { header "__iterator/default_sentinel.h" } - module distance { header "__iterator/distance.h" } - module empty { header "__iterator/empty.h" } - module erase_if_container { header "__iterator/erase_if_container.h" } - module front_insert_iterator { header "__iterator/front_insert_iterator.h" } - module incrementable_traits { header "__iterator/incrementable_traits.h" } - module indirectly_comparable { header "__iterator/indirectly_comparable.h" } - module insert_iterator { header "__iterator/insert_iterator.h" } - module istream_iterator { header "__iterator/istream_iterator.h" } - module istreambuf_iterator { header "__iterator/istreambuf_iterator.h" } - module iter_move { header "__iterator/iter_move.h" } - module iter_swap { header "__iterator/iter_swap.h" } - module iterator_traits { - header "__iterator/iterator_traits.h" - export std_core.type_traits.integral_constant - } - module iterator_with_data { header "__iterator/iterator_with_data.h" } - module iterator { header "__iterator/iterator.h" } - module mergeable { header "__iterator/mergeable.h" } - module move_iterator { header 
"__iterator/move_iterator.h" } - module move_sentinel { header "__iterator/move_sentinel.h" } - module next { header "__iterator/next.h" } - module ostream_iterator { header "__iterator/ostream_iterator.h" } - module ostreambuf_iterator { - header "__iterator/ostreambuf_iterator.h" - export iosfwd // for default template argument of ostreambuf_iterator - } - module permutable { header "__iterator/permutable.h" } - module prev { header "__iterator/prev.h" } - module projected { header "__iterator/projected.h" } - module ranges_iterator_traits { header "__iterator/ranges_iterator_traits.h" } - module readable_traits { header "__iterator/readable_traits.h" } - module reverse_access { header "__iterator/reverse_access.h" } - module reverse_iterator { header "__iterator/reverse_iterator.h" } - module segmented_iterator { header "__iterator/segmented_iterator.h" } - module size { header "__iterator/size.h" } - module sortable { header "__iterator/sortable.h" } - module unreachable_sentinel { header "__iterator/unreachable_sentinel.h" } - module wrap_iter { header "__iterator/wrap_iter.h" } - - header "iterator" - export * - } - - module latch { - header "latch" - export * - } - - module list { - header "list" - export * - } - - module locale { - header "locale" - header "__locale_dir/locale_base_api.h" - header "__locale_dir/locale_base_api/locale_guard.h" - module locale_base_api { - textual header "__locale_dir/locale_base_api/android.h" - textual header "__locale_dir/locale_base_api/bsd_locale_defaults.h" - textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" - textual header "__locale_dir/locale_base_api/fuchsia.h" - textual header "__locale_dir/locale_base_api/ibm.h" - textual header "__locale_dir/locale_base_api/musl.h" - textual header "__locale_dir/locale_base_api/newlib.h" - textual header "__locale_dir/locale_base_api/openbsd.h" - textual header "__locale_dir/locale_base_api/win32.h" - } - export * - } - - // TODO: Understand why this needs to 
live in its own module - module locale_base [system] { - header "__locale" - export * - } - - module map { - header "map" - export * - } - - module mdspan { - module default_accessor { header "__mdspan/default_accessor.h" } - module extents { header "__mdspan/extents.h" } - module fwd { header "__fwd/mdspan.h" } - module layout_left { header "__mdspan/layout_left.h" } - module layout_right { header "__mdspan/layout_right.h" } - module layout_stride { header "__mdspan/layout_stride.h" } - module mdspan { - header "__mdspan/mdspan.h" - export std.array // returned by some methods - } - - header "mdspan" - export * - } - - module memory { - module addressof { header "__memory/addressof.h" } - module align { header "__memory/align.h" } - module aligned_alloc { header "__memory/aligned_alloc.h" } - module allocate_at_least { header "__memory/allocate_at_least.h" } - module allocation_guard { header "__memory/allocation_guard.h" } - module allocator { header "__memory/allocator.h" } - module allocator_arg_t { header "__memory/allocator_arg_t.h" } - module allocator_destructor { header "__memory/allocator_destructor.h" } - module allocator_traits { header "__memory/allocator_traits.h" } - module assume_aligned { header "__memory/assume_aligned.h" } - module array_cookie { header "__memory/array_cookie.h" } - module auto_ptr { header "__memory/auto_ptr.h" } - module builtin_new_allocator { header "__memory/builtin_new_allocator.h" } - module compressed_pair { header "__memory/compressed_pair.h" } - module concepts { header "__memory/concepts.h" } - module construct_at { header "__memory/construct_at.h" } - module destruct_n { header "__memory/destruct_n.h" } - module fwd { header "__fwd/memory.h" } - module inout_ptr { header "__memory/inout_ptr.h" } - module noexcept_move_assign_container { header "__memory/noexcept_move_assign_container.h" } - module out_ptr { header "__memory/out_ptr.h" } - module pointer_traits { header "__memory/pointer_traits.h" } - module 
ranges_construct_at { header "__memory/ranges_construct_at.h" } - module ranges_uninitialized_algorithms { - header "__memory/ranges_uninitialized_algorithms.h" - export std.algorithm.in_out_result - } - module raw_storage_iterator { header "__memory/raw_storage_iterator.h" } - module shared_ptr { header "__memory/shared_ptr.h" } - module swap_allocator { header "__memory/swap_allocator.h" } - module temp_value { header "__memory/temp_value.h" } - module temporary_buffer { - header "__memory/temporary_buffer.h" - export std.utility.pair // return type of std::get_temporary_buffer() - } - module uninitialized_algorithms { - header "__memory/uninitialized_algorithms.h" - } - module unique_ptr { - header "__memory/unique_ptr.h" - } - module unique_temporary_buffer { - header "__memory/unique_temporary_buffer.h" - export std.memory.unique_ptr - export std_core.type_traits.is_constant_evaluated - } - module uses_allocator { header "__memory/uses_allocator.h" } - module uses_allocator_construction { header "__memory/uses_allocator_construction.h" } - module voidify { header "__memory/voidify.h" } - - header "memory" - export * - } - - module memory_resource { - module fwd { header "__fwd/memory_resource.h" } - module memory_resource { header "__memory_resource/memory_resource.h" } - module monotonic_buffer_resource { header "__memory_resource/monotonic_buffer_resource.h" } - module polymorphic_allocator { header "__memory_resource/polymorphic_allocator.h" } - module pool_options { header "__memory_resource/pool_options.h" } - module synchronized_pool_resource { header "__memory_resource/synchronized_pool_resource.h" } - module unsynchronized_pool_resource { header "__memory_resource/unsynchronized_pool_resource.h" } - - header "memory_resource" - export * - } - - module mutex { - module lock_guard { header "__mutex/lock_guard.h" } - module mutex { header "__mutex/mutex.h" } - module once_flag { header "__mutex/once_flag.h" } - module tag_types { header 
"__mutex/tag_types.h" } - module unique_lock { header "__mutex/unique_lock.h" } - - header "mutex" - export * - } - - module new { - header "new" - export * - } - - module numbers { - header "numbers" - export * - } - - module numeric { - module accumulate { header "__numeric/accumulate.h" } - module adjacent_difference { header "__numeric/adjacent_difference.h" } - module exclusive_scan { header "__numeric/exclusive_scan.h" } - module gcd_lcm { header "__numeric/gcd_lcm.h" } - module inclusive_scan { header "__numeric/inclusive_scan.h" } - module inner_product { header "__numeric/inner_product.h" } - module iota { header "__numeric/iota.h" } - module midpoint { header "__numeric/midpoint.h" } - module partial_sum { header "__numeric/partial_sum.h" } - module pstl { header "__numeric/pstl.h" } - module reduce { header "__numeric/reduce.h" } - module saturation_arithmetic { header "__numeric/saturation_arithmetic.h" } - module transform_exclusive_scan { header "__numeric/transform_exclusive_scan.h" } - module transform_inclusive_scan { header "__numeric/transform_inclusive_scan.h" } - module transform_reduce { header "__numeric/transform_reduce.h" } - - header "numeric" - export * - } - - module optional { - header "optional" - export * - } - - module ostream { - module basic_ostream { - header "__ostream/basic_ostream.h" - export std.ios // base class - } - module fwd { - header "__fwd/ostream.h" - } - module print { - header "__ostream/print.h" - export * - } - - header "ostream" - export * - } - - module print { - header "print" - export * - } - - module queue { - module fwd { header "__fwd/queue.h" } - - header "queue" - export * - } - - module random { - module bernoulli_distribution { header "__random/bernoulli_distribution.h" } - module binomial_distribution { header "__random/binomial_distribution.h" } - module cauchy_distribution { header "__random/cauchy_distribution.h" } - module chi_squared_distribution { header "__random/chi_squared_distribution.h" } - 
module clamp_to_integral { header "__random/clamp_to_integral.h" } - module default_random_engine { header "__random/default_random_engine.h" } - module discard_block_engine { header "__random/discard_block_engine.h" } - module discrete_distribution { header "__random/discrete_distribution.h" } - module exponential_distribution { header "__random/exponential_distribution.h" } - module extreme_value_distribution { header "__random/extreme_value_distribution.h" } - module fisher_f_distribution { header "__random/fisher_f_distribution.h" } - module gamma_distribution { header "__random/gamma_distribution.h" } - module generate_canonical { header "__random/generate_canonical.h" } - module geometric_distribution { header "__random/geometric_distribution.h" } - module independent_bits_engine { header "__random/independent_bits_engine.h" } - module is_seed_sequence { header "__random/is_seed_sequence.h" } - module is_valid { - header "__random/is_valid.h" - export std_core.type_traits.integral_constant - } - module knuth_b { header "__random/knuth_b.h" } - module linear_congruential_engine { header "__random/linear_congruential_engine.h" } - module log2 { header "__random/log2.h" } - module lognormal_distribution { header "__random/lognormal_distribution.h" } - module mersenne_twister_engine { header "__random/mersenne_twister_engine.h" } - module negative_binomial_distribution { header "__random/negative_binomial_distribution.h" } - module normal_distribution { header "__random/normal_distribution.h" } - module piecewise_constant_distribution { header "__random/piecewise_constant_distribution.h" } - module piecewise_linear_distribution { header "__random/piecewise_linear_distribution.h" } - module poisson_distribution { header "__random/poisson_distribution.h" } - module random_device { header "__random/random_device.h" } - module ranlux { header "__random/ranlux.h" } - module seed_seq { header "__random/seed_seq.h" } - module shuffle_order_engine { header 
"__random/shuffle_order_engine.h" } - module student_t_distribution { header "__random/student_t_distribution.h" } - module subtract_with_carry_engine { header "__random/subtract_with_carry_engine.h" } - module uniform_int_distribution { header "__random/uniform_int_distribution.h" } - module uniform_random_bit_generator { header "__random/uniform_random_bit_generator.h" } - module uniform_real_distribution { header "__random/uniform_real_distribution.h" } - module weibull_distribution { header "__random/weibull_distribution.h" } - - header "random" - export * - } - - module ranges { - module access { header "__ranges/access.h" } - module all { header "__ranges/all.h" } - module as_rvalue_view { header "__ranges/as_rvalue_view.h" } - module chunk_by_view { - header "__ranges/chunk_by_view.h" - export std.functional.bind_back - } - module common_view { header "__ranges/common_view.h" } - module concepts { header "__ranges/concepts.h" } - module container_compatible_range { header "__ranges/container_compatible_range.h" } - module counted { - header "__ranges/counted.h" - export std.span // return type of views::counted - export std.ranges.subrange // return type of views::counted - } - module dangling { - header "__ranges/dangling.h" - } - module data { - header "__ranges/data.h" - } - module drop_view { - header "__ranges/drop_view.h" - export std.functional.bind_back - } - module drop_while_view { - header "__ranges/drop_while_view.h" - export std.functional.bind_back - } - module elements_view { header "__ranges/elements_view.h" } - module empty { header "__ranges/empty.h" } - module empty_view { header "__ranges/empty_view.h" } - module enable_borrowed_range { header "__ranges/enable_borrowed_range.h" } - module enable_view { header "__ranges/enable_view.h" } - module filter_view { - header "__ranges/filter_view.h" - export std.functional.bind_back - } - module from_range { header "__ranges/from_range.h" } - module iota_view { header "__ranges/iota_view.h" } - 
module istream_view { header "__ranges/istream_view.h" } - module join_view { header "__ranges/join_view.h" } - module lazy_split_view { - header "__ranges/lazy_split_view.h" - export std.functional.bind_back - } - module movable_box { header "__ranges/movable_box.h" } - module non_propagating_cache { header "__ranges/non_propagating_cache.h" } - module owning_view { header "__ranges/owning_view.h" } - module range_adaptor { header "__ranges/range_adaptor.h" } - module rbegin { header "__ranges/rbegin.h" } - module ref_view { header "__ranges/ref_view.h" } - module rend { header "__ranges/rend.h" } - module repeat_view { header "__ranges/repeat_view.h" } - module reverse_view { header "__ranges/reverse_view.h" } - module single_view { header "__ranges/single_view.h" } - module size { header "__ranges/size.h" } - module split_view { - header "__ranges/split_view.h" - export std.functional.bind_back - } - module subrange { - header "__ranges/subrange.h" - export std.ranges.subrange_fwd - } - module subrange_fwd { - header "__fwd/subrange.h" - } - module take_view { - header "__ranges/take_view.h" - export std.functional.bind_back - } - module take_while_view { - header "__ranges/take_while_view.h" - export std.functional.bind_back - } - module to { - header "__ranges/to.h" - export std.functional.bind_back - } - module transform_view { - header "__ranges/transform_view.h" - export std.functional.bind_back - } - module view_interface { - header "__ranges/view_interface.h" - } - module views { - header "__ranges/views.h" - } - module zip_view { - header "__ranges/zip_view.h" - export std.utility.pair - } - - header "ranges" - export * - } // module ranges - - module ratio { - header "ratio" - export * - } - - module regex { - header "regex" - export * - } - - module scoped_allocator { - header "scoped_allocator" - export * - } - - module semaphore { - header "semaphore" - export * - } - - module set { - header "set" - export * - } - - module shared_mutex { - header 
"shared_mutex" - export * - } - - module source_location { - header "source_location" - export * - } - - module span { - module fwd { header "__fwd/span.h" } - - header "span" - export * - } - - module sstream { - module fwd { header "__fwd/sstream.h" } - - header "sstream" - export * - } - - module stack { - module fwd { header "__fwd/stack.h" } - - header "stack" - export * - } - - module stdexcept { - header "stdexcept" - export * - } - - module stop_token { - module atomic_unique_lock { header "__stop_token/atomic_unique_lock.h" } - module intrusive_list_view { header "__stop_token/intrusive_list_view.h" } - module intrusive_shared_ptr { header "__stop_token/intrusive_shared_ptr.h" } - module stop_callback { header "__stop_token/stop_callback.h" } - module stop_source { header "__stop_token/stop_source.h" } - module stop_state { header "__stop_token/stop_state.h" } - module stop_token { header "__stop_token/stop_token.h" } - - header "stop_token" - export * - } - - module streambuf { - module fwd { header "__fwd/streambuf.h" } - - header "streambuf" - export * - } - - module string { - module char_traits { header "__string/char_traits.h" } - module constexpr_c_functions { header "__string/constexpr_c_functions.h" } - module extern_template_lists { header "__string/extern_template_lists.h" } - module fwd { header "__fwd/string.h" } - - header "string" - export * - } - - module string_view { - module fwd { header "__fwd/string_view.h" } - - header "string_view" - export * - } - - module strstream { - header "strstream" - export * - } - - module syncstream { - header "syncstream" - export * - } - - module system_error { - module errc { header "__system_error/errc.h" } - module error_category { header "__system_error/error_category.h" } - module error_code { - header "__system_error/error_code.h" - export std.system_error.error_category // methods of error_code return that type - } - module error_condition { header "__system_error/error_condition.h" } - module 
system_error { header "__system_error/system_error.h" } - - header "system_error" - export * - } - - module thread { - module formatter { header "__thread/formatter.h" } - module id { header "__thread/id.h" } - module jthread { header "__thread/jthread.h" } - module poll_with_backoff { header "__thread/poll_with_backoff.h" } - module this_thread { header "__thread/this_thread.h" } - module thread { header "__thread/thread.h" } - module timed_backoff_policy { header "__thread/timed_backoff_policy.h" } - - module support { - header "__thread/support.h" - export * - } - module support_impl { - textual header "__thread/support/c11.h" - textual header "__thread/support/external.h" - textual header "__thread/support/pthread.h" - textual header "__thread/support/windows.h" - } - - header "thread" - export * - } - - module tuple { - module find_index { header "__tuple/find_index.h" } - module ignore { header "__tuple/ignore.h" } - module make_tuple_types { header "__tuple/make_tuple_types.h" } - module sfinae_helpers { header "__tuple/sfinae_helpers.h" } - module tuple_element { header "__tuple/tuple_element.h" } - module tuple_indices { header "__tuple/tuple_indices.h" } - module tuple_like_ext { header "__tuple/tuple_like_ext.h" } - module tuple_like_no_subrange { header "__tuple/tuple_like_no_subrange.h" } - module tuple_like { header "__tuple/tuple_like.h" } - module tuple_size { header "__tuple/tuple_size.h" } - module tuple_types { header "__tuple/tuple_types.h" } - - header "tuple" - export * +module std_any [system] { + header "any" + export * +} +module std_array [system] { + header "array" + export * +} +module std_atomic [system] { + header "atomic" + export * +} +module std_barrier [system] { + header "barrier" + export * +} +module std_bit [system] { + header "bit" + export * +} +module std_bitset [system] { + header "bitset" + export * +} +module std_charconv [system] { + header "charconv" + module chars_format { header "__charconv/chars_format.h" } + module 
from_chars_integral { header "__charconv/from_chars_integral.h" } + module from_chars_result { header "__charconv/from_chars_result.h" } + module tables { header "__charconv/tables.h" } + module to_chars { header "__charconv/to_chars.h" } + module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } + module to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } + module to_chars_integral { header "__charconv/to_chars_integral.h" } + module to_chars_result { header "__charconv/to_chars_result.h" } + module traits { header "__charconv/traits.h" } + export * +} +module std_chrono [system] { + header "chrono" + export * +} +module std_codecvt [system] { + header "codecvt" + export * +} +module std_compare [system] { + header "compare" + export * +} +module std_complex [system] { + header "complex" + export * +} +module std_concepts [system] { + header "concepts" + export * +} +module std_condition_variable [system] { + header "condition_variable" + module condition_variable { header "__condition_variable/condition_variable.h" } + export * +} +module std_coroutine [system] { + header "coroutine" + module coroutine_handle { header "__coroutine/coroutine_handle.h" } + module coroutine_traits { header "__coroutine/coroutine_traits.h" } + module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } + module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } + export * +} +module std_deque [system] { + header "deque" + export * +} +module std_exception [system] { + header "exception" + export * +} +module std_execution [system] { + header "execution" + export * +} +module std_expected [system] { + header "expected" + export * +} +module std_filesystem [system] { + header "filesystem" + module copy_options { header "__filesystem/copy_options.h" } + module directory_entry { header "__filesystem/directory_entry.h" } + module directory_iterator { header "__filesystem/directory_iterator.h" } + module directory_options { 
header "__filesystem/directory_options.h" } + module file_status { header "__filesystem/file_status.h" } + module file_time_type { header "__filesystem/file_time_type.h" } + module file_type { header "__filesystem/file_type.h" } + module filesystem_error { + header "__filesystem/filesystem_error.h" + export std_private_memory_shared_ptr + } + module operations { header "__filesystem/operations.h" } + module path { + header "__filesystem/path.h" + export std_string // returned by various methods + } + module path_iterator { header "__filesystem/path_iterator.h" } + module perm_options { header "__filesystem/perm_options.h" } + module perms { header "__filesystem/perms.h" } + module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } + module space_info { header "__filesystem/space_info.h" } + module u8path { header "__filesystem/u8path.h" } + export * +} +module std_format [system] { + header "format" + export * +} +module std_forward_list [system] { + header "forward_list" + export * +} +module std_fstream [system] { + header "fstream" + export * +} +module std_functional [system] { + header "functional" + export * +} +module std_future [system] { + header "future" + export * +} +module std_initializer_list [system] { + header "initializer_list" + export * +} +module std_iomanip [system] { + header "iomanip" + export * +} +module std_ios [system] { + header "ios" + export * +} +module std_iosfwd [system] { + header "iosfwd" + export * +} +module std_iostream [system] { + header "iostream" + export * +} +module std_istream [system] { + header "istream" + export * +} +module std_iterator [system] { + header "iterator" + export * +} +module std_latch [system] { + header "latch" + export * +} +module std_limits [system] { + header "limits" + export * +} +module std_list [system] { + header "list" + export * +} +module std_locale [system] { + header "locale" + export * +} +module std_map [system] { + header "map" + export * +} +module 
std_mdspan [system] { + header "mdspan" + module default_accessor { header "__mdspan/default_accessor.h" } + module extents { header "__mdspan/extents.h" } + module fwd { header "__fwd/mdspan.h" } + module layout_left { header "__mdspan/layout_left.h" } + module layout_right { header "__mdspan/layout_right.h" } + module layout_stride { header "__mdspan/layout_stride.h" } + module mdspan { + header "__mdspan/mdspan.h" + export std_array // for strides() } + export * +} +module std_memory [system] { + header "memory" + export * +} +module std_memory_resource [system] { + header "memory_resource" + export * +} +module std_mutex [system] { + header "mutex" + export * +} +module std_new [system] { + header "new" + export * +} +module std_numbers [system] { + header "numbers" + export * +} +module std_numeric [system] { + header "numeric" + export * +} +module std_optional [system] { + header "optional" + export * +} +module std_ostream [system] { + header "ostream" + export * +} +module std_print [system] { + header "print" + export * +} +module std_queue [system] { + header "queue" + export * +} +module std_random [system] { + header "random" + export * +} +module std_ranges [system] { + header "ranges" + export * +} +module std_ratio [system] { + header "ratio" + export * +} +module std_regex [system] { + header "regex" + export * +} +module std_scoped_allocator [system] { + header "scoped_allocator" + export * +} +module std_semaphore [system] { + header "semaphore" + export * +} +module std_set [system] { + header "set" + export * +} +module std_shared_mutex [system] { + header "shared_mutex" + export std_version +} +module std_source_location [system] { + header "source_location" + export * +} +module std_span [system] { + header "span" + export std_private_ranges_enable_borrowed_range + export std_version + export std_private_span_span_fwd +} +module std_sstream [system] { + header "sstream" + export * +} +module std_stack [system] { + header "stack" + export * +} 
+module std_stdexcept [system] { + header "stdexcept" + export * +} +module std_stop_token [system] { + header "stop_token" + private header "__stop_token/atomic_unique_lock.h" + private header "__stop_token/intrusive_list_view.h" + private header "__stop_token/intrusive_shared_ptr.h" + private header "__stop_token/stop_callback.h" + private header "__stop_token/stop_source.h" + private header "__stop_token/stop_state.h" + private header "__stop_token/stop_token.h" + export * +} +module std_streambuf [system] { + header "streambuf" + export * +} +module std_string [system] { + header "string" + export * +} +module std_string_view [system] { + header "string_view" + export * +} +module std_strstream [system] { + header "strstream" + export * +} +module std_syncstream [system] { + header "syncstream" + export * +} +module std_system_error [system] { + header "system_error" + export * +} +module std_thread [system] { + header "thread" + export * +} +module std_tuple [system] { + header "tuple" + export * +} +module std_type_traits [system] { + header "type_traits" + export * +} +module std_typeindex [system] { + header "typeindex" + export * +} +module std_typeinfo [system] { + header "typeinfo" + export * +} +module std_unordered_map [system] { + header "unordered_map" + export * +} +module std_unordered_set [system] { + header "unordered_set" + export * +} +module std_utility [system] { + header "utility" + export * +} +module std_valarray [system] { + header "valarray" + export * +} +module std_variant [system] { + header "variant" + export * +} +module std_vector [system] { + header "vector" + export * +} +module std_version [system] { + header "version" + export * +} - module typeindex { - header "typeindex" - export * - } +// C standard library interface wrappers +module std_cassert [system] { + // 's use of NDEBUG requires textual inclusion. 
+ textual header "cassert" +} +module std_ccomplex [system] { + header "ccomplex" + export * +} +module std_cctype [system] { + header "cctype" + export * +} +module std_cerrno [system] { + header "cerrno" + export * +} +module std_cfenv [system] { + header "cfenv" + export * +} +module std_cfloat [system] { + header "cfloat" + export * +} +module std_cinttypes [system] { + header "cinttypes" + export * +} +module std_ciso646 [system] { + header "ciso646" + export * +} +module std_climits [system] { + header "climits" + export * +} +module std_clocale [system] { + header "clocale" + export * +} +module std_cmath [system] { + header "cmath" + export * +} +module std_csetjmp [system] { + header "csetjmp" + export * +} +module std_csignal [system] { + header "csignal" + export * +} +// FIXME: is missing. +module std_cstdarg [system] { + header "cstdarg" + export * +} +module std_cstdbool [system] { + header "cstdbool" + export * +} +module std_cstddef [system] { + header "cstddef" + module byte { header "__cstddef/byte.h" } + module max_align_t { header "__cstddef/max_align_t.h" } + module nullptr_t { header "__cstddef/nullptr_t.h" } + module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } + module size_t { header "__cstddef/size_t.h" } + export * +} +module std_cstdint [system] { + header "cstdint" + export * +} +module std_cstdio [system] { + header "cstdio" + export * +} +module std_cstdlib [system] { + header "cstdlib" + export * +} +module std_cstring [system] { + header "cstring" + export * +} +module std_ctgmath [system] { + header "ctgmath" + export * +} +module std_ctime [system] { + header "ctime" + export * +} +module std_cuchar [system] { + header "cuchar" + export * +} +module std_cwchar [system] { + header "cwchar" + export * +} +module std_cwctype [system] { + header "cwctype" + export * +} - module typeinfo { - header "typeinfo" - export * - } +// C standard library interfaces augmented/replaced in C++ +// provided by C library. 
+module std_complex_h [system] { + header "complex.h" + export * +} +module std_ctype_h [system] { + header "ctype.h" + export * +} +module std_errno_h [system] { + header "errno.h" + export * +} +module std_fenv_h [system] { + header "fenv.h" + export * +} +module std_float_h [system] { + header "float.h" + export * +} +module std_inttypes_h [system] { + header "inttypes.h" + export * +} +// provided by compiler. +module std_locale_h [system] { + header "locale.h" + export * +} +module std_math_h [system] { + header "math.h" + export * +} +// provided by C library. +// provided by C library. +// FIXME: is missing. +// provided by compiler. +module std_stdatomic_h [system] { + header "stdatomic.h" + export * +} +module std_stdbool_h [system] { + // 's __bool_true_false_are_defined macro requires textual inclusion. + textual header "stdbool.h" + export * +} +module std_stddef_h [system] { + // 's __need_* macros require textual inclusion. + textual header "stddef.h" + export * +} +module std_stdint_h [system] { + header "stdint.h" + export * +} +module std_stdio_h [system] { + // 's __need_* macros require textual inclusion. + textual header "stdio.h" + export * +} +module std_stdlib_h [system] { + // 's __need_* macros require textual inclusion. + textual header "stdlib.h" + export * +} +module std_string_h [system] { + header "string.h" + export * +} +module std_tgmath_h [system] { + header "tgmath.h" + export * +} +module std_uchar_h [system] { + header "uchar.h" + export * +} +// provided by C library. +module std_wchar_h [system] { + // 's __need_* macros require textual inclusion. 
+ textual header "wchar.h" + export * +} +module std_wctype_h [system] { + header "wctype.h" + export * +} - module unordered_map { - header "unordered_map" +// Experimental C++ standard library interfaces +module std_experimental [system] { + module iterator { + header "experimental/iterator" export * } - - module unordered_set { - header "unordered_set" + module memory { + header "experimental/memory" export * } - - module utility { - module as_const { header "__utility/as_const.h" } - module as_lvalue { header "__utility/as_lvalue.h" } - module auto_cast { - header "__utility/auto_cast.h" - export std_core.type_traits.decay // the macro expansion uses that trait - } - module cmp { header "__utility/cmp.h" } - module convert_to_integral { header "__utility/convert_to_integral.h" } - module exception_guard { header "__utility/exception_guard.h" } - module exchange { header "__utility/exchange.h" } - module forward_like { header "__utility/forward_like.h" } - module in_place { - header "__utility/in_place.h" - export std_core.type_traits.integral_constant - } - module integer_sequence { header "__utility/integer_sequence.h" } - module is_pointer_in_range { header "__utility/is_pointer_in_range.h" } - module is_valid_range { header "__utility/is_valid_range.h" } - module move { header "__utility/move.h" } - module no_destroy { header "__utility/no_destroy.h" } - module pair { header "__utility/pair.h" } - module piecewise_construct { header "__utility/piecewise_construct.h" } - module priority_tag { header "__utility/priority_tag.h" } - module private_constructor_tag { header "__utility/private_constructor_tag.h" } - module rel_ops { header "__utility/rel_ops.h" } - module small_buffer { header "__utility/small_buffer.h" } - module swap { header "__utility/swap.h" } - module to_underlying { header "__utility/to_underlying.h" } - module unreachable { header "__utility/unreachable.h" } - - header "utility" + module propagate_const { + header 
"experimental/propagate_const" export * } + module simd { + module aligned_tag { private header "experimental/__simd/aligned_tag.h" } + module declaration { private header "experimental/__simd/declaration.h" } + module reference { private header "experimental/__simd/reference.h" } + module scalar { private header "experimental/__simd/scalar.h" } + module simd { private header "experimental/__simd/simd.h" } + module simd_mask { private header "experimental/__simd/simd_mask.h" } + module traits { private header "experimental/__simd/traits.h" } + module utility { private header "experimental/__simd/utility.h" } + module vec_ext { private header "experimental/__simd/vec_ext.h" } - module valarray { - header "valarray" + header "experimental/simd" export * } - - module variant { - module fwd { header "__fwd/variant.h" } - module monostate { header "__variant/monostate.h" } - - header "variant" + module type_traits { + header "experimental/type_traits" export * } - - module vector { - module fwd { header "__fwd/vector.h" } - - header "vector" + module utility { + header "experimental/utility" export * } +} - // Experimental C++ Standard Library interfaces - module experimental { - module iterator { header "experimental/iterator" } - module memory { header "experimental/memory" } - module propagate_const { header "experimental/propagate_const" } - module type_traits { header "experimental/type_traits" } - module utility { header "experimental/utility" } - module simd { - private header "experimental/__simd/aligned_tag.h" - private header "experimental/__simd/declaration.h" - private header "experimental/__simd/reference.h" - private header "experimental/__simd/scalar.h" - private header "experimental/__simd/simd_mask.h" - private header "experimental/__simd/simd.h" - private header "experimental/__simd/traits.h" - private header "experimental/__simd/utility.h" - private header "experimental/__simd/vec_ext.h" - header "experimental/simd" - export * - } - } - - // 
Implementation detail headers that are private to libc++. These modules - // must not be directly imported. - module debug_utils { - module randomize_range { header "__debug_utils/randomize_range.h" } - module sanitizers { header "__debug_utils/sanitizers.h" } - module strict_weak_ordering_check { header "__debug_utils/strict_weak_ordering_check.h" } - } +// Convenience method to get all of the above modules in a single import statement. +// Importing only the needed modules is likely to be more performant. +module std [system] { + header "__std_clang_module" + export * +} - module get_fwd { - header "__fwd/get.h" - export std_core.fwd.pair - export std_core.fwd.tuple - export std.array.fwd - export std.complex.fwd - export std.ranges.subrange_fwd - export std.variant.fwd - } +// Implementation detail headers that are private to libc++. These modules +// must not be directly imported. +module std_private_assert [system] { + header "__assert" + export * +} +module std_private_bit_reference [system] { + header "__bit_reference" + export * +} +module std_private_fwd_bit_reference [system] { + header "__fwd/bit_reference.h" +} +module std_private_fwd_byte [system] { + header "__fwd/byte.h" +} +module std_private_config [system] { + textual header "__config" + textual header "__configuration/abi.h" + textual header "__configuration/availability.h" + textual header "__configuration/compiler.h" + textual header "__configuration/language.h" + textual header "__configuration/platform.h" + export * +} +module std_private_hash_table [system] { + header "__hash_table" + export * +} +module std_private_locale [system] { + header "__locale" + export * +} +module std_private_mbstate_t [system] { + header "__mbstate_t.h" + export * +} +module std_private_node_handle [system] { + header "__node_handle" + export * +} +module std_private_split_buffer [system] { + header "__split_buffer" + export * +} +module std_private_std_mbstate_t [system] { + header "__std_mbstate_t.h" + export * 
+} +module std_private_tree [system] { + header "__tree" + export * +} +module std_private_undef_macros [system] { + textual header "__undef_macros" + export * +} +module std_private_verbose_abort [system] { + header "__verbose_abort" + export * +} - module pstl { - module backend_fwd { - header "__pstl/backend_fwd.h" - } - module backend { - header "__pstl/backend.h" - export * // need to export everything from whatever backend is currently configured - } - module backends { - module default { - header "__pstl/backends/default.h" - export std_core.utility_core.empty - } - module libdispatch { - header "__pstl/backends/libdispatch.h" - export std.pstl.cpu_algos - export std_core.utility_core.empty - } - module serial { - header "__pstl/backends/serial.h" - export std_core.utility_core.empty - } - module std_thread { - header "__pstl/backends/std_thread.h" - export std.pstl.cpu_algos - export std_core.utility_core.empty - } - } - module cpu_algos { - module any_of { - header "__pstl/cpu_algos/any_of.h" - } - module cpu_traits { - header "__pstl/cpu_algos/cpu_traits.h" - } - module fill { - header "__pstl/cpu_algos/fill.h" - export std_core.utility_core.empty - } - module find_if { - header "__pstl/cpu_algos/find_if.h" - } - module for_each { - header "__pstl/cpu_algos/for_each.h" - export std_core.utility_core.empty - } - module merge { - header "__pstl/cpu_algos/merge.h" - } - module stable_sort { - header "__pstl/cpu_algos/stable_sort.h" - export std_core.utility_core.empty - } - module transform { - header "__pstl/cpu_algos/transform.h" - } - module transform_reduce { - header "__pstl/cpu_algos/transform_reduce.h" - } - } - module dispatch { header "__pstl/dispatch.h" } - module handle_exception { header "__pstl/handle_exception.h" } - } +module std_private_algorithm_adjacent_find [system] { header "__algorithm/adjacent_find.h" } +module std_private_algorithm_all_of [system] { header "__algorithm/all_of.h" } +module std_private_algorithm_any_of [system] { header 
"__algorithm/any_of.h" } +module std_private_algorithm_binary_search [system] { header "__algorithm/binary_search.h" } +module std_private_algorithm_clamp [system] { header "__algorithm/clamp.h" } +module std_private_algorithm_comp [system] { header "__algorithm/comp.h" } +module std_private_algorithm_comp_ref_type [system] { header "__algorithm/comp_ref_type.h" } +module std_private_algorithm_copy [system] { + header "__algorithm/copy.h" + export std_private_algorithm_copy_move_common +} +module std_private_algorithm_copy_backward [system] { header "__algorithm/copy_backward.h" } +module std_private_algorithm_copy_if [system] { header "__algorithm/copy_if.h" } +module std_private_algorithm_copy_move_common [system] { + header "__algorithm/copy_move_common.h" + export std_private_type_traits_is_trivially_copyable +} +module std_private_algorithm_copy_n [system] { header "__algorithm/copy_n.h" } +module std_private_algorithm_count [system] { header "__algorithm/count.h" } +module std_private_algorithm_count_if [system] { header "__algorithm/count_if.h" } +module std_private_algorithm_equal [system] { header "__algorithm/equal.h" } +module std_private_algorithm_equal_range [system] { header "__algorithm/equal_range.h" } +module std_private_algorithm_fill [system] { header "__algorithm/fill.h" } +module std_private_algorithm_fill_n [system] { header "__algorithm/fill_n.h" } +module std_private_algorithm_find [system] { + header "__algorithm/find.h" + export std_private_algorithm_unwrap_iter +} +module std_private_algorithm_find_end [system] { header "__algorithm/find_end.h" } +module std_private_algorithm_find_first_of [system] { header "__algorithm/find_first_of.h" } +module std_private_algorithm_find_if [system] { header "__algorithm/find_if.h" } +module std_private_algorithm_find_if_not [system] { header "__algorithm/find_if_not.h" } +module std_private_algorithm_find_segment_if [system] { header "__algorithm/find_segment_if.h" } +module 
std_private_algorithm_for_each [system] { header "__algorithm/for_each.h" } +module std_private_algorithm_for_each_n [system] { header "__algorithm/for_each_n.h" } +module std_private_algorithm_for_each_segment [system] { header "__algorithm/for_each_segment.h" } +module std_private_algorithm_generate [system] { header "__algorithm/generate.h" } +module std_private_algorithm_generate_n [system] { header "__algorithm/generate_n.h" } +module std_private_algorithm_half_positive [system] { header "__algorithm/half_positive.h" } +module std_private_algorithm_in_found_result [system] { header "__algorithm/in_found_result.h" } +module std_private_algorithm_in_fun_result [system] { header "__algorithm/in_fun_result.h" } +module std_private_algorithm_in_in_out_result [system] { header "__algorithm/in_in_out_result.h" } +module std_private_algorithm_in_in_result [system] { header "__algorithm/in_in_result.h" } +module std_private_algorithm_in_out_out_result [system] { header "__algorithm/in_out_out_result.h" } +module std_private_algorithm_in_out_result [system] { header "__algorithm/in_out_result.h" } +module std_private_algorithm_includes [system] { header "__algorithm/includes.h" } +module std_private_algorithm_inplace_merge [system] { header "__algorithm/inplace_merge.h" } +module std_private_algorithm_is_heap [system] { header "__algorithm/is_heap.h" } +module std_private_algorithm_is_heap_until [system] { header "__algorithm/is_heap_until.h" } +module std_private_algorithm_is_partitioned [system] { header "__algorithm/is_partitioned.h" } +module std_private_algorithm_is_permutation [system] { header "__algorithm/is_permutation.h" } +module std_private_algorithm_is_sorted [system] { header "__algorithm/is_sorted.h" } +module std_private_algorithm_is_sorted_until [system] { header "__algorithm/is_sorted_until.h" } +module std_private_algorithm_iter_swap [system] { header "__algorithm/iter_swap.h" } +module std_private_algorithm_iterator_operations [system] { + header 
"__algorithm/iterator_operations.h" + export * +} +module std_private_algorithm_lexicographical_compare [system] { header "__algorithm/lexicographical_compare.h" } +module std_private_algorithm_lexicographical_compare_three_way [system] { header "__algorithm/lexicographical_compare_three_way.h" } +module std_private_algorithm_lower_bound [system] { header "__algorithm/lower_bound.h" } +module std_private_algorithm_make_heap [system] { header "__algorithm/make_heap.h" } +module std_private_algorithm_make_projected [system] { header "__algorithm/make_projected.h" } +module std_private_algorithm_max [system] { header "__algorithm/max.h" } +module std_private_algorithm_max_element [system] { header "__algorithm/max_element.h" } +module std_private_algorithm_merge [system] { header "__algorithm/merge.h" } +module std_private_algorithm_min [system] { header "__algorithm/min.h" } +module std_private_algorithm_min_element [system] { header "__algorithm/min_element.h" } +module std_private_algorithm_min_max_result [system] { header "__algorithm/min_max_result.h" } +module std_private_algorithm_minmax [system] { + header "__algorithm/minmax.h" + export * +} +module std_private_algorithm_minmax_element [system] { header "__algorithm/minmax_element.h" } +module std_private_algorithm_mismatch [system] { + header "__algorithm/mismatch.h" + export std_private_algorithm_simd_utils + export std_private_iterator_aliasing_iterator +} +module std_private_algorithm_move [system] { header "__algorithm/move.h" } +module std_private_algorithm_move_backward [system] { header "__algorithm/move_backward.h" } +module std_private_algorithm_next_permutation [system] { header "__algorithm/next_permutation.h" } +module std_private_algorithm_none_of [system] { header "__algorithm/none_of.h" } +module std_private_algorithm_nth_element [system] { header "__algorithm/nth_element.h" } +module std_private_algorithm_partial_sort [system] { header "__algorithm/partial_sort.h" } +module 
std_private_algorithm_partial_sort_copy [system] { header "__algorithm/partial_sort_copy.h" } +module std_private_algorithm_partition [system] { header "__algorithm/partition.h" } +module std_private_algorithm_partition_copy [system] { header "__algorithm/partition_copy.h" } +module std_private_algorithm_partition_point [system] { header "__algorithm/partition_point.h" } +module std_private_algorithm_pop_heap [system] { header "__algorithm/pop_heap.h" } +module std_private_algorithm_prev_permutation [system] { header "__algorithm/prev_permutation.h" } +module std_private_algorithm_pstl [system] { + header "__algorithm/pstl.h" + export * +} +module std_private_algorithm_push_heap [system] { header "__algorithm/push_heap.h" } +module std_private_algorithm_ranges_adjacent_find [system] { header "__algorithm/ranges_adjacent_find.h" } +module std_private_algorithm_ranges_all_of [system] { header "__algorithm/ranges_all_of.h" } +module std_private_algorithm_ranges_any_of [system] { header "__algorithm/ranges_any_of.h" } +module std_private_algorithm_ranges_binary_search [system] { + header "__algorithm/ranges_binary_search.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_clamp [system] { + header "__algorithm/ranges_clamp.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_contains [system] { header "__algorithm/ranges_contains.h" } +module std_private_algorithm_ranges_contains_subrange [system] { header "__algorithm/ranges_contains_subrange.h" } +module std_private_algorithm_ranges_copy [system] { + header "__algorithm/ranges_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_copy_backward [system] { + header "__algorithm/ranges_copy_backward.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_copy_if [system] { + header "__algorithm/ranges_copy_if.h" + export std_private_algorithm_in_out_result +} +module 
std_private_algorithm_ranges_copy_n [system] { + header "__algorithm/ranges_copy_n.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_count [system] { header "__algorithm/ranges_count.h" } +module std_private_algorithm_ranges_count_if [system] { header "__algorithm/ranges_count_if.h" } +module std_private_algorithm_ranges_ends_with [system] { header "__algorithm/ranges_ends_with.h" } +module std_private_algorithm_ranges_equal [system] { header "__algorithm/ranges_equal.h" } +module std_private_algorithm_ranges_equal_range [system] { + header "__algorithm/ranges_equal_range.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_fill [system] { header "__algorithm/ranges_fill.h" } +module std_private_algorithm_ranges_fill_n [system] { header "__algorithm/ranges_fill_n.h" } +module std_private_algorithm_ranges_find [system] { header "__algorithm/ranges_find.h" } +module std_private_algorithm_ranges_find_end [system] { header "__algorithm/ranges_find_end.h" } +module std_private_algorithm_ranges_find_first_of [system] { header "__algorithm/ranges_find_first_of.h" } +module std_private_algorithm_ranges_find_if [system] { header "__algorithm/ranges_find_if.h" } +module std_private_algorithm_ranges_find_if_not [system] { header "__algorithm/ranges_find_if_not.h" } +module std_private_algorithm_ranges_find_last [system] { header "__algorithm/ranges_find_last.h" } +module std_private_algorithm_ranges_fold [system] { header "__algorithm/ranges_fold.h" } +module std_private_algorithm_ranges_for_each [system] { + header "__algorithm/ranges_for_each.h" + export std_private_algorithm_in_fun_result +} +module std_private_algorithm_ranges_for_each_n [system] { + header "__algorithm/ranges_for_each_n.h" + export std_private_algorithm_in_fun_result +} +module std_private_algorithm_ranges_generate [system] { header "__algorithm/ranges_generate.h" } +module std_private_algorithm_ranges_generate_n [system] { 
header "__algorithm/ranges_generate_n.h" } +module std_private_algorithm_ranges_includes [system] { + header "__algorithm/ranges_includes.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_inplace_merge [system] { + header "__algorithm/ranges_inplace_merge.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_is_heap [system] { + header "__algorithm/ranges_is_heap.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_is_heap_until [system] { + header "__algorithm/ranges_is_heap_until.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_is_partitioned [system] { header "__algorithm/ranges_is_partitioned.h" } +module std_private_algorithm_ranges_is_permutation [system] { header "__algorithm/ranges_is_permutation.h" } +module std_private_algorithm_ranges_is_sorted [system] { + header "__algorithm/ranges_is_sorted.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_is_sorted_until [system] { + header "__algorithm/ranges_is_sorted_until.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_iterator_concept [system] { header "__algorithm/ranges_iterator_concept.h" } +module std_private_algorithm_ranges_lexicographical_compare [system] { + header "__algorithm/ranges_lexicographical_compare.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_lower_bound [system] { + header "__algorithm/ranges_lower_bound.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_make_heap [system] { + header "__algorithm/ranges_make_heap.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_max [system] { + header "__algorithm/ranges_max.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_max_element 
[system] { + header "__algorithm/ranges_max_element.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_merge [system] { + header "__algorithm/ranges_merge.h" + export std_private_algorithm_in_in_out_result +} +module std_private_algorithm_ranges_min [system] { + header "__algorithm/ranges_min.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_min_element [system] { + header "__algorithm/ranges_min_element.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_minmax [system] { + header "__algorithm/ranges_minmax.h" + export std_private_functional_ranges_operations + export std_private_algorithm_min_max_result +} +module std_private_algorithm_ranges_minmax_element [system] { + header "__algorithm/ranges_minmax_element.h" + export std_private_functional_ranges_operations + export std_private_algorithm_min_max_result +} +module std_private_algorithm_ranges_mismatch [system] { + header "__algorithm/ranges_mismatch.h" + export std_private_algorithm_in_in_result +} +module std_private_algorithm_ranges_move [system] { + header "__algorithm/ranges_move.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_move_backward [system] { + header "__algorithm/ranges_move_backward.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_next_permutation [system] { + header "__algorithm/ranges_next_permutation.h" + export std_private_algorithm_in_found_result + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_none_of [system] { header "__algorithm/ranges_none_of.h" } +module std_private_algorithm_ranges_nth_element [system] { + header "__algorithm/ranges_nth_element.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_partial_sort [system] { + header "__algorithm/ranges_partial_sort.h" + export 
std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_partial_sort_copy [system] { + header "__algorithm/ranges_partial_sort_copy.h" + export std_private_algorithm_in_out_result + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_partition [system] { header "__algorithm/ranges_partition.h" } +module std_private_algorithm_ranges_partition_copy [system] { header "__algorithm/ranges_partition_copy.h" } +module std_private_algorithm_ranges_partition_point [system] { header "__algorithm/ranges_partition_point.h" } +module std_private_algorithm_ranges_pop_heap [system] { + header "__algorithm/ranges_pop_heap.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_prev_permutation [system] { + header "__algorithm/ranges_prev_permutation.h" + export std_private_algorithm_in_found_result + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_push_heap [system] { + header "__algorithm/ranges_push_heap.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_remove [system] { header "__algorithm/ranges_remove.h" } +module std_private_algorithm_ranges_remove_copy [system] { + header "__algorithm/ranges_remove_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_remove_copy_if [system] { + header "__algorithm/ranges_remove_copy_if.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_remove_if [system] { header "__algorithm/ranges_remove_if.h" } +module std_private_algorithm_ranges_replace [system] { header "__algorithm/ranges_replace.h" } +module std_private_algorithm_ranges_replace_copy [system] { + header "__algorithm/ranges_replace_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_replace_copy_if [system] { + header "__algorithm/ranges_replace_copy_if.h" + export 
std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_replace_if [system] { header "__algorithm/ranges_replace_if.h" } +module std_private_algorithm_ranges_reverse [system] { header "__algorithm/ranges_reverse.h" } +module std_private_algorithm_ranges_reverse_copy [system] { + header "__algorithm/ranges_reverse_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_rotate [system] { header "__algorithm/ranges_rotate.h" } +module std_private_algorithm_ranges_rotate_copy [system] { + header "__algorithm/ranges_rotate_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_sample [system] { header "__algorithm/ranges_sample.h" } +module std_private_algorithm_ranges_search [system] { header "__algorithm/ranges_search.h" } +module std_private_algorithm_ranges_search_n [system] { header "__algorithm/ranges_search_n.h" } +module std_private_algorithm_ranges_set_difference [system] { + header "__algorithm/ranges_set_difference.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_set_intersection [system] { + header "__algorithm/ranges_set_intersection.h" + export std_private_algorithm_in_in_out_result +} +module std_private_algorithm_ranges_set_symmetric_difference [system] { + header "__algorithm/ranges_set_symmetric_difference.h" + export std_private_algorithm_in_in_out_result + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_set_union [system] { + header "__algorithm/ranges_set_union.h" + export std_private_algorithm_in_in_out_result + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_shuffle [system] { header "__algorithm/ranges_shuffle.h" } +module std_private_algorithm_ranges_sort [system] { + header "__algorithm/ranges_sort.h" + export std_private_algorithm_make_projected + export std_private_functional_ranges_operations +} +module 
std_private_algorithm_ranges_sort_heap [system] { + header "__algorithm/ranges_sort_heap.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_stable_partition [system] { header "__algorithm/ranges_stable_partition.h" } +module std_private_algorithm_ranges_stable_sort [system] { + header "__algorithm/ranges_stable_sort.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_ranges_starts_with [system] { header "__algorithm/ranges_starts_with.h" } +module std_private_algorithm_ranges_swap_ranges [system] { + header "__algorithm/ranges_swap_ranges.h" + export std_private_algorithm_in_in_result +} +module std_private_algorithm_ranges_transform [system] { + header "__algorithm/ranges_transform.h" + export std_private_algorithm_in_in_out_result + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_unique [system] { header "__algorithm/ranges_unique.h" } +module std_private_algorithm_ranges_unique_copy [system] { + header "__algorithm/ranges_unique_copy.h" + export std_private_algorithm_in_out_result +} +module std_private_algorithm_ranges_upper_bound [system] { + header "__algorithm/ranges_upper_bound.h" + export std_private_functional_ranges_operations +} +module std_private_algorithm_remove [system] { header "__algorithm/remove.h" } +module std_private_algorithm_remove_copy [system] { header "__algorithm/remove_copy.h" } +module std_private_algorithm_remove_copy_if [system] { header "__algorithm/remove_copy_if.h" } +module std_private_algorithm_remove_if [system] { header "__algorithm/remove_if.h" } +module std_private_algorithm_replace [system] { header "__algorithm/replace.h" } +module std_private_algorithm_replace_copy [system] { header "__algorithm/replace_copy.h" } +module std_private_algorithm_replace_copy_if [system] { header "__algorithm/replace_copy_if.h" } +module std_private_algorithm_replace_if [system] { header "__algorithm/replace_if.h" } +module 
std_private_algorithm_reverse [system] { header "__algorithm/reverse.h" } +module std_private_algorithm_reverse_copy [system] { header "__algorithm/reverse_copy.h" } +module std_private_algorithm_rotate [system] { header "__algorithm/rotate.h" } +module std_private_algorithm_rotate_copy [system] { header "__algorithm/rotate_copy.h" } +module std_private_algorithm_sample [system] { header "__algorithm/sample.h" } +module std_private_algorithm_search [system] { header "__algorithm/search.h" } +module std_private_algorithm_search_n [system] { header "__algorithm/search_n.h" } +module std_private_algorithm_set_difference [system] { header "__algorithm/set_difference.h" } +module std_private_algorithm_set_intersection [system] { header "__algorithm/set_intersection.h" } +module std_private_algorithm_set_symmetric_difference [system] { header "__algorithm/set_symmetric_difference.h" } +module std_private_algorithm_set_union [system] { header "__algorithm/set_union.h" } +module std_private_algorithm_shift_left [system] { header "__algorithm/shift_left.h" } +module std_private_algorithm_shift_right [system] { header "__algorithm/shift_right.h" } +module std_private_algorithm_shuffle [system] { header "__algorithm/shuffle.h" } +module std_private_algorithm_sift_down [system] { header "__algorithm/sift_down.h" } +module std_private_algorithm_sort [system] { + header "__algorithm/sort.h" + export std_private_debug_utils_strict_weak_ordering_check +} +module std_private_algorithm_simd_utils [system] { header "__algorithm/simd_utils.h" } +module std_private_algorithm_sort_heap [system] { header "__algorithm/sort_heap.h" } +module std_private_algorithm_stable_partition [system] { header "__algorithm/stable_partition.h" } +module std_private_algorithm_stable_sort [system] { header "__algorithm/stable_sort.h" } +module std_private_algorithm_swap_ranges [system] { + header "__algorithm/swap_ranges.h" + export std_private_algorithm_iterator_operations +} +module 
std_private_algorithm_three_way_comp_ref_type [system] { header "__algorithm/three_way_comp_ref_type.h" } +module std_private_algorithm_transform [system] { header "__algorithm/transform.h" } +module std_private_algorithm_uniform_random_bit_generator_adaptor [system] { header "__algorithm/uniform_random_bit_generator_adaptor.h" } +module std_private_algorithm_unique [system] { header "__algorithm/unique.h" } +module std_private_algorithm_unique_copy [system] { header "__algorithm/unique_copy.h" } +module std_private_algorithm_unwrap_iter [system] { + header "__algorithm/unwrap_iter.h" + export std_private_iterator_iterator_traits +} +module std_private_algorithm_unwrap_range [system] { + header "__algorithm/unwrap_range.h" + export std_private_utility_pair +} +module std_private_algorithm_upper_bound [system] { header "__algorithm/upper_bound.h" } - // Miscellaneous modules for top-level headers - module bit_reference_fwd { - header "__fwd/bit_reference.h" - } - module bit_reference { - header "__bit_reference" - export std.bit_reference_fwd - } - module hash_table { header "__hash_table" } - module node_handle { header "__node_handle" } - module split_buffer { header "__split_buffer" } - module tree { header "__tree" } - module std_mbstate_t { - header "__std_mbstate_t.h" - export * - } - module verbose_abort { - header "__verbose_abort" - } - module internal_assert { - header "__assert" - export * - } +module std_private_array_array_fwd [system] { header "__fwd/array.h" } - module undef_macros { - textual header "__undef_macros" - } +module std_private_atomic_aliases [system] { + header "__atomic/aliases.h" + export std_private_atomic_atomic +} +module std_private_atomic_atomic [system] { + header "__atomic/atomic.h" + export std_private_atomic_atomic_base +} +module std_private_atomic_atomic_base [system] { header "__atomic/atomic_base.h" } +module std_private_atomic_atomic_flag [system] { + header "__atomic/atomic_flag.h" + export * +} +module 
std_private_atomic_atomic_init [system] { header "__atomic/atomic_init.h" } +module std_private_atomic_atomic_lock_free [system] { header "__atomic/atomic_lock_free.h" } +module std_private_atomic_atomic_ref [system] { header "__atomic/atomic_ref.h" } +module std_private_atomic_atomic_sync [system] { + header "__atomic/atomic_sync.h" + export std_private_atomic_to_gcc_order +} +module std_private_atomic_check_memory_order [system] { header "__atomic/check_memory_order.h" } +module std_private_atomic_contention_t [system] { header "__atomic/contention_t.h" } +module std_private_atomic_cxx_atomic_impl [system] { header "__atomic/cxx_atomic_impl.h" } +module std_private_atomic_fence [system] { header "__atomic/fence.h" } +module std_private_atomic_is_always_lock_free [system] { header "__atomic/is_always_lock_free.h" } +module std_private_atomic_kill_dependency [system] { header "__atomic/kill_dependency.h" } +module std_private_atomic_memory_order [system] { header "__atomic/memory_order.h" } +module std_private_atomic_to_gcc_order [system] { + header "__atomic/to_gcc_order.h" + export std_private_atomic_memory_order +} - // This module needs to appear after __tree to work around issues with modules in Objective-C++ mode. 
- module coroutine { - module coroutine_handle { header "__coroutine/coroutine_handle.h" } - module coroutine_traits { header "__coroutine/coroutine_traits.h" } - module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } - module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } +module std_private_bit_bit_cast [system] { header "__bit/bit_cast.h" } +module std_private_bit_bit_ceil [system] { header "__bit/bit_ceil.h" } +module std_private_bit_bit_floor [system] { header "__bit/bit_floor.h" } +module std_private_bit_bit_log2 [system] { header "__bit/bit_log2.h" } +module std_private_bit_bit_width [system] { header "__bit/bit_width.h" } +module std_private_bit_blsr [system] { header "__bit/blsr.h" } +module std_private_bit_byteswap [system] { header "__bit/byteswap.h" } +module std_private_bit_countl [system] { header "__bit/countl.h" } +module std_private_bit_countr [system] { header "__bit/countr.h" } +module std_private_bit_endian [system] { header "__bit/endian.h" } +module std_private_bit_has_single_bit [system] { header "__bit/has_single_bit.h" } +module std_private_bit_invert_if [system] { header "__bit/invert_if.h" } +module std_private_bit_popcount [system] { header "__bit/popcount.h" } +module std_private_bit_rotate [system] { header "__bit/rotate.h" } + +module std_private_chrono_calendar [system] { header "__chrono/calendar.h" } +module std_private_chrono_concepts [system] { header "__chrono/concepts.h" } +module std_private_chrono_convert_to_timespec [system] { header "__chrono/convert_to_timespec.h" } +module std_private_chrono_convert_to_tm [system] { header "__chrono/convert_to_tm.h" } +module std_private_chrono_day [system] { header "__chrono/day.h" } +module std_private_chrono_duration [system] { + header "__chrono/duration.h" + export std_private_type_traits_is_convertible +} +module std_private_chrono_exception [system] { header "__chrono/exception.h" } +module std_private_chrono_file_clock [system] { header 
"__chrono/file_clock.h" } +module std_private_chrono_formatter [system] { + header "__chrono/formatter.h" +} +module std_private_chrono_hh_mm_ss [system] { header "__chrono/hh_mm_ss.h" } +module std_private_chrono_high_resolution_clock [system] { + header "__chrono/high_resolution_clock.h" + export std_private_chrono_steady_clock + export std_private_chrono_system_clock +} +module std_private_chrono_leap_second [system] { header "__chrono/leap_second.h" } +module std_private_chrono_literals [system] { header "__chrono/literals.h" } +module std_private_chrono_local_info [system] { + header "__chrono/local_info.h" + export std_private_chrono_sys_info +} +module std_private_chrono_month [system] { header "__chrono/month.h" } +module std_private_chrono_month_weekday [system] { header "__chrono/month_weekday.h" } +module std_private_chrono_monthday [system] { header "__chrono/monthday.h" } +module std_private_chrono_ostream [system] { + header "__chrono/ostream.h" +} +module std_private_chrono_parser_std_format_spec [system] { + header "__chrono/parser_std_format_spec.h" +} +module std_private_chrono_statically_widen [system] { header "__chrono/statically_widen.h" } +module std_private_chrono_steady_clock [system] { + header "__chrono/steady_clock.h" + export std_private_chrono_time_point +} +module std_private_chrono_time_zone [system] { + header "__chrono/time_zone.h" + export std_private_memory_unique_ptr +} +module std_private_chrono_time_zone_link [system] { + header "__chrono/time_zone_link.h" +} +module std_private_chrono_sys_info [system] { + header "__chrono/sys_info.h" +} +module std_private_chrono_system_clock [system] { + header "__chrono/system_clock.h" + export std_private_chrono_time_point +} +module std_private_chrono_tzdb [system] { + header "__chrono/tzdb.h" + export * +} +module std_private_chrono_tzdb_list [system] { + header "__chrono/tzdb_list.h" + export * +} +module std_private_chrono_time_point [system] { header "__chrono/time_point.h" } +module 
std_private_chrono_weekday [system] { header "__chrono/weekday.h" } +module std_private_chrono_year [system] { header "__chrono/year.h" } +module std_private_chrono_year_month [system] { header "__chrono/year_month.h" } +module std_private_chrono_year_month_day [system] { header "__chrono/year_month_day.h" } +module std_private_chrono_year_month_weekday [system] { header "__chrono/year_month_weekday.h" } +module std_private_chrono_zoned_time [system] { header "__chrono/zoned_time.h" } + +module std_private_compare_common_comparison_category [system] { header "__compare/common_comparison_category.h" } +module std_private_compare_compare_partial_order_fallback [system] { header "__compare/compare_partial_order_fallback.h" } +module std_private_compare_compare_strong_order_fallback [system] { header "__compare/compare_strong_order_fallback.h" } +module std_private_compare_compare_three_way [system] { header "__compare/compare_three_way.h" } +module std_private_compare_compare_three_way_result [system] { header "__compare/compare_three_way_result.h" } +module std_private_compare_compare_weak_order_fallback [system] { header "__compare/compare_weak_order_fallback.h" } +module std_private_compare_is_eq [system] { header "__compare/is_eq.h" } +module std_private_compare_ordering [system] { header "__compare/ordering.h" } +module std_private_compare_partial_order [system] { header "__compare/partial_order.h" } +module std_private_compare_strong_order [system] { header "__compare/strong_order.h" } +module std_private_compare_synth_three_way [system] { header "__compare/synth_three_way.h" } +module std_private_compare_three_way_comparable [system] { header "__compare/three_way_comparable.h" } +module std_private_compare_weak_order [system] { header "__compare/weak_order.h" } + +module std_private_complex_complex_fwd [system] { header "__fwd/complex.h" } + +module std_private_concepts_arithmetic [system] { header "__concepts/arithmetic.h" } +module 
std_private_concepts_assignable [system] { header "__concepts/assignable.h" } +module std_private_concepts_boolean_testable [system] { header "__concepts/boolean_testable.h" } +module std_private_concepts_class_or_enum [system] { header "__concepts/class_or_enum.h" } +module std_private_concepts_common_reference_with [system] { header "__concepts/common_reference_with.h" } +module std_private_concepts_common_with [system] { header "__concepts/common_with.h" } +module std_private_concepts_constructible [system] { + header "__concepts/constructible.h" + export std_private_concepts_destructible +} +module std_private_concepts_convertible_to [system] { header "__concepts/convertible_to.h" } +module std_private_concepts_copyable [system] { header "__concepts/copyable.h" } +module std_private_concepts_derived_from [system] { header "__concepts/derived_from.h" } +module std_private_concepts_destructible [system] { + header "__concepts/destructible.h" + export std_private_type_traits_is_nothrow_destructible +} +module std_private_concepts_different_from [system] { header "__concepts/different_from.h" } +module std_private_concepts_equality_comparable [system] { + header "__concepts/equality_comparable.h" + export std_private_type_traits_common_reference +} +module std_private_concepts_invocable [system] { header "__concepts/invocable.h" } +module std_private_concepts_movable [system] { + header "__concepts/movable.h" + export std_private_type_traits_is_object +} +module std_private_concepts_predicate [system] { header "__concepts/predicate.h" } +module std_private_concepts_regular [system] { header "__concepts/regular.h" } +module std_private_concepts_relation [system] { header "__concepts/relation.h" } +module std_private_concepts_same_as [system] { + header "__concepts/same_as.h" + export std_private_type_traits_is_same +} +module std_private_concepts_semiregular [system] { header "__concepts/semiregular.h" } +module std_private_concepts_swappable [system] { header 
"__concepts/swappable.h" } +module std_private_concepts_totally_ordered [system] { header "__concepts/totally_ordered.h" } + +module std_private_debug_utils_randomize_range [system] { header "__debug_utils/randomize_range.h" } +module std_private_debug_utils_sanitizers [system] { header "__debug_utils/sanitizers.h" } +module std_private_debug_utils_strict_weak_ordering_check [system] { + header "__debug_utils/strict_weak_ordering_check.h" + export std_private_type_traits_is_constant_evaluated +} - header "coroutine" - export * - } -} // module std +module std_private_deque_fwd [system] { header "__fwd/deque.h" } -// C compatibility headers -// -// These modules need to be their own top-level modules because they depend on the system-provided -// headers (via include_next), which are then free to include other C headers provided by libc++. -// If we group these headers in a single module, we would end up with circular dependencies. -module std_complex_h [system] { - header "complex.h" - export * +module std_private_exception_exception [system] { header "__exception/exception.h" } +module std_private_exception_exception_ptr [system] { + header "__exception/exception_ptr.h" + export std_private_exception_operations } -module std_ctype_h [system] { - header "ctype.h" +module std_private_exception_nested_exception [system] { header "__exception/nested_exception.h" } +module std_private_exception_operations [system] { header "__exception/operations.h" } +module std_private_exception_terminate [system] { header "__exception/terminate.h" } + +module std_private_expected_bad_expected_access [system] { header "__expected/bad_expected_access.h" } +module std_private_expected_expected [system] { header "__expected/expected.h" } +module std_private_expected_unexpect [system] { header "__expected/unexpect.h" } +module std_private_expected_unexpected [system] { header "__expected/unexpected.h" } + +module std_private_format_buffer [system] { header "__format/buffer.h" } +module 
std_private_format_concepts [system] { header "__format/concepts.h" } +module std_private_format_container_adaptor [system] { header "__format/container_adaptor.h" } +module std_private_format_enable_insertable [system] { header "__format/enable_insertable.h" } +module std_private_format_escaped_output_table [system] { header "__format/escaped_output_table.h" } +module std_private_format_extended_grapheme_cluster_table [system] { header "__format/extended_grapheme_cluster_table.h" } +module std_private_format_format_arg [system] { header "__format/format_arg.h" } +module std_private_format_format_arg_store [system] { header "__format/format_arg_store.h" } +module std_private_format_format_args [system] { header "__format/format_args.h" } +module std_private_format_format_context [system] { + header "__format/format_context.h" export * } -module std_errno_h [system] { - header "errno.h" +module std_private_format_format_error [system] { header "__format/format_error.h" } +module std_private_format_format_functions [system] { + header "__format/format_functions.h" + export std_string +} +module std_private_format_fwd [system] { header "__fwd/format.h" } +module std_private_format_format_parse_context [system] { header "__format/format_parse_context.h" } +module std_private_format_format_string [system] { header "__format/format_string.h" } +module std_private_format_format_to_n_result [system] { + header "__format/format_to_n_result.h" + export std_private_iterator_incrementable_traits +} +module std_private_format_formatter [system] { header "__format/formatter.h" } +module std_private_format_formatter_bool [system] { header "__format/formatter_bool.h" } +module std_private_format_formatter_char [system] { header "__format/formatter_char.h" } +module std_private_format_formatter_floating_point [system] { header "__format/formatter_floating_point.h" } +module std_private_format_formatter_integer [system] { header "__format/formatter_integer.h" } +module 
std_private_format_formatter_integral [system] { header "__format/formatter_integral.h" } +module std_private_format_formatter_output [system] { header "__format/formatter_output.h" } +module std_private_format_formatter_pointer [system] { header "__format/formatter_pointer.h" } +module std_private_format_formatter_string [system] { header "__format/formatter_string.h" } +module std_private_format_formatter_tuple [system] { header "__format/formatter_tuple.h" } +module std_private_format_indic_conjunct_break_table [system] { header "__format/indic_conjunct_break_table.h" } +module std_private_format_parser_std_format_spec [system] { header "__format/parser_std_format_spec.h" } +module std_private_format_range_default_formatter [system] { header "__format/range_default_formatter.h" } +module std_private_format_range_formatter [system] { header "__format/range_formatter.h" } +module std_private_format_unicode [system] { + header "__format/unicode.h" + export std_private_format_extended_grapheme_cluster_table + export std_private_format_indic_conjunct_break_table +} +module std_private_format_width_estimation_table [system] { header "__format/width_estimation_table.h" } +module std_private_format_write_escaped [system] { header "__format/write_escaped.h" } + +module std_private_functional_binary_function [system] { header "__functional/binary_function.h" } +module std_private_functional_binary_negate [system] { header "__functional/binary_negate.h" } +module std_private_functional_bind [system] { header "__functional/bind.h" } +module std_private_functional_bind_back [system] { header "__functional/bind_back.h" } +module std_private_functional_bind_front [system] { header "__functional/bind_front.h" } +module std_private_functional_binder1st [system] { header "__functional/binder1st.h" } +module std_private_functional_binder2nd [system] { header "__functional/binder2nd.h" } +module std_private_functional_boyer_moore_searcher [system] { + header 
"__functional/boyer_moore_searcher.h" + export std_private_memory_shared_ptr +} +module std_private_functional_compose [system] { + header "__functional/compose.h" + export std_private_functional_perfect_forward +} +module std_private_functional_default_searcher [system] { header "__functional/default_searcher.h" } +module std_private_functional_function [system] { header "__functional/function.h" } +module std_private_functional_hash [system] { + header "__functional/hash.h" + export std_cstdint + export std_private_type_traits_underlying_type + export std_private_utility_pair +} +module std_private_functional_fwd [system] { header "__fwd/functional.h" } +module std_private_functional_identity [system] { header "__functional/identity.h" } +module std_private_functional_invoke [system] { + header "__functional/invoke.h" export * } -module std_fenv_h [system] { - header "fenv.h" +module std_private_functional_is_transparent [system] { header "__functional/is_transparent.h" } +module std_private_functional_mem_fn [system] { header "__functional/mem_fn.h" } +module std_private_functional_mem_fun_ref [system] { header "__functional/mem_fun_ref.h" } +module std_private_functional_not_fn [system] { + header "__functional/not_fn.h" + export std_private_functional_perfect_forward +} +module std_private_functional_operations [system] { header "__functional/operations.h" } +module std_private_functional_perfect_forward [system] { + header "__functional/perfect_forward.h" export * } -module std_float_h [system] { - header "float.h" +module std_private_functional_pointer_to_binary_function [system] { header "__functional/pointer_to_binary_function.h" } +module std_private_functional_pointer_to_unary_function [system] { header "__functional/pointer_to_unary_function.h" } +module std_private_functional_ranges_operations [system] { header "__functional/ranges_operations.h" } +module std_private_functional_reference_wrapper [system] { header "__functional/reference_wrapper.h" } 
+module std_private_functional_unary_function [system] { header "__functional/unary_function.h" } +module std_private_functional_unary_negate [system] { header "__functional/unary_negate.h" } +module std_private_functional_weak_result_type [system] { header "__functional/weak_result_type.h" } + +module std_private_ios_fpos [system] { header "__ios/fpos.h" } + +module std_private_iosfwd_fstream_fwd [system] { header "__fwd/fstream.h" } +module std_private_iosfwd_ios_fwd [system] { header "__fwd/ios.h" } +module std_private_iosfwd_istream_fwd [system] { header "__fwd/istream.h" } +module std_private_iosfwd_ostream_fwd [system] { header "__fwd/ostream.h" } +module std_private_iosfwd_sstream_fwd [system] { header "__fwd/sstream.h" } +module std_private_iosfwd_streambuf_fwd [system] { header "__fwd/streambuf.h" } + +module std_private_iterator_access [system] { header "__iterator/access.h" } +module std_private_iterator_advance [system] { header "__iterator/advance.h" } +module std_private_iterator_aliasing_iterator [system] { header "__iterator/aliasing_iterator.h" } +module std_private_iterator_back_insert_iterator [system] { header "__iterator/back_insert_iterator.h" } +module std_private_iterator_bounded_iter [system] { header "__iterator/bounded_iter.h" } +module std_private_iterator_common_iterator [system] { header "__iterator/common_iterator.h" } +module std_private_iterator_concepts [system] { + header "__iterator/concepts.h" + export std_private_concepts_constructible + export std_private_concepts_equality_comparable + export std_private_concepts_movable + export std_private_type_traits_common_reference + export std_private_type_traits_is_reference + export std_private_type_traits_remove_cvref +} +module std_private_iterator_counted_iterator [system] { header "__iterator/counted_iterator.h" } +module std_private_iterator_cpp17_iterator_concepts [system] { header "__iterator/cpp17_iterator_concepts.h" } +module std_private_iterator_data [system] { header 
"__iterator/data.h" } +module std_private_iterator_default_sentinel [system] { header "__iterator/default_sentinel.h" } +module std_private_iterator_distance [system] { + header "__iterator/distance.h" + export std_private_ranges_size +} +module std_private_iterator_empty [system] { header "__iterator/empty.h" } +module std_private_iterator_erase_if_container [system] { header "__iterator/erase_if_container.h" } +module std_private_iterator_front_insert_iterator [system] { header "__iterator/front_insert_iterator.h" } +module std_private_iterator_incrementable_traits [system] { header "__iterator/incrementable_traits.h" } +module std_private_iterator_indirectly_comparable [system] { header "__iterator/indirectly_comparable.h" } +module std_private_iterator_insert_iterator [system] { header "__iterator/insert_iterator.h" } +module std_private_iterator_istream_iterator [system] { header "__iterator/istream_iterator.h" } +module std_private_iterator_istreambuf_iterator [system] { header "__iterator/istreambuf_iterator.h" } +module std_private_iterator_iter_move [system] { header "__iterator/iter_move.h" } +module std_private_iterator_iter_swap [system] { header "__iterator/iter_swap.h" } +module std_private_iterator_iterator [system] { header "__iterator/iterator.h" } +module std_private_iterator_iterator_traits [system] { + header "__iterator/iterator_traits.h" + export std_private_type_traits_is_primary_template + export std_private_type_traits_integral_constant +} +module std_private_iterator_iterator_with_data [system] { header "__iterator/iterator_with_data.h" } +module std_private_iterator_mergeable [system] { + header "__iterator/mergeable.h" + export std_private_functional_ranges_operations +} +module std_private_iterator_move_iterator [system] { header "__iterator/move_iterator.h" } +module std_private_iterator_move_sentinel [system] { header "__iterator/move_sentinel.h" } +module std_private_iterator_next [system] { header "__iterator/next.h" } +module 
std_private_iterator_ostream_iterator [system] { header "__iterator/ostream_iterator.h" } +module std_private_iterator_ostreambuf_iterator [system] { + header "__iterator/ostreambuf_iterator.h" export * } -module std_inttypes_h [system] { - header "inttypes.h" +module std_private_iterator_permutable [system] { header "__iterator/permutable.h" } +module std_private_iterator_prev [system] { header "__iterator/prev.h" } +module std_private_iterator_projected [system] { header "__iterator/projected.h" } +module std_private_iterator_ranges_iterator_traits [system] { header "__iterator/ranges_iterator_traits.h" } +module std_private_iterator_readable_traits [system] { header "__iterator/readable_traits.h" } +module std_private_iterator_reverse_access [system] { header "__iterator/reverse_access.h" } +module std_private_iterator_reverse_iterator [system] { header "__iterator/reverse_iterator.h" } +module std_private_iterator_segmented_iterator [system] { header "__iterator/segmented_iterator.h" } +module std_private_iterator_size [system] { header "__iterator/size.h" } +module std_private_iterator_sortable [system] { + header "__iterator/sortable.h" + export std_private_functional_ranges_operations +} +module std_private_iterator_unreachable_sentinel [system] { header "__iterator/unreachable_sentinel.h" } +module std_private_iterator_wrap_iter [system] { header "__iterator/wrap_iter.h" } + +module std_private_locale_locale_base_api_android [system] { textual header "__locale_dir/locale_base_api/android.h" } +module std_private_locale_locale_base_api_bsd_locale_defaults [system] { textual header "__locale_dir/locale_base_api/bsd_locale_defaults.h" } +module std_private_locale_locale_base_api_bsd_locale_fallbacks [system] { textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" } +module std_private_locale_locale_base_api_fuchsia [system] { textual header "__locale_dir/locale_base_api/fuchsia.h" } +module std_private_locale_locale_base_api_ibm [system] { 
textual header "__locale_dir/locale_base_api/ibm.h" } +module std_private_locale_locale_base_api_locale_guard [system] { header "__locale_dir/locale_base_api/locale_guard.h" } +module std_private_locale_locale_base_api_musl [system] { textual header "__locale_dir/locale_base_api/musl.h" } +module std_private_locale_locale_base_api_newlib [system] { textual header "__locale_dir/locale_base_api/newlib.h" } +module std_private_locale_locale_base_api_openbsd [system] { textual header "__locale_dir/locale_base_api/openbsd.h" } +module std_private_locale_locale_base_api_win32 [system] { textual header "__locale_dir/locale_base_api/win32.h" } +module std_private_locale_locale_base_api [system] { + header "__locale_dir/locale_base_api.h" export * } -module std_locale_h [system] { - header "locale.h" + +module std_private_math_abs [system] { header "__math/abs.h" } +module std_private_math_copysign [system] { header "__math/copysign.h" } +module std_private_math_error_functions [system] { header "__math/error_functions.h" } +module std_private_math_exponential_functions [system] { header "__math/exponential_functions.h" } +module std_private_math_fdim [system] { header "__math/fdim.h" } +module std_private_math_fma [system] { header "__math/fma.h" } +module std_private_math_gamma [system] { header "__math/gamma.h" } +module std_private_math_hyperbolic_functions [system] { header "__math/hyperbolic_functions.h" } +module std_private_math_hypot [system] { header "__math/hypot.h" } +module std_private_math_inverse_hyperbolic_functions [system] { header "__math/inverse_hyperbolic_functions.h" } +module std_private_math_inverse_trigonometric_functions [system] { header "__math/inverse_trigonometric_functions.h" } +module std_private_math_logarithms [system] { header "__math/logarithms.h" } +module std_private_math_min_max [system] { header "__math/min_max.h" } +module std_private_math_modulo [system] { header "__math/modulo.h" } +module std_private_math_remainder [system] { 
header "__math/remainder.h" } +module std_private_math_roots [system] { header "__math/roots.h" } +module std_private_math_rounding_functions [system] { header "__math/rounding_functions.h" } +module std_private_math_special_functions [system] { header "__math/special_functions.h" } +module std_private_math_traits [system] { header "__math/traits.h" } +module std_private_math_trigonometric_functions [system] { header "__math/trigonometric_functions.h" } + +module std_private_memory_addressof [system] { header "__memory/addressof.h" } +module std_private_memory_align [system] { header "__memory/align.h" } +module std_private_memory_aligned_alloc [system] { header "__memory/aligned_alloc.h" } +module std_private_memory_allocate_at_least [system] { header "__memory/allocate_at_least.h" } +module std_private_memory_allocation_guard [system] { header "__memory/allocation_guard.h" } +module std_private_memory_allocator [system] { header "__memory/allocator.h" } +module std_private_memory_allocator_arg_t [system] { header "__memory/allocator_arg_t.h" } +module std_private_memory_allocator_destructor [system] { header "__memory/allocator_destructor.h" } +module std_private_memory_allocator_traits [system] { header "__memory/allocator_traits.h" } +module std_private_memory_array_cookie [system] { header "__memory/array_cookie.h" } +module std_private_memory_assume_aligned [system] { header "__memory/assume_aligned.h" } +module std_private_memory_auto_ptr [system] { header "__memory/auto_ptr.h" } +module std_private_memory_builtin_new_allocator [system] { + header "__memory/builtin_new_allocator.h" export * } -module std_math_h [system] { - header "math.h" +module std_private_memory_compressed_pair [system] { header "__memory/compressed_pair.h" } +module std_private_memory_concepts [system] { + header "__memory/concepts.h" + export std_private_type_traits_remove_reference +} +module std_private_memory_construct_at [system] { header "__memory/construct_at.h" } +module 
std_private_memory_destruct_n [system] { header "__memory/destruct_n.h" } +module std_private_memory_fwd [system] { header "__fwd/memory.h" } +module std_private_memory_inout_ptr [system] { header "__memory/inout_ptr.h" } +module std_private_memory_noexcept_move_assign_container [system] { header "__memory/noexcept_move_assign_container.h" } +module std_private_memory_out_ptr [system] { header "__memory/out_ptr.h" } +module std_private_memory_pointer_traits [system] { header "__memory/pointer_traits.h" } +module std_private_memory_ranges_construct_at [system] { header "__memory/ranges_construct_at.h" } +module std_private_memory_ranges_uninitialized_algorithms [system] { + header "__memory/ranges_uninitialized_algorithms.h" + export std_private_algorithm_in_out_result +} +module std_private_memory_raw_storage_iterator [system] { header "__memory/raw_storage_iterator.h" } +module std_private_memory_shared_ptr [system] { + header "__memory/shared_ptr.h" + export std_private_memory_uninitialized_algorithms +} +module std_private_memory_swap_allocator [system] { header "__memory/swap_allocator.h" } +module std_private_memory_temp_value [system] { header "__memory/temp_value.h" } +module std_private_memory_temporary_buffer [system] { + header "__memory/temporary_buffer.h" + export std_private_utility_pair +} +module std_private_memory_uninitialized_algorithms [system] { + header "__memory/uninitialized_algorithms.h" + export std_private_algorithm_copy +} +module std_private_memory_unique_ptr [system] { + header "__memory/unique_ptr.h" + export std_private_type_traits_add_lvalue_reference + export std_private_type_traits_is_pointer + export std_private_type_traits_type_identity +} +module std_private_memory_unique_temporary_buffer [system] { + header "__memory/unique_temporary_buffer.h" + export std_private_memory_unique_ptr + export std_private_type_traits_is_constant_evaluated +} +module std_private_memory_uses_allocator [system] { header "__memory/uses_allocator.h" } 
+module std_private_memory_uses_allocator_construction [system] { header "__memory/uses_allocator_construction.h" } +module std_private_memory_voidify [system] { header "__memory/voidify.h" } + +module std_private_memory_resource_memory_resource [system] { header "__memory_resource/memory_resource.h" } +module std_private_memory_resource_memory_resource_fwd [system] { header "__fwd/memory_resource.h" } +module std_private_memory_resource_monotonic_buffer_resource [system] { header "__memory_resource/monotonic_buffer_resource.h" } +module std_private_memory_resource_polymorphic_allocator [system] { header "__memory_resource/polymorphic_allocator.h" } +module std_private_memory_resource_pool_options [system] { header "__memory_resource/pool_options.h" } +module std_private_memory_resource_synchronized_pool_resource [system] { + header "__memory_resource/synchronized_pool_resource.h" export * } -module std_stdatomic_h [system] { - header "stdatomic.h" +module std_private_memory_resource_unsynchronized_pool_resource [system] { header "__memory_resource/unsynchronized_pool_resource.h" } + +module std_private_mutex_lock_guard [system] { header "__mutex/lock_guard.h" } +module std_private_mutex_mutex [system] { header "__mutex/mutex.h" } +module std_private_mutex_once_flag [system] { header "__mutex/once_flag.h" } +module std_private_mutex_tag_types [system] { header "__mutex/tag_types.h" } +module std_private_mutex_unique_lock [system] { header "__mutex/unique_lock.h" } + +module std_private_numeric_accumulate [system] { header "__numeric/accumulate.h" } +module std_private_numeric_adjacent_difference [system] { header "__numeric/adjacent_difference.h" } +module std_private_numeric_exclusive_scan [system] { header "__numeric/exclusive_scan.h" } +module std_private_numeric_gcd_lcm [system] { header "__numeric/gcd_lcm.h" } +module std_private_numeric_inclusive_scan [system] { header "__numeric/inclusive_scan.h" } +module std_private_numeric_inner_product [system] { header 
"__numeric/inner_product.h" } +module std_private_numeric_iota [system] { header "__numeric/iota.h" } +module std_private_numeric_midpoint [system] { header "__numeric/midpoint.h" } +module std_private_numeric_partial_sum [system] { header "__numeric/partial_sum.h" } +module std_private_numeric_pstl [system] { + header "__numeric/pstl.h" export * } -module std_stdbool_h [system] { - // 's __bool_true_false_are_defined macro requires textual inclusion. - textual header "stdbool.h" +module std_private_numeric_reduce [system] { header "__numeric/reduce.h" } +module std_private_numeric_saturation_arithmetic [system] { header "__numeric/saturation_arithmetic.h" } +module std_private_numeric_transform_exclusive_scan [system] { header "__numeric/transform_exclusive_scan.h" } +module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } +module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } + +module std_private_pstl [system] { + header "__pstl/backend.h" + header "__pstl/backend_fwd.h" + header "__pstl/backends/default.h" + header "__pstl/backends/libdispatch.h" + header "__pstl/backends/serial.h" + header "__pstl/backends/std_thread.h" + header "__pstl/cpu_algos/any_of.h" + header "__pstl/cpu_algos/cpu_traits.h" + header "__pstl/cpu_algos/fill.h" + header "__pstl/cpu_algos/find_if.h" + header "__pstl/cpu_algos/for_each.h" + header "__pstl/cpu_algos/merge.h" + header "__pstl/cpu_algos/stable_sort.h" + header "__pstl/cpu_algos/transform.h" + header "__pstl/cpu_algos/transform_reduce.h" + header "__pstl/dispatch.h" + header "__pstl/handle_exception.h" } -module std_stddef_h [system] { - // 's __need_* macros require textual inclusion. 
- textual header "stddef.h" + +module std_private_queue_fwd [system] { header "__fwd/queue.h" } + +module std_private_ostream_basic_ostream [system] { + header "__ostream/basic_ostream.h" + export std_streambuf } -module std_stdint_h [system] { - header "stdint.h" +module std_private_ostream_print [system] { + header "__ostream/print.h" + export std_print +} + +module std_private_random_bernoulli_distribution [system] { header "__random/bernoulli_distribution.h" } +module std_private_random_binomial_distribution [system] { header "__random/binomial_distribution.h" } +module std_private_random_cauchy_distribution [system] { header "__random/cauchy_distribution.h" } +module std_private_random_chi_squared_distribution [system] { header "__random/chi_squared_distribution.h" } +module std_private_random_clamp_to_integral [system] { header "__random/clamp_to_integral.h" } +module std_private_random_default_random_engine [system] { header "__random/default_random_engine.h" } +module std_private_random_discard_block_engine [system] { header "__random/discard_block_engine.h" } +module std_private_random_discrete_distribution [system] { + header "__random/discrete_distribution.h" export * } -module std_stdio_h [system] { - // 's __need_* macros require textual inclusion. 
- textual header "stdio.h" +module std_private_random_exponential_distribution [system] { header "__random/exponential_distribution.h" } +module std_private_random_extreme_value_distribution [system] { header "__random/extreme_value_distribution.h" } +module std_private_random_fisher_f_distribution [system] { header "__random/fisher_f_distribution.h" } +module std_private_random_gamma_distribution [system] { header "__random/gamma_distribution.h" } +module std_private_random_generate_canonical [system] { header "__random/generate_canonical.h" } +module std_private_random_geometric_distribution [system] { header "__random/geometric_distribution.h" } +module std_private_random_independent_bits_engine [system] { header "__random/independent_bits_engine.h" } +module std_private_random_is_seed_sequence [system] { header "__random/is_seed_sequence.h" } +module std_private_random_is_valid [system] { header "__random/is_valid.h" } +module std_private_random_knuth_b [system] { header "__random/knuth_b.h" } +module std_private_random_linear_congruential_engine [system] { header "__random/linear_congruential_engine.h" } +module std_private_random_log2 [system] { header "__random/log2.h" } +module std_private_random_lognormal_distribution [system] { header "__random/lognormal_distribution.h" } +module std_private_random_mersenne_twister_engine [system] { header "__random/mersenne_twister_engine.h" } +module std_private_random_negative_binomial_distribution [system] { header "__random/negative_binomial_distribution.h" } +module std_private_random_normal_distribution [system] { header "__random/normal_distribution.h" } +module std_private_random_piecewise_constant_distribution [system] { + header "__random/piecewise_constant_distribution.h" + export * } -module std_stdlib_h [system] { - // 's __need_* macros require textual inclusion. 
- textual header "stdlib.h" +module std_private_random_piecewise_linear_distribution [system] { + header "__random/piecewise_linear_distribution.h" + export * } -module std_string_h [system] { - header "string.h" +module std_private_random_poisson_distribution [system] { header "__random/poisson_distribution.h" } +module std_private_random_random_device [system] { + header "__random/random_device.h" export * } -module std_tgmath_h [system] { - header "tgmath.h" +module std_private_random_ranlux [system] { header "__random/ranlux.h" } +module std_private_random_seed_seq [system] { + header "__random/seed_seq.h" export * } -module std_uchar_h [system] { - header "uchar.h" +module std_private_random_shuffle_order_engine [system] { header "__random/shuffle_order_engine.h" } +module std_private_random_student_t_distribution [system] { header "__random/student_t_distribution.h" } +module std_private_random_subtract_with_carry_engine [system] { header "__random/subtract_with_carry_engine.h" } +module std_private_random_uniform_int_distribution [system] { header "__random/uniform_int_distribution.h" } +module std_private_random_uniform_random_bit_generator [system] { header "__random/uniform_random_bit_generator.h" } +module std_private_random_uniform_real_distribution [system] { header "__random/uniform_real_distribution.h" } +module std_private_random_weibull_distribution [system] { header "__random/weibull_distribution.h" } + +module std_private_ranges_access [system] { header "__ranges/access.h" } +module std_private_ranges_all [system] { + header "__ranges/all.h" + export std_private_functional_compose + export std_private_functional_perfect_forward + export std_private_ranges_owning_view +} +module std_private_ranges_as_rvalue_view [system] { header "__ranges/as_rvalue_view.h" } +module std_private_ranges_chunk_by_view [system] { header "__ranges/chunk_by_view.h" } +module std_private_ranges_common_view [system] { header "__ranges/common_view.h" } +module 
std_private_ranges_concepts [system] { + header "__ranges/concepts.h" + export std_private_iterator_concepts +} +module std_private_ranges_container_compatible_range [system] { header "__ranges/container_compatible_range.h" } +module std_private_ranges_counted [system] { + header "__ranges/counted.h" + export std_span +} +module std_private_ranges_dangling [system] { header "__ranges/dangling.h" } +module std_private_ranges_data [system] { header "__ranges/data.h" } +module std_private_ranges_drop_view [system] { header "__ranges/drop_view.h" } +module std_private_ranges_drop_while_view [system] { header "__ranges/drop_while_view.h" } +module std_private_ranges_elements_view [system] { header "__ranges/elements_view.h" } +module std_private_ranges_empty [system] { header "__ranges/empty.h" } +module std_private_ranges_empty_view [system] { header "__ranges/empty_view.h" } +module std_private_ranges_enable_borrowed_range [system] { header "__ranges/enable_borrowed_range.h" } +module std_private_ranges_enable_view [system] { header "__ranges/enable_view.h" } +module std_private_ranges_filter_view [system] { + header "__ranges/filter_view.h" + export std_private_ranges_range_adaptor +} +module std_private_ranges_from_range [system] { header "__ranges/from_range.h" } +module std_private_ranges_iota_view [system] { header "__ranges/iota_view.h" } +module std_private_ranges_istream_view [system] { + header "__ranges/istream_view.h" +} +module std_private_ranges_join_view [system] { + header "__ranges/join_view.h" + export std_private_iterator_iterator_with_data + export std_private_iterator_segmented_iterator +} +module std_private_ranges_lazy_split_view [system] { + header "__ranges/lazy_split_view.h" + export std_private_ranges_non_propagating_cache +} +module std_private_ranges_movable_box [system] { header "__ranges/movable_box.h" } +module std_private_ranges_non_propagating_cache [system] { header "__ranges/non_propagating_cache.h" } +module 
std_private_ranges_owning_view [system] { header "__ranges/owning_view.h" } +module std_private_ranges_range_adaptor [system] { header "__ranges/range_adaptor.h" } +module std_private_ranges_rbegin [system] { header "__ranges/rbegin.h" } +module std_private_ranges_ref_view [system] { header "__ranges/ref_view.h" } +module std_private_ranges_rend [system] { header "__ranges/rend.h" } +module std_private_ranges_repeat_view [system] { header "__ranges/repeat_view.h" } +module std_private_ranges_reverse_view [system] { header "__ranges/reverse_view.h" } +module std_private_ranges_single_view [system] { header "__ranges/single_view.h" } +module std_private_ranges_size [system] { + header "__ranges/size.h" + export std_private_type_traits_make_unsigned +} +module std_private_ranges_split_view [system] { header "__ranges/split_view.h" } +module std_private_ranges_subrange [system] { + header "__ranges/subrange.h" + export std_private_ranges_subrange_fwd +} +module std_private_ranges_subrange_fwd [system] { + header "__fwd/subrange.h" + export std_private_iterator_concepts +} +module std_private_ranges_take_view [system] { header "__ranges/take_view.h" } +module std_private_ranges_take_while_view [system] { header "__ranges/take_while_view.h" } +module std_private_ranges_to [system] { header "__ranges/to.h" } +module std_private_ranges_transform_view [system] { + header "__ranges/transform_view.h" + export std_private_functional_bind_back + export std_private_functional_perfect_forward + export std_private_ranges_movable_box +} +module std_private_ranges_view_interface [system] { header "__ranges/view_interface.h" } +module std_private_ranges_views [system] { header "__ranges/views.h" } +module std_private_ranges_zip_view [system] { + header "__ranges/zip_view.h" + export std_private_utility_pair +} + +module std_private_span_span_fwd [system] { header "__fwd/span.h" } + +module std_private_stack_fwd [system] { header "__fwd/stack.h" } + +module 
std_private_string_char_traits [system] { + header "__string/char_traits.h" export * } -module std_wchar_h [system] { - // 's __need_* macros require textual inclusion. - textual header "wchar.h" +module std_private_string_constexpr_c_functions [system] { + header "__string/constexpr_c_functions.h" + export std_private_type_traits_is_equality_comparable } -module std_wctype_h [system] { - header "wctype.h" +module std_private_string_extern_template_lists [system] { header "__string/extern_template_lists.h" } +module std_private_string_string_fwd [system] { header "__fwd/string.h" } + +module std_private_string_view_string_view_fwd [system] { header "__fwd/string_view.h" } + +module std_private_system_error_errc [system] { header "__system_error/errc.h" } +module std_private_system_error_error_category [system] { header "__system_error/error_category.h" } +module std_private_system_error_error_code [system] { + header "__system_error/error_code.h" + export std_private_functional_hash + export std_private_functional_unary_function +} +module std_private_system_error_error_condition [system] { + header "__system_error/error_condition.h" + export std_private_functional_hash + export std_private_functional_unary_function +} +module std_private_system_error_system_error [system] { header "__system_error/system_error.h" } + +module std_private_thread_formatter [system] { header "__thread/formatter.h" } +module std_private_thread_id [system] { header "__thread/id.h" } +module std_private_thread_jthread [system] { + header "__thread/jthread.h" + export * +} +module std_private_thread_poll_with_backoff [system] { header "__thread/poll_with_backoff.h" } +module std_private_thread_support [system] { + header "__thread/support.h" export * } +module std_private_thread_support_c11 [system] { textual header "__thread/support/c11.h" } +module std_private_thread_support_external [system] { textual header "__thread/support/external.h" } +module std_private_thread_support_pthread 
[system] { textual header "__thread/support/pthread.h" } +module std_private_thread_support_windows [system] { textual header "__thread/support/windows.h" } +module std_private_thread_this_thread [system] { header "__thread/this_thread.h" } +module std_private_thread_thread [system] { + header "__thread/thread.h" + export * +} +module std_private_thread_timed_backoff_policy [system] { header "__thread/timed_backoff_policy.h" } -// This header is used by other C compatibility headers so it needs to be in its own module. -module std_private_mbstate_t [system] { - header "__mbstate_t.h" +module std_private_tuple_find_index [system] { header "__tuple/find_index.h" } +module std_private_tuple_ignore [system] { header "__tuple/ignore.h" } +module std_private_tuple_make_tuple_types [system] { header "__tuple/make_tuple_types.h" } +module std_private_tuple_tuple_like_no_subrange [system] { + header "__tuple/tuple_like_no_subrange.h" +} +module std_private_tuple_sfinae_helpers [system] { header "__tuple/sfinae_helpers.h" } +module std_private_tuple_tuple_element [system] { header "__tuple/tuple_element.h" } +module std_private_tuple_tuple_fwd [system] { header "__fwd/tuple.h" } +module std_private_get_fwd [system] { + header "__fwd/get.h" + export std_private_array_array_fwd + export std_private_complex_complex_fwd + export std_private_ranges_subrange_fwd + export std_private_tuple_tuple_fwd + export std_private_utility_pair_fwd + export std_private_variant_fwd +} +module std_private_tuple_tuple_indices [system] { header "__tuple/tuple_indices.h" } +module std_private_tuple_tuple_like [system] { + header "__tuple/tuple_like.h" export * } +module std_private_tuple_tuple_like_ext [system] { header "__tuple/tuple_like_ext.h" } +module std_private_tuple_tuple_size [system] { + header "__tuple/tuple_size.h" + export std_private_type_traits_integral_constant +} +module std_private_tuple_tuple_types [system] { header "__tuple/tuple_types.h" } + +module 
std_private_type_traits_add_const [system] { header "__type_traits/add_const.h" } +module std_private_type_traits_add_cv [system] { header "__type_traits/add_cv.h" } +module std_private_type_traits_add_lvalue_reference [system] { + header "__type_traits/add_lvalue_reference.h" + export std_private_type_traits_is_referenceable +} +module std_private_type_traits_add_pointer [system] { header "__type_traits/add_pointer.h" } +module std_private_type_traits_add_rvalue_reference [system] { header "__type_traits/add_rvalue_reference.h" } +module std_private_type_traits_add_volatile [system] { header "__type_traits/add_volatile.h" } +module std_private_type_traits_aligned_storage [system] { header "__type_traits/aligned_storage.h" } +module std_private_type_traits_aligned_union [system] { header "__type_traits/aligned_union.h" } +module std_private_type_traits_alignment_of [system] { header "__type_traits/alignment_of.h" } +module std_private_type_traits_can_extract_key [system] { header "__type_traits/can_extract_key.h" } +module std_private_type_traits_common_reference [system] { + header "__type_traits/common_reference.h" + export std_private_type_traits_remove_cvref +} +module std_private_type_traits_common_type [system] { + header "__type_traits/common_type.h" + export std_private_type_traits_type_identity + export std_private_utility_declval + export std_private_utility_empty +} +module std_private_type_traits_conditional [system] { header "__type_traits/conditional.h" } +module std_private_type_traits_conjunction [system] { header "__type_traits/conjunction.h" } +module std_private_type_traits_copy_cv [system] { header "__type_traits/copy_cv.h" } +module std_private_type_traits_copy_cvref [system] { header "__type_traits/copy_cvref.h" } +module std_private_type_traits_datasizeof [system] { header "__type_traits/datasizeof.h" } +module std_private_type_traits_decay [system] { + header "__type_traits/decay.h" + export std_private_type_traits_add_pointer +} +module 
std_private_type_traits_dependent_type [system] { header "__type_traits/dependent_type.h" } +module std_private_type_traits_desugars_to [system] { header "__type_traits/desugars_to.h" } +module std_private_type_traits_disjunction [system] { header "__type_traits/disjunction.h" } +module std_private_type_traits_enable_if [system] { header "__type_traits/enable_if.h" } +module std_private_type_traits_extent [system] { header "__type_traits/extent.h" } +module std_private_type_traits_has_unique_object_representation [system] { header "__type_traits/has_unique_object_representation.h" } +module std_private_type_traits_has_virtual_destructor [system] { header "__type_traits/has_virtual_destructor.h" } +module std_private_type_traits_integral_constant [system] { header "__type_traits/integral_constant.h" } +module std_private_type_traits_invoke [system] { + header "__type_traits/invoke.h" + export std_private_type_traits_conditional + export std_private_type_traits_decay + export std_private_type_traits_decay + export std_private_type_traits_enable_if + export std_private_type_traits_is_base_of + export std_private_type_traits_is_core_convertible + export std_private_type_traits_is_reference_wrapper + export std_private_type_traits_is_same + export std_private_type_traits_is_void + export std_private_type_traits_nat + export std_private_type_traits_remove_cv +} +module std_private_type_traits_is_abstract [system] { header "__type_traits/is_abstract.h" } +module std_private_type_traits_is_aggregate [system] { header "__type_traits/is_aggregate.h" } +module std_private_type_traits_is_allocator [system] { header "__type_traits/is_allocator.h" } +module std_private_type_traits_is_always_bitcastable [system] { header "__type_traits/is_always_bitcastable.h" } +module std_private_type_traits_is_arithmetic [system] { + header "__type_traits/is_arithmetic.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_array [system] { + header 
"__type_traits/is_array.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_assignable [system] { header "__type_traits/is_assignable.h" } +module std_private_type_traits_is_base_of [system] { header "__type_traits/is_base_of.h" } +module std_private_type_traits_is_bounded_array [system] { header "__type_traits/is_bounded_array.h" } +module std_private_type_traits_is_callable [system] { + header "__type_traits/is_callable.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_char_like_type [system] { header "__type_traits/is_char_like_type.h" } +module std_private_type_traits_is_class [system] { header "__type_traits/is_class.h" } +module std_private_type_traits_is_compound [system] { header "__type_traits/is_compound.h" } +module std_private_type_traits_is_const [system] { header "__type_traits/is_const.h" } +module std_private_type_traits_is_constant_evaluated [system] { header "__type_traits/is_constant_evaluated.h" } +module std_private_type_traits_is_constructible [system] { header "__type_traits/is_constructible.h" } +module std_private_type_traits_is_convertible [system] { + header "__type_traits/is_convertible.h" + export std_private_type_traits_is_array +} +module std_private_type_traits_is_copy_assignable [system] { header "__type_traits/is_copy_assignable.h" } +module std_private_type_traits_is_copy_constructible [system] { header "__type_traits/is_copy_constructible.h" } +module std_private_type_traits_is_core_convertible [system] { + header "__type_traits/is_core_convertible.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_destructible [system] { header "__type_traits/is_destructible.h" } +module std_private_type_traits_is_empty [system] { header "__type_traits/is_empty.h" } +module std_private_type_traits_is_enum [system] { + header "__type_traits/is_enum.h" + export std_private_type_traits_integral_constant +} +module 
std_private_type_traits_is_equality_comparable [system] { + header "__type_traits/is_equality_comparable.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_execution_policy [system] { + header "__type_traits/is_execution_policy.h" + export std_private_type_traits_remove_cvref +} +module std_private_type_traits_is_final [system] { header "__type_traits/is_final.h" } +module std_private_type_traits_is_floating_point [system] { header "__type_traits/is_floating_point.h" } +module std_private_type_traits_is_function [system] { header "__type_traits/is_function.h" } +module std_private_type_traits_is_fundamental [system] { header "__type_traits/is_fundamental.h" } +module std_private_type_traits_is_implicitly_default_constructible [system] { + header "__type_traits/is_implicitly_default_constructible.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_integral [system] { + header "__type_traits/is_integral.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } +module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } +module std_private_type_traits_is_nothrow_assignable [system] { header "__type_traits/is_nothrow_assignable.h" } +module std_private_type_traits_is_nothrow_constructible [system] { + header "__type_traits/is_nothrow_constructible.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_nothrow_convertible [system] { header "__type_traits/is_nothrow_convertible.h" } +module std_private_type_traits_is_nothrow_destructible [system] { + header "__type_traits/is_nothrow_destructible.h" + export std_private_type_traits_is_destructible +} +module std_private_type_traits_is_null_pointer [system] { + header "__type_traits/is_null_pointer.h" + export std_cstddef +} +module 
std_private_type_traits_is_object [system] { + header "__type_traits/is_object.h" + export std_private_type_traits_is_scalar +} +module std_private_type_traits_is_pod [system] { header "__type_traits/is_pod.h" } +module std_private_type_traits_is_pointer [system] { header "__type_traits/is_pointer.h" } +module std_private_type_traits_is_polymorphic [system] { header "__type_traits/is_polymorphic.h" } +module std_private_type_traits_is_primary_template [system] { + header "__type_traits/is_primary_template.h" + export std_private_type_traits_enable_if +} +module std_private_type_traits_is_reference [system] { + header "__type_traits/is_reference.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_reference_wrapper [system] { header "__type_traits/is_reference_wrapper.h" } +module std_private_type_traits_is_referenceable [system] { header "__type_traits/is_referenceable.h" } +module std_private_type_traits_is_same [system] { + header "__type_traits/is_same.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_scalar [system] { + header "__type_traits/is_scalar.h" + export std_private_type_traits_is_null_pointer +} +module std_private_type_traits_is_signed [system] { header "__type_traits/is_signed.h" } +module std_private_type_traits_is_signed_integer [system] { header "__type_traits/is_signed_integer.h" } +module std_private_type_traits_is_specialization [system] { header "__type_traits/is_specialization.h" } +module std_private_type_traits_is_standard_layout [system] { header "__type_traits/is_standard_layout.h" } +module std_private_type_traits_is_swappable [system] { + header "__type_traits/is_swappable.h" + export std_private_type_traits_is_move_constructible +} +module std_private_type_traits_is_trivial [system] { header "__type_traits/is_trivial.h" } +module std_private_type_traits_is_trivially_assignable [system] { header "__type_traits/is_trivially_assignable.h" } +module 
std_private_type_traits_is_trivially_constructible [system] { header "__type_traits/is_trivially_constructible.h" } +module std_private_type_traits_is_trivially_copyable [system] { + header "__type_traits/is_trivially_copyable.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_trivially_destructible [system] { header "__type_traits/is_trivially_destructible.h" } +module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" } +module std_private_type_traits_is_trivially_relocatable [system] { header "__type_traits/is_trivially_relocatable.h" } +module std_private_type_traits_is_unbounded_array [system] { header "__type_traits/is_unbounded_array.h" } +module std_private_type_traits_is_union [system] { header "__type_traits/is_union.h" } +module std_private_type_traits_is_unsigned [system] { header "__type_traits/is_unsigned.h" } +module std_private_type_traits_is_unsigned_integer [system] { header "__type_traits/is_unsigned_integer.h" } +module std_private_type_traits_is_valid_expansion [system] { header "__type_traits/is_valid_expansion.h" } +module std_private_type_traits_is_void [system] { + header "__type_traits/is_void.h" + export std_private_type_traits_integral_constant +} +module std_private_type_traits_is_volatile [system] { header "__type_traits/is_volatile.h" } +module std_private_type_traits_lazy [system] { header "__type_traits/lazy.h" } +module std_private_type_traits_make_32_64_or_128_bit [system] { header "__type_traits/make_32_64_or_128_bit.h" } +module std_private_type_traits_make_const_lvalue_ref [system] { header "__type_traits/make_const_lvalue_ref.h" } +module std_private_type_traits_make_signed [system] { header "__type_traits/make_signed.h" } +module std_private_type_traits_make_unsigned [system] { + header "__type_traits/make_unsigned.h" + export std_private_type_traits_is_unsigned +} +module 
std_private_type_traits_maybe_const [system] { header "__type_traits/maybe_const.h" } +module std_private_type_traits_nat [system] { header "__type_traits/nat.h" } +module std_private_type_traits_negation [system] { header "__type_traits/negation.h" } +module std_private_type_traits_promote [system] { header "__type_traits/promote.h" } +module std_private_type_traits_rank [system] { header "__type_traits/rank.h" } +module std_private_type_traits_remove_all_extents [system] { header "__type_traits/remove_all_extents.h" } +module std_private_type_traits_remove_const [system] { header "__type_traits/remove_const.h" } +module std_private_type_traits_remove_const_ref [system] { header "__type_traits/remove_const_ref.h" } +module std_private_type_traits_remove_cv [system] { + header "__type_traits/remove_cv.h" + export std_private_type_traits_remove_const + export std_private_type_traits_remove_volatile +} +module std_private_type_traits_remove_cvref [system] { header "__type_traits/remove_cvref.h" } +module std_private_type_traits_remove_extent [system] { header "__type_traits/remove_extent.h" } +module std_private_type_traits_remove_pointer [system] { header "__type_traits/remove_pointer.h" } +module std_private_type_traits_remove_reference [system] { header "__type_traits/remove_reference.h" } +module std_private_type_traits_remove_volatile [system] { header "__type_traits/remove_volatile.h" } +module std_private_type_traits_result_of [system] { header "__type_traits/result_of.h" } +module std_private_type_traits_strip_signature [system] { header "__type_traits/strip_signature.h" } +module std_private_type_traits_type_identity [system] { header "__type_traits/type_identity.h" } +module std_private_type_traits_type_list [system] { header "__type_traits/type_list.h" } +module std_private_type_traits_underlying_type [system] { + header "__type_traits/underlying_type.h" + export std_private_type_traits_is_enum +} +module std_private_type_traits_unwrap_ref [system] { 
header "__type_traits/unwrap_ref.h" } +module std_private_type_traits_void_t [system] { header "__type_traits/void_t.h" } + +module std_private_utility_as_const [system] { header "__utility/as_const.h" } +module std_private_utility_as_lvalue [system] { header "__utility/as_lvalue.h" } +module std_private_utility_auto_cast [system] { + header "__utility/auto_cast.h" + export std_private_type_traits_decay +} +module std_private_utility_cmp [system] { + header "__utility/cmp.h" + export std_private_type_traits_make_unsigned +} +module std_private_utility_convert_to_integral [system] { header "__utility/convert_to_integral.h" } +module std_private_utility_declval [system] { header "__utility/declval.h" } +module std_private_utility_empty [system] { header "__utility/empty.h" } +module std_private_utility_exception_guard [system] { header "__utility/exception_guard.h" } +module std_private_utility_exchange [system] { header "__utility/exchange.h" } +module std_private_utility_forward [system] { header "__utility/forward.h" } +module std_private_utility_forward_like [system] { header "__utility/forward_like.h" } +module std_private_utility_in_place [system] { + header "__utility/in_place.h" + export std_private_type_traits_integral_constant +} +module std_private_utility_integer_sequence [system] { header "__utility/integer_sequence.h" } +module std_private_utility_is_pointer_in_range [system] { header "__utility/is_pointer_in_range.h" } +module std_private_utility_is_valid_range [system] { header "__utility/is_valid_range.h" } +module std_private_utility_move [system] { + header "__utility/move.h" + export std_private_type_traits_is_copy_constructible + export std_private_type_traits_is_nothrow_move_constructible + export std_private_type_traits_remove_reference +} +module std_private_utility_no_destroy [system] { header "__utility/no_destroy.h" } +module std_private_utility_pair [system] { + header "__utility/pair.h" + export std_private_ranges_subrange_fwd + export 
std_private_tuple_pair_like + export std_private_type_traits_is_assignable + export std_private_type_traits_is_constructible + export std_private_type_traits_is_convertible + export std_private_type_traits_is_copy_assignable + export std_private_type_traits_is_move_assignable + export std_private_type_traits_is_nothrow_copy_constructible + export std_private_type_traits_is_nothrow_default_constructible + export std_private_type_traits_is_nothrow_move_assignable + export std_private_utility_pair_fwd +} +module std_private_utility_pair_fwd [system] { header "__fwd/pair.h" } +module std_private_utility_piecewise_construct [system] { header "__utility/piecewise_construct.h" } +module std_private_utility_priority_tag [system] { header "__utility/priority_tag.h" } +module std_private_utility_private_constructor_tag [system] { header "__utility/private_constructor_tag.h" } +module std_private_utility_rel_ops [system] { header "__utility/rel_ops.h" } +module std_private_utility_small_buffer [system] { header "__utility/small_buffer.h" } +module std_private_utility_swap [system] { + header "__utility/swap.h" + export std_private_type_traits_is_swappable +} +module std_private_utility_to_underlying [system] { header "__utility/to_underlying.h" } +module std_private_utility_unreachable [system] { header "__utility/unreachable.h" } + +module std_private_variant_monostate [system] { header "__variant/monostate.h" } +module std_private_variant_fwd [system] { header "__fwd/variant.h" } + +module std_private_vector_fwd [system] { header "__fwd/vector.h" } diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index bc028f2a0809a..f0421b2e73813 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -37,17 +37,13 @@ // TODO: Investigate this failure // UNSUPPORTED: LIBCXX-FREEBSD-FIXME -// TODO: Investigate why this doesn't work on Picolibc once the locale base API 
is refactored -// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME - {lit_header_restrictions.get(header, '')} #include <{header}> """) -print( - f"""\ -//--- import_std.compile.pass.mm +print(f"""\ +//--- __std_clang_module.compile.pass.mm // RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only // REQUIRES: clang-modules-build @@ -65,10 +61,6 @@ // TODO: Investigate this failure // UNSUPPORTED: LIBCXX-FREEBSD-FIXME -// TODO: Investigate why this doesn't work on Picolibc once the locale base API is refactored -// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME - @import std; -""" -) +""") diff --git a/libcxx/test/libcxx/diagnostics/iterator.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/iterator.nodiscard.verify.cpp index 8f9bc3e411f90..c7cd2f5ce5767 100644 --- a/libcxx/test/libcxx/diagnostics/iterator.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/iterator.nodiscard.verify.cpp @@ -15,12 +15,24 @@ #include #include +#include "test_macros.h" + void test() { std::vector container; int c_array[] = {1, 2, 3}; std::initializer_list initializer_list; - std::empty(container); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} - std::empty(c_array); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} - std::empty(initializer_list); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::empty(container); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::empty(c_array); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::empty(initializer_list); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::prev(c_array); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::next(c_array); // expected-warning 
{{ignoring return value of function declared with 'nodiscard' attribute}} +#if TEST_STD_VER >= 20 + std::ranges::prev(c_array); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::prev(container.end(), 2); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::next(container.end(), 2, container.begin()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::next(c_array); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::next(container.begin(), 2); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::next(container.end(), 1, container.end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif } diff --git a/libcxx/test/libcxx/iterators/assert.next.pass.cpp b/libcxx/test/libcxx/iterators/assert.next.pass.cpp index 242a0c6f0f7ce..f6fd24284bbfd 100644 --- a/libcxx/test/libcxx/iterators/assert.next.pass.cpp +++ b/libcxx/test/libcxx/iterators/assert.next.pass.cpp @@ -23,8 +23,8 @@ int main(int, char**) { int a[] = {1, 2, 3}; forward_iterator it(a+1); - std::next(it, 1); // should work fine - std::next(it, 0); // should work fine + (void)std::next(it, 1); // should work fine + (void)std::next(it, 0); // should work fine TEST_LIBCPP_ASSERT_FAILURE(std::next(it, -1), "Attempt to next(it, n) with negative n on a non-bidirectional iterator"); return 0; diff --git a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp index a5a04f1bbeb6b..08cbe5e03dd5f 100644 --- a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp +++ b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp @@ -24,13 +24,13 @@ int main(int, char**) { int a[] = {1, 2, 3}; bidirectional_iterator bidi(a+1); - std::prev(bidi, -1); // should 
work fine - std::prev(bidi, 0); // should work fine - std::prev(bidi, 1); // should work fine + (void)std::prev(bidi, -1); // should work fine + (void)std::prev(bidi, 0); // should work fine + (void)std::prev(bidi, 1); // should work fine forward_iterator it(a+1); - std::prev(it, -1); // should work fine - std::prev(it, 0); // should work fine + (void)std::prev(it, -1); // should work fine + (void)std::prev(it, 0); // should work fine TEST_LIBCPP_ASSERT_FAILURE(std::prev(it, 1), "Attempt to prev(it, n) with a positive n on a non-bidirectional iterator"); return 0; diff --git a/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp b/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp new file mode 100644 index 0000000000000..7e27adfab1971 --- /dev/null +++ b/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +#include + +#include "test_macros.h" + +#ifndef _LIBCPP_UTILITY +# error " must include " +#endif + +int main(int, char**) +{ + + return 0; +} diff --git a/libcxx/utils/CMakeLists.txt b/libcxx/utils/CMakeLists.txt index 027e485fc15ef..1116531fa0653 100644 --- a/libcxx/utils/CMakeLists.txt +++ b/libcxx/utils/CMakeLists.txt @@ -2,6 +2,10 @@ add_custom_target(libcxx-generate-feature-test-macros COMMAND "${Python3_EXECUTABLE}" "${LIBCXX_SOURCE_DIR}/utils/generate_feature_test_macro_components.py" COMMENT "Generate the header and tests for feature test macros.") +add_custom_target(libcxx-generate-std-clang-module-header + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/generate_std_clang_module_header.py" + COMMENT "Generate the <__std_clang_module> header") + add_custom_target(libcxx-generate-std-cppm-in-file COMMAND "${Python3_EXECUTABLE}" @@ -53,6 +57,7 @@ add_custom_target(libcxx-indic-conjunct-break-table add_custom_target(libcxx-generate-files DEPENDS libcxx-generate-feature-test-macros + libcxx-generate-std-clang-module-header libcxx-generate-std-cppm-in-file libcxx-generate-std-compat-cppm-in-file libcxx-generate-extended-grapheme-cluster-tables diff --git a/libcxx/utils/generate_std_clang_module_header.py b/libcxx/utils/generate_std_clang_module_header.py new file mode 100644 index 0000000000000..33c9acf395379 --- /dev/null +++ b/libcxx/utils/generate_std_clang_module_header.py @@ -0,0 +1,63 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +import os.path + +import libcxx.header_information + +header_restrictions = libcxx.header_information.header_restrictions + +libcxx_include_directory = os.path.join( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "include" +) +with open( + os.path.join(libcxx_include_directory, "__std_clang_module"), "w" +) as std_clang_module_header: + std_clang_module_header.write( + """\ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// WARNING, this entire header is generated by +// utils/generate_std_clang_module_header.py +// DO NOT MODIFY! + +// This header should not be directly included, it's exclusively to import all +// of the libc++ public clang modules for the `std` clang module to export. In +// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` +// in Swift to expose all of the libc++ interfaces. This is generally not +// recommended, however there are some clients that need to import all of libc++ +// without knowing what "all" is. +#if !__building_module(std) +# error "Do not include this header directly, include individual headers instead" +#endif + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +""" + ) + # Include the angle brackets in sorting so that sorts before + # like check-format wants. 
+ for include, header in sorted([(f"<{header}>", header) for header in libcxx.header_information.public_headers]): + header_restriction = header_restrictions.get(header) + if header_restriction: + std_clang_module_header.write(f"#if {header_restriction}\n") + std_clang_module_header.write(f"# include {include}\n") + std_clang_module_header.write(f"#endif\n") + else: + std_clang_module_header.write(f"#include {include}\n") diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index 17786a3a486fc..eb32b4679aff0 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -30128,11 +30128,15 @@ const char* cases[][2] = // C++20 concepts, see https://github.com/itanium-cxx-abi/cxx-abi/issues/24. {"_Z2f0IiE1SIX1CIT_EEEv", "S> f0()"}, {"_ZN5test21AIiEF1fEzQ4TrueIT_E", "test2::A::friend f(...) requires True"}, - {"_ZN5test2F1gIvEEvzQaa4TrueIT_E4TrueITL0__E", "void test2::friend g(...) requires True && True"}, + {"_ZN5test21AIbEF1fEzQ4TrueIT_E", "test2::A::friend f(...) requires True"}, + {"_ZN5test21AIiEF1gIvEEvzQaa4TrueIT_E4TrueITL0__E", "void test2::A::friend g(...) requires True && True"}, + {"_ZN5test21AIbEF1gIvEEvzQaa4TrueIT_E4TrueITL0__E", "void test2::A::friend g(...) requires True && True"}, {"_ZN5test21hIvEEvzQ4TrueITL0__E", "void test2::h(...) 
requires True"}, - {"_ZN5test2F1iIvQaa4TrueIT_E4TrueITL0__EEEvz", "void test2::friend i(...)"}, + {"_ZN5test21AIiEF1iIvQaa4TrueIT_E4TrueITL0__EEEvz", "void test2::A::friend i(...)"}, + {"_ZN5test21AIbEF1iIvQaa4TrueIT_E4TrueITL0__EEEvz", "void test2::A::friend i(...)"}, {"_ZN5test21jIvQ4TrueITL0__EEEvz", "void test2::j(...)"}, - {"_ZN5test2F1kITk4TruevQ4TrueIT_EEEvz", "void test2::friend k(...)"}, + {"_ZN5test21AIiEF1kITk4TruevQ4TrueIT_EEEvz", "void test2::A::friend k(...)"}, + {"_ZN5test21AIbEF1kITk4TruevQ4TrueIT_EEEvz", "void test2::A::friend k(...)"}, {"_ZN5test21lITk4TruevEEvz", "void test2::l(...)"}, {"_ZN5test31dITnDaLi0EEEvv", "void test3::d<0>()"}, {"_ZN5test31eITnDcLi0EEEvv", "void test3::e<0>()"}, diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index fb70e66b6d756..cfea605e2da60 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -31,7 +31,7 @@ uint64_t elf::getAArch64Page(uint64_t expr) { namespace { class AArch64 : public TargetInfo { public: - AArch64(); + AArch64(Ctx &); RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; RelType getDynRel(RelType type) const override; @@ -76,7 +76,7 @@ static uint64_t getBits(uint64_t val, int start, int end) { return (val >> start) & mask; } -AArch64::AArch64() { +AArch64::AArch64(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_AARCH64_COPY; relativeRel = R_AARCH64_RELATIVE; iRelativeRel = R_AARCH64_IRELATIVE; @@ -960,7 +960,7 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { namespace { class AArch64BtiPac final : public AArch64 { public: - AArch64BtiPac(); + AArch64BtiPac(Ctx &); void writePltHeader(uint8_t *buf) const override; void writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const override; @@ -971,7 +971,7 @@ class AArch64BtiPac final : public AArch64 { }; } // namespace -AArch64BtiPac::AArch64BtiPac() { +AArch64BtiPac::AArch64BtiPac(Ctx &ctx) : AArch64(ctx) { btiHeader = (ctx.arg.andFeatures & 
GNU_PROPERTY_AARCH64_FEATURE_1_BTI); // A BTI (Branch Target Indicator) Plt Entry is only required if the // address of the PLT entry can be taken by the program, which permits an @@ -1073,18 +1073,6 @@ void AArch64BtiPac::writePlt(uint8_t *buf, const Symbol &sym, memcpy(buf + sizeof(addrInst) + sizeof(stdBr), nopData, sizeof(nopData)); } -static TargetInfo *getTargetInfo() { - if ((ctx.arg.andFeatures & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) || - ctx.arg.zPacPlt) { - static AArch64BtiPac t; - return &t; - } - static AArch64 t; - return &t; -} - -TargetInfo *elf::getAArch64TargetInfo() { return getTargetInfo(); } - template static void addTaggedSymbolReferences(InputSectionBase &sec, @@ -1187,3 +1175,13 @@ void lld::elf::createTaggedSymbols(const SmallVector &files) { symbol->setIsTagged(true); } } + +TargetInfo *elf::getAArch64TargetInfo(Ctx &ctx) { + if ((ctx.arg.andFeatures & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) || + ctx.arg.zPacPlt) { + static AArch64BtiPac t(ctx); + return &t; + } + static AArch64 t(ctx); + return &t; +} diff --git a/lld/ELF/Arch/AMDGPU.cpp b/lld/ELF/Arch/AMDGPU.cpp index d9440acec9dda..29b21f1b95398 100644 --- a/lld/ELF/Arch/AMDGPU.cpp +++ b/lld/ELF/Arch/AMDGPU.cpp @@ -28,7 +28,7 @@ class AMDGPU final : public TargetInfo { uint32_t calcEFlagsV6() const; public: - AMDGPU(); + AMDGPU(Ctx &); uint32_t calcEFlags() const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; @@ -39,7 +39,7 @@ class AMDGPU final : public TargetInfo { }; } // namespace -AMDGPU::AMDGPU() { +AMDGPU::AMDGPU(Ctx &ctx) : TargetInfo(ctx) { relativeRel = R_AMDGPU_RELATIVE64; gotRel = R_AMDGPU_ABS64; symbolicRel = R_AMDGPU_ABS64; @@ -219,7 +219,7 @@ int64_t AMDGPU::getImplicitAddend(const uint8_t *buf, RelType type) const { } } -TargetInfo *elf::getAMDGPUTargetInfo() { - static AMDGPU target; +TargetInfo *elf::getAMDGPUTargetInfo(Ctx &ctx) { + static AMDGPU target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp 
index 1bbd2e1f21d7c..77bdd656dd8cd 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -28,7 +28,7 @@ using namespace llvm::object; namespace { class ARM final : public TargetInfo { public: - ARM(); + ARM(Ctx &); uint32_t calcEFlags() const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -54,7 +54,7 @@ enum class CodeState { Data = 0, Thumb = 2, Arm = 4 }; static DenseMap> sectionMap{}; -ARM::ARM() { +ARM::ARM(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_ARM_COPY; relativeRel = R_ARM_RELATIVE; iRelativeRel = R_ARM_IRELATIVE; @@ -491,9 +491,10 @@ bool ARM::inBranchRange(RelType type, uint64_t src, uint64_t dst) const { // Helper to produce message text when LLD detects that a CALL relocation to // a non STT_FUNC symbol that may result in incorrect interworking between ARM // or Thumb. -static void stateChangeWarning(uint8_t *loc, RelType relt, const Symbol &s) { +static void stateChangeWarning(Ctx &ctx, uint8_t *loc, RelType relt, + const Symbol &s) { assert(!s.isFunc()); - const ErrorPlace place = getErrorPlace(loc); + const ErrorPlace place = getErrorPlace(ctx, loc); std::string hint; if (!place.srcLoc.empty()) hint = "; " + place.srcLoc; @@ -630,7 +631,7 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // lld 10.0 and before always used bit0Thumb when deciding to write a BLX // even when type not STT_FUNC. if (!rel.sym->isFunc() && isBlx != bit0Thumb) - stateChangeWarning(loc, rel.type, *rel.sym); + stateChangeWarning(ctx, loc, rel.type, *rel.sym); if (rel.sym->isFunc() ? bit0Thumb : isBlx) { // The BLX encoding is 0xfa:H:imm24 where Val = imm24:H:'1' checkInt(loc, val, 26, rel); @@ -687,7 +688,7 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // lld 10.0 and before always used bit0Thumb when deciding to write a BLX // even when type not STT_FUNC. 
if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb) - stateChangeWarning(loc, rel.type, *rel.sym); + stateChangeWarning(ctx, loc, rel.type, *rel.sym); if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) { // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As // the BLX instruction may only be two byte aligned. This must be done @@ -1260,7 +1261,7 @@ static std::string checkCmseSymAttributes(Symbol *acleSeSym, Symbol *sym) { // name with __acle_se_. // Both these symbols are Thumb function symbols with external linkage. // may be redefined in .gnu.sgstubs. -void elf::processArmCmseSymbols() { +void elf::processArmCmseSymbols(Ctx &ctx) { if (!ctx.arg.cmseImplib) return; // Only symbols with external linkage end up in ctx.symtab, so no need to do @@ -1532,8 +1533,8 @@ template void elf::writeARMCmseImportLib() { "': " + toString(std::move(e))); } -TargetInfo *elf::getARMTargetInfo() { - static ARM target; +TargetInfo *elf::getARMTargetInfo(Ctx &ctx) { + static ARM target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/AVR.cpp b/lld/ELF/Arch/AVR.cpp index 2275f86942871..cc2d9fa3daf79 100644 --- a/lld/ELF/Arch/AVR.cpp +++ b/lld/ELF/Arch/AVR.cpp @@ -43,7 +43,7 @@ using namespace lld::elf; namespace { class AVR final : public TargetInfo { public: - AVR() { needsThunks = true; } + AVR(Ctx &ctx) : TargetInfo(ctx) { needsThunks = true; } uint32_t calcEFlags() const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -267,8 +267,8 @@ void AVR::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } } -TargetInfo *elf::getAVRTargetInfo() { - static AVR target; +TargetInfo *elf::getAVRTargetInfo(Ctx &ctx) { + static AVR target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index a492d0a630b46..d689fc2a15210 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -24,7 +24,7 @@ using namespace lld::elf; namespace { class 
Hexagon final : public TargetInfo { public: - Hexagon(); + Hexagon(Ctx &); uint32_t calcEFlags() const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -38,7 +38,7 @@ class Hexagon final : public TargetInfo { }; } // namespace -Hexagon::Hexagon() { +Hexagon::Hexagon(Ctx &ctx) : TargetInfo(ctx) { pltRel = R_HEX_JMP_SLOT; relativeRel = R_HEX_RELATIVE; gotRel = R_HEX_GLOB_DAT; @@ -404,7 +404,7 @@ int64_t Hexagon::getImplicitAddend(const uint8_t *buf, RelType type) const { } } -TargetInfo *elf::getHexagonTargetInfo() { - static Hexagon target; +TargetInfo *elf::getHexagonTargetInfo(Ctx &ctx) { + static Hexagon target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 662dcb2ef3c62..3e86488063f0e 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -24,7 +24,7 @@ using namespace lld::elf; namespace { class LoongArch final : public TargetInfo { public: - LoongArch(); + LoongArch(Ctx &); uint32_t calcEFlags() const override; int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override; void writeGotPlt(uint8_t *buf, const Symbol &s) const override; @@ -170,7 +170,7 @@ static void handleUleb128(uint8_t *loc, uint64_t val) { encodeULEB128((orig + val) & mask, loc, count); } -LoongArch::LoongArch() { +LoongArch::LoongArch(Ctx &ctx) : TargetInfo(ctx) { // The LoongArch ISA itself does not have a limit on page sizes. According to // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is // 6 bits wide, meaning the maximum page size is 2^63 which is equivalent to @@ -598,7 +598,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, // immediate fields, the relocation range is [-128G - 0x20000, +128G - // 0x20000) (of course must be 4-byte aligned). 
if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38)) - reportRangeError(loc, rel, Twine(val), llvm::minIntN(38) - 0x20000, + reportRangeError(ctx, loc, rel, Twine(val), llvm::minIntN(38) - 0x20000, llvm::maxIntN(38) - 0x20000); checkAlignment(loc, val, 4, rel); // Since jirl performs sign extension on the offset immediate, adds (1<<17) @@ -893,7 +893,7 @@ void LoongArch::finalizeRelax(int passes) const { } } -TargetInfo *elf::getLoongArchTargetInfo() { - static LoongArch target; +TargetInfo *elf::getLoongArchTargetInfo(Ctx &ctx) { + static LoongArch target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/MSP430.cpp b/lld/ELF/Arch/MSP430.cpp index 378b2878d442b..7563f7cfaa02c 100644 --- a/lld/ELF/Arch/MSP430.cpp +++ b/lld/ELF/Arch/MSP430.cpp @@ -31,7 +31,7 @@ using namespace lld::elf; namespace { class MSP430 final : public TargetInfo { public: - MSP430(); + MSP430(Ctx &); RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; void relocate(uint8_t *loc, const Relocation &rel, @@ -39,7 +39,7 @@ class MSP430 final : public TargetInfo { }; } // namespace -MSP430::MSP430() { +MSP430::MSP430(Ctx &ctx) : TargetInfo(ctx) { // mov.b #0, r3 trapInstr = {0x43, 0x43, 0x43, 0x43}; } @@ -88,7 +88,7 @@ void MSP430::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } } -TargetInfo *elf::getMSP430TargetInfo() { - static MSP430 target; +TargetInfo *elf::getMSP430TargetInfo(Ctx &ctx) { + static MSP430 target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index 8822be5ea8d5b..0e65df347031e 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -23,7 +23,7 @@ using namespace lld::elf; namespace { template class MIPS final : public TargetInfo { public: - MIPS(); + MIPS(Ctx &); uint32_t calcEFlags() const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -42,7 +42,7 @@ template class MIPS final : public TargetInfo { }; } // namespace 
-template MIPS::MIPS() { +template MIPS::MIPS(Ctx &ctx) : TargetInfo(ctx) { gotPltHeaderEntriesNum = 2; defaultMaxPageSize = 65536; pltEntrySize = 16; @@ -778,16 +778,29 @@ template bool elf::isMipsPIC(const Defined *sym) { return cast>(file)->getObj().getHeader().e_flags & EF_MIPS_PIC; } -template TargetInfo *elf::getMipsTargetInfo() { - static MIPS target; - return ⌖ +TargetInfo *elf::getMipsTargetInfo(Ctx &ctx) { + switch (ctx.arg.ekind) { + case ELF32LEKind: { + static MIPS t(ctx); + return &t; + } + case ELF32BEKind: { + static MIPS t(ctx); + return &t; + } + case ELF64LEKind: { + static MIPS t(ctx); + return &t; + } + case ELF64BEKind: { + static MIPS t(ctx); + return &t; + } + default: + llvm_unreachable("unsupported target"); + } } -template TargetInfo *elf::getMipsTargetInfo(); -template TargetInfo *elf::getMipsTargetInfo(); -template TargetInfo *elf::getMipsTargetInfo(); -template TargetInfo *elf::getMipsTargetInfo(); - template bool elf::isMipsPIC(const Defined *); template bool elf::isMipsPIC(const Defined *); template bool elf::isMipsPIC(const Defined *); diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index c5f9de5a2f2a5..2d6355fac13e6 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -26,7 +26,7 @@ using namespace lld::elf; namespace { class PPC final : public TargetInfo { public: - PPC(); + PPC(Ctx &); RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; RelType getDynRel(RelType type) const override; @@ -79,7 +79,7 @@ void elf::writePPC32GlinkSection(uint8_t *buf, size_t numEntries) { if (!ctx.arg.isPic) { for (const Symbol *sym : cast(*ctx.in.plt).canonical_plts) { - writePPC32PltCallStub(buf, sym->getGotPltVA(), nullptr, 0); + writePPC32PltCallStub(ctx, buf, sym->getGotPltVA(), nullptr, 0); buf += 16; glink += 16; } @@ -152,7 +152,7 @@ void elf::writePPC32GlinkSection(uint8_t *buf, size_t numEntries) { write32(buf, 0x60000000); } -PPC::PPC() { +PPC::PPC(Ctx &ctx) : TargetInfo(ctx) { copyRel 
= R_PPC_COPY; gotRel = R_PPC_GLOB_DAT; pltRel = R_PPC_JMP_SLOT; @@ -181,7 +181,7 @@ void PPC::writeIplt(uint8_t *buf, const Symbol &sym, uint64_t /*pltEntryAddr*/) const { // In -pie or -shared mode, assume r30 points to .got2+0x8000, and use a // .got2.plt_pic32. thunk. - writePPC32PltCallStub(buf, sym.getGotPltVA(), sym.file, 0x8000); + writePPC32PltCallStub(ctx, buf, sym.getGotPltVA(), sym.file, 0x8000); } void PPC::writeGotHeader(uint8_t *buf) const { @@ -525,7 +525,7 @@ void PPC::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { } } -TargetInfo *elf::getPPCTargetInfo() { - static PPC target; +TargetInfo *elf::getPPCTargetInfo(Ctx &ctx) { + static PPC target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index fdf3d07b98bca..da2a5aeed43b0 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -168,7 +168,7 @@ enum class LegacyToPrefixMask : uint64_t { class PPC64 final : public TargetInfo { public: - PPC64(); + PPC64(Ctx &); int getTlsGdRelaxSkip(RelType type) const override; uint32_t calcEFlags() const override; RelExpr getRelExpr(RelType type, const Symbol &s, @@ -578,7 +578,7 @@ static uint64_t readPrefixedInstruction(const uint8_t *loc) { return ctx.arg.isLE ? 
(fullInstr << 32 | fullInstr >> 32) : fullInstr; } -PPC64::PPC64() { +PPC64::PPC64(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_PPC64_COPY; gotRel = R_PPC64_GLOB_DAT; pltRel = R_PPC64_JMP_SLOT; @@ -1750,7 +1750,7 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, return true; } -TargetInfo *elf::getPPC64TargetInfo() { - static PPC64 target; +TargetInfo *elf::getPPC64TargetInfo(Ctx &ctx) { + static PPC64 target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 4b02612bec870..f776ac8ede1bd 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -29,7 +29,7 @@ namespace { class RISCV final : public TargetInfo { public: - RISCV(); + RISCV(Ctx &); uint32_t calcEFlags() const override; int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override; void writeGotHeader(uint8_t *buf) const override; @@ -107,7 +107,7 @@ static uint32_t setLO12_S(uint32_t insn, uint32_t imm) { (extractBits(imm, 4, 0) << 7); } -RISCV::RISCV() { +RISCV::RISCV(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_RISCV_COPY; pltRel = R_RISCV_JUMP_SLOT; relativeRel = R_RISCV_RELATIVE; @@ -1305,7 +1305,7 @@ void RISCVAttributesSection::writeTo(uint8_t *buf) { } } -void elf::mergeRISCVAttributesSections() { +void elf::mergeRISCVAttributesSections(Ctx &) { // Find the first input SHT_RISCV_ATTRIBUTES; return if not found. 
size_t place = llvm::find_if(ctx.inputSections, @@ -1328,7 +1328,7 @@ void elf::mergeRISCVAttributesSections() { mergeAttributesSection(sections)); } -TargetInfo *elf::getRISCVTargetInfo() { - static RISCV target; +TargetInfo *elf::getRISCVTargetInfo(Ctx &ctx) { + static RISCV target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/SPARCV9.cpp b/lld/ELF/Arch/SPARCV9.cpp index f7f296c81f335..15c7c9c28b2ed 100644 --- a/lld/ELF/Arch/SPARCV9.cpp +++ b/lld/ELF/Arch/SPARCV9.cpp @@ -21,7 +21,7 @@ using namespace lld::elf; namespace { class SPARCV9 final : public TargetInfo { public: - SPARCV9(); + SPARCV9(Ctx &); RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; void writePlt(uint8_t *buf, const Symbol &sym, @@ -31,7 +31,7 @@ class SPARCV9 final : public TargetInfo { }; } // namespace -SPARCV9::SPARCV9() { +SPARCV9::SPARCV9(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_SPARC_COPY; gotRel = R_SPARC_GLOB_DAT; pltRel = R_SPARC_JMP_SLOT; @@ -193,7 +193,7 @@ void SPARCV9::writePlt(uint8_t *buf, const Symbol & /*sym*/, relocateNoSym(buf + 4, R_SPARC_WDISP19, -(off + 4 - pltEntrySize)); } -TargetInfo *elf::getSPARCV9TargetInfo() { - static SPARCV9 target; +TargetInfo *elf::getSPARCV9TargetInfo(Ctx &ctx) { + static SPARCV9 target(ctx); return ⌖ } diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp index 484ffd7601ddc..fc87103165fd4 100644 --- a/lld/ELF/Arch/SystemZ.cpp +++ b/lld/ELF/Arch/SystemZ.cpp @@ -23,7 +23,7 @@ using namespace lld::elf; namespace { class SystemZ : public TargetInfo { public: - SystemZ(); + SystemZ(Ctx &); int getTlsGdRelaxSkip(RelType type) const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -51,7 +51,7 @@ class SystemZ : public TargetInfo { }; } // namespace -SystemZ::SystemZ() { +SystemZ::SystemZ(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_390_COPY; gotRel = R_390_GLOB_DAT; pltRel = R_390_JMP_SLOT; @@ -453,7 +453,7 @@ bool SystemZ::relaxOnce(int pass) const { 
continue; if (rel.sym->auxIdx == 0) { rel.sym->allocateAux(); - addGotEntry(*rel.sym); + addGotEntry(ctx, *rel.sym); changed = true; } rel.expr = R_GOT_PC; @@ -601,7 +601,7 @@ void SystemZ::relocate(uint8_t *loc, const Relocation &rel, } } -TargetInfo *elf::getSystemZTargetInfo() { - static SystemZ t; +TargetInfo *elf::getSystemZTargetInfo(Ctx &ctx) { + static SystemZ t(ctx); return &t; } diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index e02038b1689c4..0a16ca24fcb31 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -22,7 +22,7 @@ using namespace lld::elf; namespace { class X86 : public TargetInfo { public: - X86(); + X86(Ctx &); int getTlsGdRelaxSkip(RelType type) const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -42,7 +42,7 @@ class X86 : public TargetInfo { }; } // namespace -X86::X86() { +X86::X86(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_386_COPY; gotRel = R_386_GLOB_DAT; pltRel = R_386_JUMP_SLOT; @@ -518,7 +518,7 @@ void X86::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { namespace { class IntelIBT : public X86 { public: - IntelIBT(); + IntelIBT(Ctx &ctx) : X86(ctx) { pltHeaderSize = 0; } void writeGotPlt(uint8_t *buf, const Symbol &s) const override; void writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const override; @@ -528,8 +528,6 @@ class IntelIBT : public X86 { }; } // namespace -IntelIBT::IntelIBT() { pltHeaderSize = 0; } - void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const { uint64_t va = ctx.in.ibtPlt->getVA() + IBTPltHeaderSize + s.getPltIdx() * pltEntrySize; @@ -580,7 +578,7 @@ void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const { namespace { class RetpolinePic : public X86 { public: - RetpolinePic(); + RetpolinePic(Ctx &); void writeGotPlt(uint8_t *buf, const Symbol &s) const override; void writePltHeader(uint8_t *buf) const override; void writePlt(uint8_t *buf, const Symbol &sym, @@ -589,7 +587,7 @@ class 
RetpolinePic : public X86 { class RetpolineNoPic : public X86 { public: - RetpolineNoPic(); + RetpolineNoPic(Ctx &); void writeGotPlt(uint8_t *buf, const Symbol &s) const override; void writePltHeader(uint8_t *buf) const override; void writePlt(uint8_t *buf, const Symbol &sym, @@ -597,7 +595,7 @@ class RetpolineNoPic : public X86 { }; } // namespace -RetpolinePic::RetpolinePic() { +RetpolinePic::RetpolinePic(Ctx &ctx) : X86(ctx) { pltHeaderSize = 48; pltEntrySize = 32; ipltEntrySize = 32; @@ -651,7 +649,7 @@ void RetpolinePic::writePlt(uint8_t *buf, const Symbol &sym, write32le(buf + 23, -off - 27); } -RetpolineNoPic::RetpolineNoPic() { +RetpolineNoPic::RetpolineNoPic(Ctx &ctx) : X86(ctx) { pltHeaderSize = 48; pltEntrySize = 32; ipltEntrySize = 32; @@ -710,21 +708,21 @@ void RetpolineNoPic::writePlt(uint8_t *buf, const Symbol &sym, write32le(buf + 22, -off - 26); } -TargetInfo *elf::getX86TargetInfo() { +TargetInfo *elf::getX86TargetInfo(Ctx &ctx) { if (ctx.arg.zRetpolineplt) { if (ctx.arg.isPic) { - static RetpolinePic t; + static RetpolinePic t(ctx); return &t; } - static RetpolineNoPic t; + static RetpolineNoPic t(ctx); return &t; } if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) { - static IntelIBT t; + static IntelIBT t(ctx); return &t; } - static X86 t; + static X86 t(ctx); return &t; } diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index 48f17718365e2..d58d0a2961d61 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -26,7 +26,7 @@ using namespace lld::elf; namespace { class X86_64 : public TargetInfo { public: - X86_64(); + X86_64(Ctx &); int getTlsGdRelaxSkip(RelType type) const override; RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const override; @@ -67,7 +67,7 @@ static const std::vector> nopInstructions = { {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}}; -X86_64::X86_64() { +X86_64::X86_64(Ctx &ctx) : TargetInfo(ctx) { copyRel 
= R_X86_64_COPY; gotRel = R_X86_64_GLOB_DAT; pltRel = R_X86_64_JUMP_SLOT; @@ -339,7 +339,7 @@ bool X86_64::relaxOnce(int pass) const { continue; if (rel.sym->auxIdx == 0) { rel.sym->allocateAux(); - addGotEntry(*rel.sym); + addGotEntry(ctx, *rel.sym); changed = true; } rel.expr = R_GOT_PC; @@ -388,6 +388,7 @@ RelExpr X86_64::getRelExpr(RelType type, const Symbol &s, case R_X86_64_GOTPCREL: case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: + case R_X86_64_REX2_GOTPCRELX: case R_X86_64_GOTTPOFF: return R_GOT_PC; case R_X86_64_GOTOFF64: @@ -725,6 +726,7 @@ int64_t X86_64::getImplicitAddend(const uint8_t *buf, RelType type) const { case R_X86_64_GOTPCREL: case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: + case R_X86_64_REX2_GOTPCRELX: case R_X86_64_PC32: case R_X86_64_GOTTPOFF: case R_X86_64_PLT32: @@ -808,6 +810,7 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { break; case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: + case R_X86_64_REX2_GOTPCRELX: if (rel.expr != R_GOT_PC) { relaxGot(loc, rel, val); } else { @@ -859,12 +862,13 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend, const uint8_t *loc) const { - // Only R_X86_64_[REX_]GOTPCRELX can be relaxed. GNU as may emit GOTPCRELX - // with addend != -4. Such an instruction does not load the full GOT entry, so - // we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip), %rax - // (addend=0) loads the high 32 bits of the GOT entry. + // Only R_X86_64_[REX_]|[REX2_]GOTPCRELX can be relaxed. GNU as may emit + // GOTPCRELX with addend != -4. Such an instruction does not load the full GOT + // entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip), + // %rax (addend=0) loads the high 32 bits of the GOT entry. 
if (!ctx.arg.relax || addend != -4 || - (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX)) + (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX && + type != R_X86_64_REX2_GOTPCRELX)) return R_GOT_PC; const uint8_t op = loc[-2]; const uint8_t modRm = loc[-1]; @@ -880,7 +884,7 @@ RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend, if (op == 0xff && (modRm == 0x15 || modRm == 0x25)) return R_RELAX_GOT_PC; - // We don't support test/binop instructions without a REX prefix. + // We don't support test/binop instructions without a REX/REX2 prefix. if (type == R_X86_64_GOTPCRELX) return R_GOT_PC; @@ -894,8 +898,8 @@ RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend, // "Intel 64 and IA-32 Architectures Software Developer's Manual V2" // (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/ // 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf) -static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, - uint8_t modRm) { +static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, uint8_t modRm, + bool isRex2) { const uint8_t rex = loc[-3]; // Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg". if (op == 0x85) { @@ -921,7 +925,7 @@ static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, // See "TEST-Logical Compare" (4-428 Vol. 2B). loc[-2] = 0xf7; - // Move R bit to the B bit in REX byte. + // Move R bit to the B bit in REX/REX2 byte. // REX byte is encoded as 0100WRXB, where // 0100 is 4bit fixed pattern. // REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the @@ -932,7 +936,17 @@ static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, // REX.B This 1-bit value is an extension to the MODRM.rm field or the // SIB.base field. // See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A). 
- loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2; + // + // REX2 prefix is encoded as 0xd5|M|R2|X2|B2|WRXB, where + // 0xd5 is 1byte fixed pattern. + // REX2's [W,R,X,B] have the same meanings as REX's. + // REX2.M encodes the map id. + // R2/X2/B2 provides the fifth and most siginicant bits of the R/X/B + // register identifiers, each of which can now address all 32 GPRs. + if (isRex2) + loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2; + else + loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2; write32le(loc, val); return; } @@ -953,7 +967,10 @@ static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, // "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for // descriptions about each operation. loc[-2] = 0x81; - loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2; + if (isRex2) + loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2; + else + loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2; write32le(loc, val); } @@ -974,7 +991,7 @@ static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) { // We are relaxing a rip relative to an absolute, so compensate // for the old -4 addend. 
assert(!ctx.arg.isPic); - relaxGotNoPic(loc, val + 4, op, modRm); + relaxGotNoPic(loc, val + 4, op, modRm, rel.type == R_X86_64_REX2_GOTPCRELX); return; } @@ -1059,7 +1076,7 @@ void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { namespace { class IntelIBT : public X86_64 { public: - IntelIBT(); + IntelIBT(Ctx &ctx) : X86_64(ctx) { pltHeaderSize = 0; }; void writeGotPlt(uint8_t *buf, const Symbol &s) const override; void writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const override; @@ -1069,8 +1086,6 @@ class IntelIBT : public X86_64 { }; } // namespace -IntelIBT::IntelIBT() { pltHeaderSize = 0; } - void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const { uint64_t va = ctx.in.ibtPlt->getVA() + IBTPltHeaderSize + s.getPltIdx() * pltEntrySize; @@ -1119,7 +1134,7 @@ void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const { namespace { class Retpoline : public X86_64 { public: - Retpoline(); + Retpoline(Ctx &); void writeGotPlt(uint8_t *buf, const Symbol &s) const override; void writePltHeader(uint8_t *buf) const override; void writePlt(uint8_t *buf, const Symbol &sym, @@ -1128,7 +1143,7 @@ class Retpoline : public X86_64 { class RetpolineZNow : public X86_64 { public: - RetpolineZNow(); + RetpolineZNow(Ctx &); void writeGotPlt(uint8_t *buf, const Symbol &s) const override {} void writePltHeader(uint8_t *buf) const override; void writePlt(uint8_t *buf, const Symbol &sym, @@ -1136,7 +1151,7 @@ class RetpolineZNow : public X86_64 { }; } // namespace -Retpoline::Retpoline() { +Retpoline::Retpoline(Ctx &ctx) : X86_64(ctx) { pltHeaderSize = 48; pltEntrySize = 32; ipltEntrySize = 32; @@ -1189,7 +1204,7 @@ void Retpoline::writePlt(uint8_t *buf, const Symbol &sym, write32le(buf + 23, -off - 27); } -RetpolineZNow::RetpolineZNow() { +RetpolineZNow::RetpolineZNow(Ctx &ctx) : X86_64(ctx) { pltHeaderSize = 32; pltEntrySize = 16; ipltEntrySize = 16; @@ -1224,23 +1239,21 @@ void RetpolineZNow::writePlt(uint8_t *buf, const 
Symbol &sym, write32le(buf + 8, ctx.in.plt->getVA() - pltEntryAddr - 12); } -static TargetInfo *getTargetInfo() { +TargetInfo *elf::getX86_64TargetInfo(Ctx &ctx) { if (ctx.arg.zRetpolineplt) { if (ctx.arg.zNow) { - static RetpolineZNow t; + static RetpolineZNow t(ctx); return &t; } - static Retpoline t; + static Retpoline t(ctx); return &t; } if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) { - static IntelIBT t; + static IntelIBT t(ctx); return &t; } - static X86_64 t; + static X86_64 t(ctx); return &t; } - -TargetInfo *elf::getX86_64TargetInfo() { return getTargetInfo(); } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 80a45bc4b6379..1d113375143a5 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -173,9 +173,9 @@ class LinkerDriver { std::unique_ptr lto; std::vector files; - InputFile *armCmseImpLib = nullptr; public: + InputFile *armCmseImpLib = nullptr; SmallVector, 0> archiveFiles; }; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 8f34b156c9c4e..94cd060b697d2 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -109,7 +109,7 @@ void Ctx::reset() { in.reset(); sym = ElfSym{}; - symtab = std::make_unique(); + symtab = std::make_unique(*this); memoryBuffers.clear(); objectFiles.clear(); @@ -167,7 +167,7 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, LinkerScript script(ctx); ctx.script = &script; ctx.symAux.emplace_back(); - ctx.symtab = std::make_unique(); + ctx.symtab = std::make_unique(ctx); ctx.partitions.clear(); ctx.partitions.emplace_back(); @@ -2875,7 +2875,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { for (StringRef name : ctx.arg.undefined) ctx.symtab->addUnusedUndefined(name)->referenced = true; - parseFiles(files, armCmseImpLib); + parseFiles(ctx, files); // Create dynamic sections for dynamic linking and static PIE. 
ctx.arg.hasDynSymTab = !ctx.sharedFiles.empty() || ctx.arg.isPic; @@ -3052,7 +3052,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { excludeLibs(ctx, args); // Record [__acle_se_, ] pairs for later processing. - processArmCmseSymbols(); + processArmCmseSymbols(ctx); // Apply symbol renames for --wrap and combine foo@v1 and foo@@v1. redirectSymbols(ctx, wrapped); @@ -3122,7 +3122,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // The Target instance handles target-specific stuff, such as applying // relocations or writing a PLT section. It also contains target-dependent // values such as a default image base address. - ctx.target = getTarget(); + ctx.target = getTarget(ctx); ctx.arg.eflags = ctx.target->calcEFlags(); // maxPageSize (sometimes called abi page size) is the maximum page size that @@ -3144,10 +3144,10 @@ template void LinkerDriver::link(opt::InputArgList &args) { ctx.inputSections.push_back(createCommentSection()); // Split SHF_MERGE and .eh_frame sections into pieces in preparation for garbage collection. - splitSections(); + splitSections(ctx); // Garbage collection and removal of shared symbols from unused shared objects. - markLive(); + markLive(ctx); // Make copies of any input sections that need to be copied into each // partition. @@ -3160,17 +3160,17 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Create synthesized sections such as .got and .plt. This is called before // processSectionCommands() so that they can be placed by SECTIONS commands. - createSyntheticSections(); + createSyntheticSections(ctx); // Some input sections that are used for exception handling need to be moved // into synthetic sections. Do that now so that they aren't assigned to // output sections in the usual way. if (!ctx.arg.relocatable) - combineEhSections(); + combineEhSections(ctx); // Merge .riscv.attributes sections. 
if (ctx.arg.emachine == EM_RISCV) - mergeRISCVAttributesSections(); + mergeRISCVAttributesSections(ctx); { llvm::TimeTraceScope timeScope("Assign sections"); @@ -3201,7 +3201,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // ICF runs after processSectionCommands() so that we know the output sections. if (ctx.arg.icf != ICFLevel::None) { findKeepUniqueSections(ctx, args); - doIcf(); + doIcf(ctx); } // Read the callgraph now that we know what was gced or icfed diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 3f4f479785fd9..09582b8af8248 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -97,6 +97,7 @@ using namespace lld::elf; namespace { template class ICF { public: + ICF(Ctx &ctx) : ctx(ctx) {} void run(); private: @@ -120,6 +121,7 @@ template class ICF { void forEachClass(llvm::function_ref fn); + Ctx &ctx; SmallVector sections; // We repeat the main loop while `Repeat` is true. @@ -457,7 +459,7 @@ static void combineRelocHashes(unsigned cnt, InputSection *isec, isec->eqClass[(cnt + 1) % 2] = hash | (1U << 31); } -static void print(const Twine &s) { +static void print(Ctx &ctx, const Twine &s) { if (ctx.arg.printIcfSections) message(s); } @@ -546,9 +548,9 @@ template void ICF::run() { forEachClassRange(0, sections.size(), [&](size_t begin, size_t end) { if (end - begin == 1) return; - print("selected section " + toString(sections[begin])); + print(ctx, "selected section " + toString(sections[begin])); for (size_t i = begin + 1; i < end; ++i) { - print(" removing identical section " + toString(sections[i])); + print(ctx, " removing identical section " + toString(sections[i])); sections[begin]->replace(sections[i]); // At this point we know sections merged are fully identical and hence @@ -586,12 +588,12 @@ template void ICF::run() { } // ICF entry point function. 
-template void elf::doIcf() { +template void elf::doIcf(Ctx &ctx) { llvm::TimeTraceScope timeScope("ICF"); - ICF().run(); + ICF(ctx).run(); } -template void elf::doIcf(); -template void elf::doIcf(); -template void elf::doIcf(); -template void elf::doIcf(); +template void elf::doIcf(Ctx &); +template void elf::doIcf(Ctx &); +template void elf::doIcf(Ctx &); +template void elf::doIcf(Ctx &); diff --git a/lld/ELF/ICF.h b/lld/ELF/ICF.h index 3246cc33f43c9..b126c889ea863 100644 --- a/lld/ELF/ICF.h +++ b/lld/ELF/ICF.h @@ -10,9 +10,9 @@ #define LLD_ELF_ICF_H namespace lld::elf { +struct Ctx; -template void doIcf(); - +template void doIcf(Ctx &); } #endif diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 8dc6811045b3c..7265ed56e957f 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -296,7 +296,7 @@ static bool isCompatible(InputFile *file) { return false; } -template static void doParseFile(InputFile *file) { +template static void doParseFile(Ctx &ctx, InputFile *file) { if (!isCompatible(file)) return; @@ -329,7 +329,9 @@ template static void doParseFile(InputFile *file) { } // Add symbols in File to the symbol table. -void elf::parseFile(InputFile *file) { invokeELFT(doParseFile, file); } +void elf::parseFile(Ctx &ctx, InputFile *file) { + invokeELFT(doParseFile, ctx, file); +} // This function is explicitly instantiated in ARM.cpp. Mark it extern here, // to avoid warnings when building with MSVC. @@ -339,23 +341,21 @@ extern template void ObjFile::importCmseSymbols(); extern template void ObjFile::importCmseSymbols(); template -static void doParseFiles(const std::vector &files, - InputFile *armCmseImpLib) { +static void doParseFiles(Ctx &ctx, const std::vector &files) { // Add all files to the symbol table. This will add almost all symbols that we // need to the symbol table. This process might add files to the link due to // addDependentLibrary. 
for (size_t i = 0; i < files.size(); ++i) { llvm::TimeTraceScope timeScope("Parse input files", files[i]->getName()); - doParseFile(files[i]); + doParseFile(ctx, files[i]); } - if (armCmseImpLib) - cast>(*armCmseImpLib).importCmseSymbols(); + if (ctx.driver.armCmseImpLib) + cast>(*ctx.driver.armCmseImpLib).importCmseSymbols(); } -void elf::parseFiles(const std::vector &files, - InputFile *armCmseImpLib) { +void elf::parseFiles(Ctx &ctx, const std::vector &files) { llvm::TimeTraceScope timeScope("Parse input files"); - invokeELFT(doParseFiles, files, armCmseImpLib); + invokeELFT(doParseFiles, ctx, files); } // Concatenates arguments to construct a string representing an error location. diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 4e777761e143b..730a4d8855c6b 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -43,9 +43,8 @@ class Symbol; std::optional readFile(StringRef path); // Add symbols in File to the symbol table. -void parseFile(InputFile *file); -void parseFiles(const std::vector &files, - InputFile *armCmseImpLib); +void parseFile(Ctx &, InputFile *file); +void parseFiles(Ctx &, const std::vector &files); // The root class of input files. class InputFile { diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index d5d9576c79eb5..6b4b0716b9ccb 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -281,7 +281,7 @@ void BitcodeCompiler::add(BitcodeFile &f) { // If LazyObjFile has not been added to link, emit empty index files. // This is needed because this is what GNU gold plugin does and we have a // distributed build system that depends on that behavior. 
-static void thinLTOCreateEmptyIndexFiles() { +static void thinLTOCreateEmptyIndexFiles(Ctx &ctx) { DenseSet linkedBitCodeFiles; for (BitcodeFile *f : ctx.bitcodeFiles) linkedBitCodeFiles.insert(f->getName()); @@ -345,7 +345,7 @@ std::vector BitcodeCompiler::compile() { } if (ctx.arg.thinLTOEmitIndexFiles) - thinLTOCreateEmptyIndexFiles(); + thinLTOCreateEmptyIndexFiles(ctx); if (ctx.arg.thinLTOIndexOnly) { if (!ctx.arg.ltoObjPath.empty()) diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 1ff33366c7897..cce584ae4d867 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -1343,7 +1343,7 @@ void LinkerScript::adjustOutputSections() { if (isEmpty) { sec->flags = flags & ((sec->nonAlloc ? 0 : (uint64_t)SHF_ALLOC) | SHF_WRITE); - sec->sortRank = getSectionRank(*sec); + sec->sortRank = getSectionRank(ctx, *sec); } // The code below may remove empty output sections. We should save the diff --git a/lld/ELF/MapFile.cpp b/lld/ELF/MapFile.cpp index 17c694d410a6d..3495cdb0bc666 100644 --- a/lld/ELF/MapFile.cpp +++ b/lld/ELF/MapFile.cpp @@ -44,7 +44,7 @@ static constexpr char indent8[] = " "; // 8 spaces static constexpr char indent16[] = " "; // 16 spaces // Print out the first three columns of a line. -static void writeHeader(raw_ostream &os, uint64_t vma, uint64_t lma, +static void writeHeader(Ctx &ctx, raw_ostream &os, uint64_t vma, uint64_t lma, uint64_t size, uint64_t align) { if (ctx.arg.is64) os << format("%16llx %16llx %8llx %5lld ", vma, lma, size, align); @@ -90,14 +90,14 @@ static SymbolMapTy getSectionSyms(ArrayRef syms) { // Demangling symbols (which is what toString() does) is slow, so // we do that in batch using parallel-for. 
static DenseMap -getSymbolStrings(ArrayRef syms) { +getSymbolStrings(Ctx &ctx, ArrayRef syms) { auto strs = std::make_unique(syms.size()); parallelFor(0, syms.size(), [&](size_t i) { raw_string_ostream os(strs[i]); OutputSection *osec = syms[i]->getOutputSection(); uint64_t vma = syms[i]->getVA(); uint64_t lma = osec ? osec->getLMA() + vma - osec->getVA(0) : 0; - writeHeader(os, vma, lma, syms[i]->getSize(), 1); + writeHeader(ctx, os, vma, lma, syms[i]->getSize(), 1); os << indent16 << toString(*syms[i]); }); @@ -113,7 +113,7 @@ getSymbolStrings(ArrayRef syms) { // .eh_frame tend to contain a lot of section pieces that are contiguous // both in input file and output file. Such pieces are squashed before // being displayed to make output compact. -static void printEhFrame(raw_ostream &os, const EhFrameSection *sec) { +static void printEhFrame(Ctx &ctx, raw_ostream &os, const EhFrameSection *sec) { std::vector pieces; auto add = [&](const EhSectionPiece &p) { @@ -139,18 +139,18 @@ static void printEhFrame(raw_ostream &os, const EhFrameSection *sec) { // Print out section pieces. const OutputSection *osec = sec->getOutputSection(); for (EhSectionPiece &p : pieces) { - writeHeader(os, osec->addr + p.outputOff, osec->getLMA() + p.outputOff, + writeHeader(ctx, os, osec->addr + p.outputOff, osec->getLMA() + p.outputOff, p.size, 1); os << indent8 << toString(p.sec->file) << ":(" << p.sec->name << "+0x" << Twine::utohexstr(p.inputOff) + ")\n"; } } -static void writeMapFile(raw_fd_ostream &os) { +static void writeMapFile(Ctx &ctx, raw_fd_ostream &os) { // Collect symbol info that we want to print out. std::vector syms = getSymbols(); SymbolMapTy sectionSyms = getSectionSyms(syms); - DenseMap symStr = getSymbolStrings(syms); + DenseMap symStr = getSymbolStrings(ctx, syms); // Print out the header line. int w = ctx.arg.is64 ? 16 : 8; @@ -163,7 +163,7 @@ static void writeMapFile(raw_fd_ostream &os) { if (assign->provide && !assign->sym) continue; uint64_t lma = osec ? 
osec->getLMA() + assign->addr - osec->getVA(0) : 0; - writeHeader(os, assign->addr, lma, assign->size, 1); + writeHeader(ctx, os, assign->addr, lma, assign->size, 1); os << assign->commandString << '\n'; continue; } @@ -171,7 +171,8 @@ static void writeMapFile(raw_fd_ostream &os) { continue; osec = &cast(cmd)->osec; - writeHeader(os, osec->addr, osec->getLMA(), osec->size, osec->addralign); + writeHeader(ctx, os, osec->addr, osec->getLMA(), osec->size, + osec->addralign); os << osec->name << '\n'; // Dump symbols for each input section. @@ -179,11 +180,11 @@ static void writeMapFile(raw_fd_ostream &os) { if (auto *isd = dyn_cast(subCmd)) { for (InputSection *isec : isd->sections) { if (auto *ehSec = dyn_cast(isec)) { - printEhFrame(os, ehSec); + printEhFrame(ctx, os, ehSec); continue; } - writeHeader(os, isec->getVA(), osec->getLMA() + isec->outSecOff, + writeHeader(ctx, os, isec->getVA(), osec->getLMA() + isec->outSecOff, isec->getSize(), isec->addralign); os << indent8 << toString(isec) << '\n'; for (Symbol *sym : llvm::make_first_range(sectionSyms[isec])) @@ -193,7 +194,7 @@ static void writeMapFile(raw_fd_ostream &os) { } if (auto *data = dyn_cast(subCmd)) { - writeHeader(os, osec->addr + data->offset, + writeHeader(ctx, os, osec->addr + data->offset, osec->getLMA() + data->offset, data->size, 1); os << indent8 << data->commandString << '\n'; continue; @@ -202,7 +203,7 @@ static void writeMapFile(raw_fd_ostream &os) { if (auto *assign = dyn_cast(subCmd)) { if (assign->provide && !assign->sym) continue; - writeHeader(os, assign->addr, + writeHeader(ctx, os, assign->addr, osec->getLMA() + assign->addr - osec->getVA(0), assign->size, 1); os << indent8 << assign->commandString << '\n'; @@ -223,7 +224,7 @@ static void writeMapFile(raw_fd_ostream &os) { // // In this case, strlen is defined by libc.so.6 and used by other two // files. -static void writeCref(raw_fd_ostream &os) { +static void writeCref(Ctx &ctx, raw_fd_ostream &os) { // Collect symbols and files. 
MapVector> map; for (ELFFileBase *file : ctx.objectFiles) { @@ -256,7 +257,7 @@ static void writeCref(raw_fd_ostream &os) { } } -void elf::writeMapAndCref() { +void elf::writeMapAndCref(Ctx &ctx) { if (ctx.arg.mapFile.empty() && !ctx.arg.cref) return; @@ -272,7 +273,7 @@ void elf::writeMapAndCref() { } if (!ctx.arg.mapFile.empty()) - writeMapFile(os); + writeMapFile(ctx, os); if (ctx.arg.cref) - writeCref(os); + writeCref(ctx, os); } diff --git a/lld/ELF/MapFile.h b/lld/ELF/MapFile.h index b271f627df570..c4efd33a3095c 100644 --- a/lld/ELF/MapFile.h +++ b/lld/ELF/MapFile.h @@ -10,7 +10,8 @@ #define LLD_ELF_MAPFILE_H namespace lld::elf { -void writeMapAndCref(); +struct Ctx; +void writeMapAndCref(Ctx &); } #endif diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index b9a4e392a507a..23e2359491370 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -44,7 +44,7 @@ using namespace lld::elf; namespace { template class MarkLive { public: - MarkLive(unsigned partition) : partition(partition) {} + MarkLive(Ctx &ctx, unsigned partition) : ctx(ctx), partition(partition) {} void run(); void moveToMain(); @@ -60,6 +60,7 @@ template class MarkLive { template void scanEhFrameSection(EhInputSection &eh, ArrayRef rels); + Ctx &ctx; // The index of the partition that we are currently processing. unsigned partition; @@ -73,21 +74,21 @@ template class MarkLive { } // namespace template -static uint64_t getAddend(InputSectionBase &sec, +static uint64_t getAddend(Ctx &ctx, InputSectionBase &sec, const typename ELFT::Rel &rel) { return ctx.target->getImplicitAddend(sec.content().begin() + rel.r_offset, rel.getType(ctx.arg.isMips64EL)); } template -static uint64_t getAddend(InputSectionBase &sec, +static uint64_t getAddend(Ctx &, InputSectionBase &sec, const typename ELFT::Rela &rel) { return rel.r_addend; } // Currently, we assume all input CREL relocations have an explicit addend. 
template -static uint64_t getAddend(InputSectionBase &sec, +static uint64_t getAddend(Ctx &, InputSectionBase &sec, const typename ELFT::Crel &rel) { return rel.r_addend; } @@ -107,7 +108,7 @@ void MarkLive::resolveReloc(InputSectionBase &sec, RelTy &rel, uint64_t offset = d->value; if (d->isSection()) - offset += getAddend(sec, rel); + offset += getAddend(ctx, sec, rel); // fromFDE being true means this is referenced by a FDE in a .eh_frame // piece. The relocation points to the described function or to a LSDA. We @@ -361,7 +362,7 @@ template void MarkLive::moveToMain() { // Before calling this function, Live bits are off for all // input sections. This function make some or all of them on // so that they are emitted to the output file. -template void elf::markLive() { +template void elf::markLive(Ctx &ctx) { llvm::TimeTraceScope timeScope("markLive"); // If --gc-sections is not given, retain all input sections. if (!ctx.arg.gcSections) { @@ -378,13 +379,13 @@ template void elf::markLive() { // Follow the graph to mark all live sections. for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i) - MarkLive(i).run(); + MarkLive(ctx, i).run(); // If we have multiple partitions, some sections need to live in the main // partition even if they were allocated to a loadable partition. Move them // there now. if (ctx.partitions.size() != 1) - MarkLive(1).moveToMain(); + MarkLive(ctx, 1).moveToMain(); // Report garbage-collected sections. 
if (ctx.arg.printGcSections) @@ -393,7 +394,7 @@ template void elf::markLive() { message("removing unused section " + toString(sec)); } -template void elf::markLive(); -template void elf::markLive(); -template void elf::markLive(); -template void elf::markLive(); +template void elf::markLive(Ctx &); +template void elf::markLive(Ctx &); +template void elf::markLive(Ctx &); +template void elf::markLive(Ctx &); diff --git a/lld/ELF/MarkLive.h b/lld/ELF/MarkLive.h index ef62fdf964e4b..a614646e25c99 100644 --- a/lld/ELF/MarkLive.h +++ b/lld/ELF/MarkLive.h @@ -10,9 +10,9 @@ #define LLD_ELF_MARKLIVE_H namespace lld::elf { +struct Ctx; -template void markLive(); - +template void markLive(Ctx &); } #endif // LLD_ELF_MARKLIVE_H diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index e0181f0809cc5..3d4de56b6dfb3 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -65,7 +65,8 @@ using namespace llvm::support::endian; using namespace lld; using namespace lld::elf; -static std::optional getLinkerScriptLocation(const Symbol &sym) { +static std::optional getLinkerScriptLocation(Ctx &ctx, + const Symbol &sym) { for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *assign = dyn_cast(cmd)) if (assign->sym == &sym) @@ -73,11 +74,11 @@ static std::optional getLinkerScriptLocation(const Symbol &sym) { return std::nullopt; } -static std::string getDefinedLocation(const Symbol &sym) { +static std::string getDefinedLocation(Ctx &ctx, const Symbol &sym) { const char msg[] = "\n>>> defined in "; if (sym.file) return msg + toString(sym.file); - if (std::optional loc = getLinkerScriptLocation(sym)) + if (std::optional loc = getLinkerScriptLocation(ctx, sym)) return msg + *loc; return ""; } @@ -87,18 +88,18 @@ static std::string getDefinedLocation(const Symbol &sym) { // >>> defined in /home/alice/src/foo.o // >>> referenced by bar.c:12 (/home/alice/src/bar.c:12) // >>> /home/alice/src/bar.o:(.text+0x1) -static std::string getLocation(InputSectionBase 
&s, const Symbol &sym, +static std::string getLocation(Ctx &ctx, InputSectionBase &s, const Symbol &sym, uint64_t off) { - std::string msg = getDefinedLocation(sym) + "\n>>> referenced by "; + std::string msg = getDefinedLocation(ctx, sym) + "\n>>> referenced by "; std::string src = s.getSrcMsg(sym, off); if (!src.empty()) msg += src + "\n>>> "; return msg + s.getObjMsg(off); } -void elf::reportRangeError(uint8_t *loc, const Relocation &rel, const Twine &v, - int64_t min, uint64_t max) { - ErrorPlace errPlace = getErrorPlace(loc); +void elf::reportRangeError(Ctx &, uint8_t *loc, const Relocation &rel, + const Twine &v, int64_t min, uint64_t max) { + ErrorPlace errPlace = getErrorPlace(ctx, loc); std::string hint; if (rel.sym) { if (!rel.sym->isSection()) @@ -116,7 +117,7 @@ void elf::reportRangeError(uint8_t *loc, const Relocation &rel, const Twine &v, if (!errPlace.srcLoc.empty()) hint += "\n>>> referenced by " + errPlace.srcLoc; if (rel.sym && !rel.sym->isSection()) - hint += getDefinedLocation(*rel.sym); + hint += getDefinedLocation(ctx, *rel.sym); if (errPlace.isec && errPlace.isec->name.starts_with(".debug")) hint += "; consider recompiling with -fdebug-types-section to reduce size " @@ -127,13 +128,13 @@ void elf::reportRangeError(uint8_t *loc, const Relocation &rel, const Twine &v, ", " + Twine(max).str() + "]" + hint); } -void elf::reportRangeError(uint8_t *loc, int64_t v, int n, const Symbol &sym, - const Twine &msg) { - ErrorPlace errPlace = getErrorPlace(loc); +void elf::reportRangeError(Ctx &ctx, uint8_t *loc, int64_t v, int n, + const Symbol &sym, const Twine &msg) { + ErrorPlace errPlace = getErrorPlace(ctx, loc); std::string hint; if (!sym.getName().empty()) - hint = - "; references '" + lld::toString(sym) + '\'' + getDefinedLocation(sym); + hint = "; references '" + lld::toString(sym) + '\'' + + getDefinedLocation(ctx, sym); errorOrWarn(errPlace.loc + msg + " is out of range: " + Twine(v) + " is not in [" + Twine(llvm::minIntN(n)) + ", " + 
Twine(llvm::maxIntN(n)) + "]" + hint); @@ -284,7 +285,7 @@ template static bool isReadOnly(SharedSymbol &ss) { // them are copied by a copy relocation, all of them need to be copied. // Otherwise, they would refer to different places at runtime. template -static SmallSet getSymbolsAt(SharedSymbol &ss) { +static SmallSet getSymbolsAt(Ctx &ctx, SharedSymbol &ss) { using Elf_Sym = typename ELFT::Sym; const auto &file = cast(*ss.file); @@ -371,7 +372,7 @@ static void replaceWithDefined(Symbol &sym, SectionBase &sec, uint64_t value, // to the variable in .bss. This kind of issue is sometimes very hard to // debug. What's a solution? Instead of exporting a variable V from a DSO, // define an accessor getV(). -template static void addCopyRelSymbol(SharedSymbol &ss) { +template static void addCopyRelSymbol(Ctx &ctx, SharedSymbol &ss) { // Copy relocation against zero-sized symbol doesn't make sense. uint64_t symSize = ss.getSize(); if (symSize == 0 || ss.alignment == 0) @@ -396,7 +397,7 @@ template static void addCopyRelSymbol(SharedSymbol &ss) { // Look through the DSO's dynamic symbol table for aliases and create a // dynamic symbol for each one. This causes the copy relocation to correctly // interpose any aliases. - for (SharedSymbol *sym : getSymbolsAt(ss)) + for (SharedSymbol *sym : getSymbolsAt(ctx, ss)) replaceWithDefined(*sym, *sec, 0, sym->size); ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->copyRel, *sec, 0, ss); @@ -459,10 +460,12 @@ class OffsetGetter { // InputSectionBase. 
class RelocationScanner { public: + RelocationScanner(Ctx &ctx) : ctx(ctx) {} template void scanSection(InputSectionBase &s, bool isEH = false); private: + Ctx &ctx; InputSectionBase *sec; OffsetGetter getter; @@ -476,6 +479,9 @@ class RelocationScanner { uint64_t relOff) const; void processAux(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, int64_t addend) const; + unsigned handleTlsRelocation(RelExpr expr, RelType type, uint64_t offset, + Symbol &sym, int64_t addend); + template void scanOne(typename Relocs::const_iterator &i); template void scan(Relocs rels); @@ -520,7 +526,7 @@ int64_t RelocationScanner::computeMipsAddend(const RelTy &rel, RelExpr expr, // Custom error message if Sym is defined in a discarded section. template -static std::string maybeReportDiscarded(Undefined &sym) { +static std::string maybeReportDiscarded(Ctx &ctx, Undefined &sym) { auto *file = dyn_cast_or_null>(sym.file); if (!file || !sym.discardedSecIdx) return ""; @@ -708,7 +714,7 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, return nullptr; } -static void reportUndefinedSymbol(const UndefinedDiag &undef, +static void reportUndefinedSymbol(Ctx &ctx, const UndefinedDiag &undef, bool correctSpelling) { Undefined &sym = *undef.sym; @@ -728,16 +734,16 @@ static void reportUndefinedSymbol(const UndefinedDiag &undef, std::string msg; switch (ctx.arg.ekind) { case ELF32LEKind: - msg = maybeReportDiscarded(sym); + msg = maybeReportDiscarded(ctx, sym); break; case ELF32BEKind: - msg = maybeReportDiscarded(sym); + msg = maybeReportDiscarded(ctx, sym); break; case ELF64LEKind: - msg = maybeReportDiscarded(sym); + msg = maybeReportDiscarded(ctx, sym); break; case ELF64BEKind: - msg = maybeReportDiscarded(sym); + msg = maybeReportDiscarded(ctx, sym); break; default: llvm_unreachable(""); @@ -795,7 +801,7 @@ static void reportUndefinedSymbol(const UndefinedDiag &undef, error(msg, ErrorTag::SymbolNotFound, {sym.getName()}); } -void elf::reportUndefinedSymbols() { +void 
elf::reportUndefinedSymbols(Ctx &ctx) { // Find the first "undefined symbol" diagnostic for each diagnostic, and // collect all "referenced from" lines at the first diagnostic. DenseMap firstRef; @@ -811,14 +817,14 @@ void elf::reportUndefinedSymbols() { // Enable spell corrector for the first 2 diagnostics. for (const auto &[i, undef] : llvm::enumerate(undefs)) if (!undef.locs.empty()) - reportUndefinedSymbol(undef, i < 2); + reportUndefinedSymbol(ctx, undef, i < 2); undefs.clear(); } // Report an undefined symbol if necessary. // Returns true if the undefined symbol will produce an error message. -static bool maybeReportUndefined(Undefined &sym, InputSectionBase &sec, - uint64_t offset) { +static bool maybeReportUndefined(Ctx &ctx, Undefined &sym, + InputSectionBase &sec, uint64_t offset) { std::lock_guard lock(relocMutex); // If versioned, issue an error (even if the symbol is weak) because we don't // know the defining filename which is required to construct a Verneed entry. @@ -921,7 +927,7 @@ static void addPltEntry(PltSection &plt, GotPltSection &gotPlt, sym, 0, R_ABS}); } -void elf::addGotEntry(Symbol &sym) { +void elf::addGotEntry(Ctx &ctx, Symbol &sym) { ctx.in.got->addEntry(sym); uint64_t off = sym.getGotOffset(); @@ -941,7 +947,7 @@ void elf::addGotEntry(Symbol &sym) { addRelativeReloc(*ctx.in.got, off, sym, 0, R_ABS, ctx.target->symbolicRel); } -static void addTpOffsetGotEntry(Symbol &sym) { +static void addTpOffsetGotEntry(Ctx &ctx, Symbol &sym) { ctx.in.got->addEntry(sym); uint64_t off = sym.getGotOffset(); if (!sym.isPreemptible && !ctx.arg.shared) { @@ -955,7 +961,7 @@ static void addTpOffsetGotEntry(Symbol &sym) { // Return true if we can define a symbol in the executable that // contains the value/function of a symbol defined in a shared // library. 
-static bool canDefineSymbolInExecutable(Symbol &sym) { +static bool canDefineSymbolInExecutable(Ctx &ctx, Symbol &sym) { // If the symbol has default visibility the symbol defined in the // executable will preempt it. // Note that we want the visibility of the shared symbol itself, not @@ -1036,7 +1042,7 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, return true; error("relocation " + toString(type) + " cannot refer to absolute symbol: " + - toString(sym) + getLocation(*sec, sym, relOff)); + toString(sym) + getLocation(ctx, *sec, sym, relOff)); return true; } @@ -1201,9 +1207,9 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // R_AARCH64_AUTH_ABS64. if (!ctx.arg.shared && sym.isShared() && !(ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64)) { - if (!canDefineSymbolInExecutable(sym)) { + if (!canDefineSymbolInExecutable(ctx, sym)) { errorOrWarn("cannot preempt symbol: " + toString(sym) + - getLocation(*sec, sym, offset)); + getLocation(ctx, *sec, sym, offset)); return; } @@ -1214,7 +1220,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, error("unresolvable relocation " + toString(type) + " against symbol '" + toString(*ss) + "'; recompile with -fPIC or remove '-z nocopyreloc'" + - getLocation(*sec, sym, offset)); + getLocation(ctx, *sec, sym, offset)); sym.setFlags(NEEDS_COPY); } sec->addReloc({expr, type, offset, addend, &sym}); @@ -1252,7 +1258,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, if (ctx.arg.pie && ctx.arg.emachine == EM_386) errorOrWarn("symbol '" + toString(sym) + "' cannot be preempted; recompile with -fPIE" + - getLocation(*sec, sym, offset)); + getLocation(ctx, *sec, sym, offset)); sym.setFlags(NEEDS_COPY | NEEDS_PLT); sec->addReloc({expr, type, offset, addend, &sym}); return; @@ -1262,7 +1268,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, 
errorOrWarn("relocation " + toString(type) + " cannot be used against " + (sym.getName().empty() ? "local symbol" : "symbol '" + toString(sym) + "'") + - "; recompile with -fPIC" + getLocation(*sec, sym, offset)); + "; recompile with -fPIC" + getLocation(ctx, *sec, sym, offset)); } // This function is similar to the `handleTlsRelocation`. MIPS does not @@ -1271,7 +1277,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // pollute other `handleTlsRelocation` by MIPS `ifs` statements. // Mips has a custom MipsGotSection that handles the writing of GOT entries // without dynamic relocations. -static unsigned handleMipsTlsRelocation(RelType type, Symbol &sym, +static unsigned handleMipsTlsRelocation(Ctx &ctx, RelType type, Symbol &sym, InputSectionBase &c, uint64_t offset, int64_t addend, RelExpr expr) { if (expr == R_MIPS_TLSLD) { @@ -1294,20 +1300,21 @@ static unsigned handleMipsTlsRelocation(RelType type, Symbol &sym, // symbol in TLS block. // // Returns the number of relocations processed. -static unsigned handleTlsRelocation(RelType type, Symbol &sym, - InputSectionBase &c, uint64_t offset, - int64_t addend, RelExpr expr) { +unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, + uint64_t offset, Symbol &sym, + int64_t addend) { if (expr == R_TPREL || expr == R_TPREL_NEG) { if (ctx.arg.shared) { errorOrWarn("relocation " + toString(type) + " against " + toString(sym) + - " cannot be used with -shared" + getLocation(c, sym, offset)); + " cannot be used with -shared" + + getLocation(ctx, *sec, sym, offset)); return 1; } return 0; } if (ctx.arg.emachine == EM_MIPS) - return handleMipsTlsRelocation(type, sym, c, offset, addend, expr); + return handleMipsTlsRelocation(ctx, type, sym, *sec, offset, addend, expr); // LoongArch does not yet implement transition from TLSDESC to LE/IE, so // generate TLSDESC dynamic relocation for the dynamic linker to handle. 
@@ -1316,7 +1323,7 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, R_TLSDESC_CALL>(expr)) { if (expr != R_TLSDESC_CALL) { sym.setFlags(NEEDS_TLSDESC); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); } return 1; } @@ -1331,7 +1338,7 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (expr != R_TLSDESC_CALL) { if (!isRISCV || type == R_RISCV_TLSDESC_HI20) sym.setFlags(NEEDS_TLSDESC); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); } return 1; } @@ -1345,10 +1352,11 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, !ctx.arg.shared && ctx.arg.emachine != EM_ARM && ctx.arg.emachine != EM_HEXAGON && ctx.arg.emachine != EM_LOONGARCH && !(isRISCV && expr != R_TLSDESC_PC && expr != R_TLSDESC_CALL) && - !c.file->ppc64DisableTLSRelax; + !sec->file->ppc64DisableTLSRelax; // If we are producing an executable and the symbol is non-preemptable, it - // must be defined and the code sequence can be optimized to use Local-Exec. + // must be defined and the code sequence can be optimized to use + // Local-Exec. // // ARM and RISC-V do not support any relaxations for TLS relocations, however, // we can omit the DTPMOD dynamic relocations and resolve them at link time @@ -1361,33 +1369,33 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // module index, with a special value of 0 for the current module. GOT[e1] is // unused. There only needs to be one module index entry. if (oneof(expr)) { - // Local-Dynamic relocs can be optimized to Local-Exec.
+ // Local-Dynamic relocs can be optimized to Local-Exec. if (execOptimize) { - c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, - offset, addend, &sym}); + sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), + type, offset, addend, &sym}); return ctx.target->getTlsGdRelaxSkip(type); } if (expr == R_TLSLD_HINT) return 1; ctx.needsTlsLd.store(true, std::memory_order_relaxed); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); return 1; } - // Local-Dynamic relocs can be optimized to Local-Exec. + // Local-Dynamic relocs can be optimized to Local-Exec. if (expr == R_DTPREL) { if (execOptimize) expr = ctx.target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); return 1; } // Local-Dynamic sequence where offset of tls variable relative to dynamic // thread pointer is stored in the got. This cannot be optimized to - // Local-Exec. + // Local-Exec. if (expr == R_TLSLD_GOT_OFF) { sym.setFlags(NEEDS_GOT_DTPREL); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); return 1; } @@ -1396,7 +1404,7 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, R_LOONGARCH_TLSGD_PAGE_PC>(expr)) { if (!execOptimize) { sym.setFlags(NEEDS_TLSGD); - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); return 1; } @@ -1405,14 +1413,14 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a non-preemptible // label, so TLSDESC=>IE will be categorized as R_RELAX_TLS_GD_TO_LE. We fix - // the categorization in RISCV::relocateAlloc.
+ // the categorization in RISCV::relocateAlloc. if (sym.isPreemptible) { sym.setFlags(NEEDS_TLSGD_TO_IE); - c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, - offset, addend, &sym}); + sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), + type, offset, addend, &sym}); } else { - c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_LE), type, - offset, addend, &sym}); + sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_LE), + type, offset, addend, &sym}); } return ctx.target->getTlsGdRelaxSkip(type); } @@ -1423,15 +1431,15 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // Initial-Exec relocs can be optimized to Local-Exec if the symbol is // locally defined. This is not supported on SystemZ. if (execOptimize && isLocalInExecutable && ctx.arg.emachine != EM_S390) { - c.addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); + sec->addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); } else if (expr != R_TLSIE_HINT) { sym.setFlags(NEEDS_TLSIE); // R_GOT needs a relative relocation for PIC on i386 and Hexagon. if (expr == R_GOT && ctx.arg.isPic && !ctx.target->usesOnlyLowPageBits(type)) - addRelativeReloc(c, offset, sym, addend, expr, type); + addRelativeReloc(*sec, offset, sym, addend, expr, type); else - c.addReloc({expr, type, offset, addend, &sym}); + sec->addReloc({expr, type, offset, addend, &sym}); } return 1; } @@ -1480,7 +1488,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { // Error if the target symbol is undefined. Symbol index 0 may be used by // marker relocations, e.g. R_*_NONE and R_ARM_V4BX. Don't error on them.
if (sym.isUndefined() && symIndex != 0 && - maybeReportUndefined(cast(sym), *sec, offset)) + maybeReportUndefined(ctx, cast(sym), *sec, offset)) return; if (ctx.arg.emachine == EM_PPC64) { @@ -1508,7 +1516,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { if (i == end) { errorOrWarn("R_PPC64_TLSGD/R_PPC64_TLSLD may not be the last " "relocation" + - getLocation(*sec, sym, offset)); + getLocation(ctx, *sec, sym, offset)); return; } } @@ -1539,7 +1547,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { // but we need to process them in handleTlsRelocation. if (sym.isTls() || oneof(expr)) { if (unsigned processed = - handleTlsRelocation(type, sym, *sec, offset, addend, expr)) { + handleTlsRelocation(expr, type, offset, sym, addend)) { i += processed - 1; return; } @@ -1635,7 +1643,7 @@ void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { scan(rels.relas); } -template void elf::scanRelocations() { +template void elf::scanRelocations(Ctx &ctx) { // Scan all relocations. Each relocation goes through a series of tests to // determine if it needs special treatment, such as creating GOT, PLT, // copy relocations, etc. 
Note that relocations for non-alloc sections are @@ -1649,8 +1657,8 @@ template void elf::scanRelocations() { parallel::TaskGroup tg; auto outerFn = [&]() { for (ELFFileBase *f : ctx.objectFiles) { - auto fn = [f]() { - RelocationScanner scanner; + auto fn = [f, &ctx]() { + RelocationScanner scanner(ctx); for (InputSectionBase *s : f->getSections()) { if (s && s->kind() == SectionBase::Regular && s->isLive() && (s->flags & SHF_ALLOC) && @@ -1663,8 +1671,8 @@ template void elf::scanRelocations() { else tg.spawn(fn); } - auto scanEH = [] { - RelocationScanner scanner; + auto scanEH = [&] { + RelocationScanner scanner(ctx); for (Partition &part : ctx.partitions) { for (EhInputSection *sec : part.ehFrame->sections) scanner.template scanSection(*sec, /*isEH=*/true); @@ -1687,7 +1695,7 @@ template void elf::scanRelocations() { outerFn(); } -static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { +static bool handleNonPreemptibleIfunc(Ctx &ctx, Symbol &sym, uint16_t flags) { // Handle a reference to a non-preemptible ifunc. These are special in a // few ways: // @@ -1763,7 +1771,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { d.type = STT_FUNC; if (flags & NEEDS_GOT) - addGotEntry(sym); + addGotEntry(ctx, sym); } else if (flags & NEEDS_GOT) { // Redirect GOT accesses to point to the Igot. 
sym.gotInIgot = true; @@ -1771,10 +1779,10 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { return true; } -void elf::postScanRelocations() { - auto fn = [](Symbol &sym) { +void elf::postScanRelocations(Ctx &ctx) { + auto fn = [&](Symbol &sym) { auto flags = sym.flags.load(std::memory_order_relaxed); - if (handleNonPreemptibleIfunc(sym, flags)) + if (handleNonPreemptibleIfunc(ctx, sym, flags)) return; if (sym.isTagged() && sym.isDefined()) @@ -1785,13 +1793,13 @@ void elf::postScanRelocations() { sym.allocateAux(); if (flags & NEEDS_GOT) - addGotEntry(sym); + addGotEntry(ctx, sym); if (flags & NEEDS_PLT) addPltEntry(*ctx.in.plt, *ctx.in.gotPlt, *ctx.in.relaPlt, ctx.target->pltRel, sym); if (flags & NEEDS_COPY) { if (sym.isObject()) { - invokeELFT(addCopyRelSymbol, cast(sym)); + invokeELFT(addCopyRelSymbol, ctx, cast(sym)); // NEEDS_COPY is cleared for sym and its aliases so that in // later iterations aliases won't cause redundant copies. assert(!sym.hasFlag(NEEDS_COPY)); @@ -1855,7 +1863,7 @@ void elf::postScanRelocations() { } if ((flags & NEEDS_TLSIE) && !(flags & NEEDS_TLSGD_TO_IE)) - addTpOffsetGotEntry(sym); + addTpOffsetGotEntry(ctx, sym); }; GotSection *got = ctx.in.got.get(); @@ -2045,7 +2053,7 @@ void ThunkCreator::mergeThunks(ArrayRef outputSections) { }); } -static int64_t getPCBias(RelType type) { +static int64_t getPCBias(Ctx &ctx, RelType type) { if (ctx.arg.emachine != EM_ARM) return 0; switch (type) { @@ -2067,7 +2075,7 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os, const Relocation &rel, uint64_t src) { // See the comment in getThunk for -pcBias below. - const int64_t pcBias = getPCBias(rel.type); + const int64_t pcBias = getPCBias(ctx, rel.type); for (std::pair tp : isd->thunkSections) { ThunkSection *ts = tp.first; uint64_t tsBase = os->addr + ts->outSecOff - pcBias; @@ -2228,7 +2236,7 @@ std::pair ThunkCreator::getThunk(InputSection *isec, // out in the relocation addend. 
We compensate for the PC bias so that // an Arm and Thumb relocation to the same destination get the same keyAddend, // which is usually 0. - const int64_t pcBias = getPCBias(rel.type); + const int64_t pcBias = getPCBias(ctx, rel.type); const int64_t keyAddend = rel.addend + pcBias; // We use a ((section, offset), addend) pair to find the thunk position if @@ -2252,7 +2260,7 @@ std::pair ThunkCreator::getThunk(InputSection *isec, return std::make_pair(t, false); // No existing compatible Thunk in range, create a new one - Thunk *t = addThunk(*isec, rel); + Thunk *t = addThunk(ctx, *isec, rel); thunkVec->push_back(t); return std::make_pair(t, true); } @@ -2350,7 +2358,7 @@ bool ThunkCreator::createThunks(uint32_t pass, // STT_SECTION + non-zero addend, clear the addend after // redirection. if (ctx.arg.emachine != EM_MIPS) - rel.addend = -getPCBias(rel.type); + rel.addend = -getPCBias(ctx, rel.type); } for (auto &p : isd->thunkSections) @@ -2383,13 +2391,13 @@ bool elf::hexagonNeedsTLSSymbol(ArrayRef outputSections) { return needTlsSymbol; } -void elf::hexagonTLSSymbolUpdate(ArrayRef outputSections) { +void elf::hexagonTLSSymbolUpdate(Ctx &ctx) { Symbol *sym = ctx.symtab->find("__tls_get_addr"); if (!sym) return; bool needEntry = true; forEachInputSectionDescription( - outputSections, [&](OutputSection *os, InputSectionDescription *isd) { + ctx.outputSections, [&](OutputSection *os, InputSectionDescription *isd) { for (InputSection *isec : isd->sections) for (Relocation &rel : isec->relocs()) if (rel.sym->type == llvm::ELF::STT_TLS && rel.expr == R_PLT_PC) { @@ -2411,8 +2419,8 @@ static bool matchesRefTo(const NoCrossRefCommand &cmd, StringRef osec) { } template -static void scanCrossRefs(const NoCrossRefCommand &cmd, OutputSection *osec, - InputSection *sec, Rels rels) { +static void scanCrossRefs(Ctx &ctx, const NoCrossRefCommand &cmd, + OutputSection *osec, InputSection *sec, Rels rels) { for (const auto &r : rels) { Symbol &sym = 
sec->file->getSymbol(r.getSymbol(ctx.arg.isMips64EL)); // A legal cross-reference is when the destination output section is @@ -2435,7 +2443,7 @@ static void scanCrossRefs(const NoCrossRefCommand &cmd, OutputSection *osec, // For each output section described by at least one NOCROSSREFS(_TO) command, // scan relocations from its input sections for prohibited cross references. -template void elf::checkNoCrossRefs() { +template void elf::checkNoCrossRefs(Ctx &ctx) { for (OutputSection *osec : ctx.outputSections) { for (const NoCrossRefCommand &noxref : ctx.script->noCrossRefs) { if (!llvm::is_contained(noxref.outputSections, osec->name) || @@ -2446,19 +2454,19 @@ template void elf::checkNoCrossRefs() { if (!isd) continue; parallelForEach(isd->sections, [&](InputSection *sec) { - invokeOnRelocs(*sec, scanCrossRefs, noxref, osec, sec); + invokeOnRelocs(*sec, scanCrossRefs, ctx, noxref, osec, sec); }); } } } } -template void elf::scanRelocations(); -template void elf::scanRelocations(); -template void elf::scanRelocations(); -template void elf::scanRelocations(); +template void elf::scanRelocations(Ctx &); +template void elf::scanRelocations(Ctx &); +template void elf::scanRelocations(Ctx &); +template void elf::scanRelocations(Ctx &); -template void elf::checkNoCrossRefs(); -template void elf::checkNoCrossRefs(); -template void elf::checkNoCrossRefs(); -template void elf::checkNoCrossRefs(); +template void elf::checkNoCrossRefs(Ctx &); +template void elf::checkNoCrossRefs(Ctx &); +template void elf::checkNoCrossRefs(Ctx &); +template void elf::checkNoCrossRefs(Ctx &); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index aaa4581490a28..4d349f68d33cc 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -16,6 +16,7 @@ #include namespace lld::elf { +struct Ctx; class Symbol; class InputSection; class InputSectionBase; @@ -141,13 +142,13 @@ struct JumpInstrMod { // This function writes undefined symbol diagnostics to an internal buffer. 
// Call reportUndefinedSymbols() after calling scanRelocations() to emit // the diagnostics. -template void scanRelocations(); -template void checkNoCrossRefs(); -void reportUndefinedSymbols(); -void postScanRelocations(); -void addGotEntry(Symbol &sym); +template void scanRelocations(Ctx &ctx); +template void checkNoCrossRefs(Ctx &ctx); +void reportUndefinedSymbols(Ctx &); +void postScanRelocations(Ctx &ctx); +void addGotEntry(Ctx &ctx, Symbol &sym); -void hexagonTLSSymbolUpdate(ArrayRef outputSections); +void hexagonTLSSymbolUpdate(Ctx &ctx); bool hexagonNeedsTLSSymbol(ArrayRef outputSections); class ThunkSection; @@ -156,6 +157,7 @@ class InputSectionDescription; class ThunkCreator { public: + ThunkCreator(Ctx &ctx) : ctx(ctx) {} // Return true if Thunks have been added to OutputSections bool createThunks(uint32_t pass, ArrayRef outputSections); @@ -178,6 +180,8 @@ class ThunkCreator { bool normalizeExistingThunk(Relocation &rel, uint64_t src); + Ctx &ctx; + // Record all the available Thunks for a (Symbol, addend) pair, where Symbol // is represented as a (section, offset) pair. There may be multiple // relocations sharing the same (section, offset + addend) pair. We may revert diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 74fa66e6d1182..db8ee8f4d7b3b 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -215,7 +215,7 @@ bool SymbolTable::assignExactVersion(SymbolVersion ver, uint16_t versionId, // Get a list of symbols which we need to assign the version to. 
SmallVector syms = findByVersion(ver); - auto getName = [](uint16_t ver) -> std::string { + auto getName = [&ctx = ctx](uint16_t ver) -> std::string { if (ver == VER_NDX_LOCAL) return "VER_NDX_LOCAL"; if (ver == VER_NDX_GLOBAL) diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h index c0bc73502bbe6..de00a01661551 100644 --- a/lld/ELF/SymbolTable.h +++ b/lld/ELF/SymbolTable.h @@ -15,7 +15,7 @@ #include "llvm/Support/Compiler.h" namespace lld::elf { - +struct Ctx; class InputFile; class SharedFile; @@ -38,6 +38,7 @@ struct ArmCmseEntryFunction { // is one add* function per symbol type. class SymbolTable { public: + SymbolTable(Ctx &ctx) : ctx(ctx) {} ArrayRef getSymbols() const { return symVector; } void wrap(Symbol *sym, Symbol *real, Symbol *wrap); @@ -91,6 +92,8 @@ class SymbolTable { void assignWildcardVersion(SymbolVersion ver, uint16_t versionId, bool includeNonDefault); + Ctx &ctx; + // Global symbols and a map from symbol name to the index. The order is not // defined. We can use an arbitrary order, but it has to be deterministic even // when cross linking. 
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 65283adf4e505..efdaf07c97514 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -250,7 +250,7 @@ void Symbol::parseSymbolVersion() { void Symbol::extract() const { if (file->lazy) { file->lazy = false; - parseFile(file); + parseFile(ctx, file); } } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index ce31c379ab182..8298420b8b01d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1568,7 +1568,7 @@ DynamicSection::computeContents() { } if (ctx.arg.emachine == EM_PPC64) - addInt(DT_PPC64_OPT, getPPC64TargetInfo()->ppc64DynamicSectionOpt); + addInt(DT_PPC64_OPT, getPPC64TargetInfo(ctx)->ppc64DynamicSectionOpt); addInt(DT_NULL, 0); return entries; @@ -3946,7 +3946,7 @@ void MergeNoTailSection::finalizeContents() { }); } -template void elf::splitSections() { +template void elf::splitSections(Ctx &ctx) { llvm::TimeTraceScope timeScope("Split sections"); // splitIntoPieces needs to be called on each MergeInputSection // before calling finalizeContents(). @@ -3962,7 +3962,7 @@ template void elf::splitSections() { }); } -void elf::combineEhSections() { +void elf::combineEhSections(Ctx &ctx) { llvm::TimeTraceScope timeScope("Combine EH sections"); for (EhInputSection *sec : ctx.ehInputSections) { EhFrameSection &eh = *sec->getPartition().ehFrame; @@ -4495,7 +4495,7 @@ void InStruct::reset() { symTabShndx.reset(); } -static bool needsInterpSection() { +static bool needsInterpSection(Ctx &ctx) { return !ctx.arg.relocatable && !ctx.arg.shared && !ctx.arg.dynamicLinker.empty() && ctx.script->needsInterpSection(); } @@ -4513,7 +4513,7 @@ bool elf::hasMemtag() { // that ifuncs use in fully static executables. 
bool elf::canHaveMemtagGlobals() { return hasMemtag() && - (ctx.arg.relocatable || ctx.arg.shared || needsInterpSection()); + (ctx.arg.relocatable || ctx.arg.shared || needsInterpSection(ctx)); } constexpr char kMemtagAndroidNoteName[] = "Android"; @@ -4652,11 +4652,11 @@ static Defined *addOptionalRegular(StringRef name, SectionBase *sec, return cast(s); } -template void elf::createSyntheticSections() { +template void elf::createSyntheticSections(Ctx &ctx) { // Add the .interp section first because it is not a SyntheticSection. // The removeUnusedSyntheticSections() function relies on the // SyntheticSections coming last. - if (needsInterpSection()) { + if (needsInterpSection(ctx)) { for (size_t i = 1; i <= ctx.partitions.size(); ++i) { InputSection *sec = createInterpSection(); sec->partition = i; @@ -4664,7 +4664,7 @@ template void elf::createSyntheticSections() { } } - auto add = [](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); }; + auto add = [&](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); }; if (ctx.arg.zSectionHeader) ctx.in.shStrTab = std::make_unique(".shstrtab", false); @@ -4927,10 +4927,10 @@ template void elf::createSyntheticSections() { add(*ctx.in.strTab); } -template void elf::splitSections(); -template void elf::splitSections(); -template void elf::splitSections(); -template void elf::splitSections(); +template void elf::splitSections(Ctx &); +template void elf::splitSections(Ctx &); +template void elf::splitSections(Ctx &); +template void elf::splitSections(Ctx &); template void EhFrameSection::iterateFDEWithLSDA( function_ref); @@ -4956,7 +4956,7 @@ template void elf::writePhdrs(uint8_t *Buf, Partition &Part); template void elf::writePhdrs(uint8_t *Buf, Partition &Part); template void elf::writePhdrs(uint8_t *Buf, Partition &Part); -template void elf::createSyntheticSections(); -template void elf::createSyntheticSections(); -template void elf::createSyntheticSections(); -template void elf::createSyntheticSections(); 
+template void elf::createSyntheticSections(Ctx &); +template void elf::createSyntheticSections(Ctx &); +template void elf::createSyntheticSections(Ctx &); +template void elf::createSyntheticSections(Ctx &); diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 34654a2c57846..759a908b202a9 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -1426,11 +1426,11 @@ class MemtagGlobalDescriptors final : public SyntheticSection { SmallVector symbols; }; -template void createSyntheticSections(); +template void createSyntheticSections(Ctx &); InputSection *createInterpSection(); MergeInputSection *createCommentSection(); -template void splitSections(); -void combineEhSections(); +template void splitSections(Ctx &); +void combineEhSections(Ctx &); bool hasMemtag(); bool canHaveMemtagGlobals(); diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index d895757ad4e49..d5d11b9549e03 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -45,56 +45,45 @@ std::string lld::toString(RelType type) { return std::string(s); } -TargetInfo *elf::getTarget() { +TargetInfo *elf::getTarget(Ctx &ctx) { switch (ctx.arg.emachine) { case EM_386: case EM_IAMCU: - return getX86TargetInfo(); + return getX86TargetInfo(ctx); case EM_AARCH64: - return getAArch64TargetInfo(); + return getAArch64TargetInfo(ctx); case EM_AMDGPU: - return getAMDGPUTargetInfo(); + return getAMDGPUTargetInfo(ctx); case EM_ARM: - return getARMTargetInfo(); + return getARMTargetInfo(ctx); case EM_AVR: - return getAVRTargetInfo(); + return getAVRTargetInfo(ctx); case EM_HEXAGON: - return getHexagonTargetInfo(); + return getHexagonTargetInfo(ctx); case EM_LOONGARCH: - return getLoongArchTargetInfo(); + return getLoongArchTargetInfo(ctx); case EM_MIPS: - switch (ctx.arg.ekind) { - case ELF32LEKind: - return getMipsTargetInfo(); - case ELF32BEKind: - return getMipsTargetInfo(); - case ELF64LEKind: - return getMipsTargetInfo(); - case ELF64BEKind: - return 
getMipsTargetInfo(); - default: - llvm_unreachable("unsupported MIPS target"); - } + return getMipsTargetInfo(ctx); case EM_MSP430: - return getMSP430TargetInfo(); + return getMSP430TargetInfo(ctx); case EM_PPC: - return getPPCTargetInfo(); + return getPPCTargetInfo(ctx); case EM_PPC64: - return getPPC64TargetInfo(); + return getPPC64TargetInfo(ctx); case EM_RISCV: - return getRISCVTargetInfo(); + return getRISCVTargetInfo(ctx); case EM_SPARCV9: - return getSPARCV9TargetInfo(); + return getSPARCV9TargetInfo(ctx); case EM_S390: - return getSystemZTargetInfo(); + return getSystemZTargetInfo(ctx); case EM_X86_64: - return getX86_64TargetInfo(); + return getX86_64TargetInfo(ctx); default: fatal("unsupported e_machine value: " + Twine(ctx.arg.emachine)); } } -ErrorPlace elf::getErrorPlace(const uint8_t *loc) { +ErrorPlace elf::getErrorPlace(Ctx &ctx, const uint8_t *loc) { assert(loc != nullptr); for (InputSectionBase *d : ctx.inputSections) { auto *isec = dyn_cast(d); diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index d51b30a81d629..16944688f3cee 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -29,6 +29,7 @@ class Symbol; class TargetInfo { public: + TargetInfo(Ctx &ctx) : ctx(ctx) {} virtual uint32_t calcEFlags() const { return 0; } virtual RelExpr getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const = 0; @@ -113,6 +114,7 @@ class TargetInfo { return false; } + Ctx &ctx; unsigned defaultCommonPageSize = 4096; unsigned defaultMaxPageSize = 4096; @@ -177,21 +179,21 @@ class TargetInfo { uint64_t defaultImageBase = 0x10000; }; -TargetInfo *getAArch64TargetInfo(); -TargetInfo *getAMDGPUTargetInfo(); -TargetInfo *getARMTargetInfo(); -TargetInfo *getAVRTargetInfo(); -TargetInfo *getHexagonTargetInfo(); -TargetInfo *getLoongArchTargetInfo(); -TargetInfo *getMSP430TargetInfo(); -TargetInfo *getPPC64TargetInfo(); -TargetInfo *getPPCTargetInfo(); -TargetInfo *getRISCVTargetInfo(); -TargetInfo *getSPARCV9TargetInfo(); -TargetInfo 
*getSystemZTargetInfo(); -TargetInfo *getX86TargetInfo(); -TargetInfo *getX86_64TargetInfo(); -template TargetInfo *getMipsTargetInfo(); +TargetInfo *getAArch64TargetInfo(Ctx &); +TargetInfo *getAMDGPUTargetInfo(Ctx &); +TargetInfo *getARMTargetInfo(Ctx &); +TargetInfo *getAVRTargetInfo(Ctx &); +TargetInfo *getHexagonTargetInfo(Ctx &); +TargetInfo *getLoongArchTargetInfo(Ctx &); +TargetInfo *getMSP430TargetInfo(Ctx &); +TargetInfo *getMipsTargetInfo(Ctx &); +TargetInfo *getPPC64TargetInfo(Ctx &); +TargetInfo *getPPCTargetInfo(Ctx &); +TargetInfo *getRISCVTargetInfo(Ctx &); +TargetInfo *getSPARCV9TargetInfo(Ctx &); +TargetInfo *getSystemZTargetInfo(Ctx &); +TargetInfo *getX86TargetInfo(Ctx &); +TargetInfo *getX86_64TargetInfo(Ctx &); struct ErrorPlace { InputSectionBase *isec; @@ -200,13 +202,13 @@ struct ErrorPlace { }; // Returns input section and corresponding source string for the given location. -ErrorPlace getErrorPlace(const uint8_t *loc); +ErrorPlace getErrorPlace(Ctx &ctx, const uint8_t *loc); static inline std::string getErrorLocation(const uint8_t *loc) { - return getErrorPlace(loc).loc; + return getErrorPlace(ctx, loc).loc; } -void processArmCmseSymbols(); +void processArmCmseSymbols(Ctx &); void writePPC32GlinkSection(uint8_t *buf, size_t numEntries); @@ -233,7 +235,7 @@ uint64_t getAArch64Page(uint64_t expr); template void writeARMCmseImportLib(); uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type); void riscvFinalizeRelax(int passes); -void mergeRISCVAttributesSections(); +void mergeRISCVAttributesSections(Ctx &); void addArmInputSectionMappingSymbols(); void addArmSyntheticSectionMappingSymbol(Defined *); void sortArmMappingSymbols(); @@ -241,25 +243,26 @@ void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf); void createTaggedSymbols(const SmallVector &files); void initSymbolAnchors(); -TargetInfo *getTarget(); +TargetInfo *getTarget(Ctx &); template bool isMipsPIC(const Defined *sym); -void reportRangeError(uint8_t 
*loc, const Relocation &rel, const Twine &v, - int64_t min, uint64_t max); -void reportRangeError(uint8_t *loc, int64_t v, int n, const Symbol &sym, - const Twine &msg); +void reportRangeError(Ctx &, uint8_t *loc, const Relocation &rel, + const Twine &v, int64_t min, uint64_t max); +void reportRangeError(Ctx &ctx, uint8_t *loc, int64_t v, int n, + const Symbol &sym, const Twine &msg); // Make sure that V can be represented as an N bit signed integer. inline void checkInt(uint8_t *loc, int64_t v, int n, const Relocation &rel) { if (v != llvm::SignExtend64(v, n)) - reportRangeError(loc, rel, Twine(v), llvm::minIntN(n), llvm::maxIntN(n)); + reportRangeError(ctx, loc, rel, Twine(v), llvm::minIntN(n), + llvm::maxIntN(n)); } // Make sure that V can be represented as an N bit unsigned integer. inline void checkUInt(uint8_t *loc, uint64_t v, int n, const Relocation &rel) { if ((v >> n) != 0) - reportRangeError(loc, rel, Twine(v), 0, llvm::maxUIntN(n)); + reportRangeError(ctx, loc, rel, Twine(v), 0, llvm::maxUIntN(n)); } // Make sure that V can be represented as an N bit signed or unsigned integer. @@ -268,7 +271,7 @@ inline void checkIntUInt(uint8_t *loc, uint64_t v, int n, // For the error message we should cast V to a signed integer so that error // messages show a small negative value rather than an extremely large one if (v != (uint64_t)llvm::SignExtend64(v, n) && (v >> n) != 0) - reportRangeError(loc, rel, Twine((int64_t)v), llvm::minIntN(n), + reportRangeError(ctx, loc, rel, Twine((int64_t)v), llvm::minIntN(n), llvm::maxUIntN(n)); } diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 330dba75edc5e..dcb60330dbb12 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -54,7 +54,8 @@ namespace { // otherwise it creates a long thunk. 
class AArch64Thunk : public Thunk { public: - AArch64Thunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} + AArch64Thunk(Ctx &ctx, Symbol &dest, int64_t addend) + : Thunk(ctx, dest, addend) {} bool getMayUseShortThunk(); void writeTo(uint8_t *buf) override; @@ -66,8 +67,8 @@ class AArch64Thunk : public Thunk { // AArch64 long range Thunks. class AArch64ABSLongThunk final : public AArch64Thunk { public: - AArch64ABSLongThunk(Symbol &dest, int64_t addend) - : AArch64Thunk(dest, addend) {} + AArch64ABSLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : AArch64Thunk(ctx, dest, addend) {} uint32_t size() override { return getMayUseShortThunk() ? 4 : 16; } void addSymbols(ThunkSection &isec) override; @@ -77,7 +78,8 @@ class AArch64ABSLongThunk final : public AArch64Thunk { class AArch64ADRPThunk final : public AArch64Thunk { public: - AArch64ADRPThunk(Symbol &dest, int64_t addend) : AArch64Thunk(dest, addend) {} + AArch64ADRPThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : AArch64Thunk(ctx, dest, addend) {} uint32_t size() override { return getMayUseShortThunk() ? 4 : 12; } void addSymbols(ThunkSection &isec) override; @@ -95,7 +97,7 @@ class AArch64ADRPThunk final : public AArch64Thunk { // if the target is in range, otherwise it creates a long thunk. class ARMThunk : public Thunk { public: - ARMThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} + ARMThunk(Ctx &ctx, Symbol &dest, int64_t addend) : Thunk(ctx, dest, addend) {} bool getMayUseShortThunk(); uint32_t size() override { return getMayUseShortThunk() ? 4 : sizeLong(); } @@ -125,7 +127,8 @@ class ARMThunk : public Thunk { // which has a range of 16MB. 
class ThumbThunk : public Thunk { public: - ThumbThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) { + ThumbThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : Thunk(ctx, dest, addend) { alignment = 2; } @@ -150,7 +153,8 @@ class ThumbThunk : public Thunk { // Source State, TargetState, Target Requirement, ABS or PI, Range class ARMV7ABSLongThunk final : public ARMThunk { public: - ARMV7ABSLongThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV7ABSLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -159,7 +163,8 @@ class ARMV7ABSLongThunk final : public ARMThunk { class ARMV7PILongThunk final : public ARMThunk { public: - ARMV7PILongThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV7PILongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 16; } void writeLong(uint8_t *buf) override; @@ -168,8 +173,8 @@ class ARMV7PILongThunk final : public ARMThunk { class ThumbV7ABSLongThunk final : public ThumbThunk { public: - ThumbV7ABSLongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV7ABSLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 10; } void writeLong(uint8_t *buf) override; @@ -178,7 +183,8 @@ class ThumbV7ABSLongThunk final : public ThumbThunk { class ThumbV7PILongThunk final : public ThumbThunk { public: - ThumbV7PILongThunk(Symbol &dest, int64_t addend) : ThumbThunk(dest, addend) {} + ThumbV7PILongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -188,8 +194,8 @@ class ThumbV7PILongThunk final : public ThumbThunk { // Implementations of Thunks for Arm v6-M. 
Only Thumb instructions are permitted class ThumbV6MABSLongThunk final : public ThumbThunk { public: - ThumbV6MABSLongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV6MABSLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -198,8 +204,8 @@ class ThumbV6MABSLongThunk final : public ThumbThunk { class ThumbV6MABSXOLongThunk final : public ThumbThunk { public: - ThumbV6MABSXOLongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV6MABSXOLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 20; } void writeLong(uint8_t *buf) override; @@ -208,8 +214,8 @@ class ThumbV6MABSXOLongThunk final : public ThumbThunk { class ThumbV6MPILongThunk final : public ThumbThunk { public: - ThumbV6MPILongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV6MPILongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 16; } void writeLong(uint8_t *buf) override; @@ -226,7 +232,8 @@ class ThumbV6MPILongThunk final : public ThumbThunk { // can also use this thunk, but only for Arm->Arm calls. class ARMV5LongLdrPcThunk final : public ARMThunk { public: - ARMV5LongLdrPcThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV5LongLdrPcThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 8; } void writeLong(uint8_t *buf) override; @@ -237,7 +244,8 @@ class ARMV5LongLdrPcThunk final : public ARMThunk { // will not invoke Arm/Thumb state changes. 
class ARMV4PILongBXThunk final : public ARMThunk { public: - ARMV4PILongBXThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV4PILongBXThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 16; } void writeLong(uint8_t *buf) override; @@ -246,7 +254,8 @@ class ARMV4PILongBXThunk final : public ARMThunk { class ARMV4PILongThunk final : public ARMThunk { public: - ARMV4PILongThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV4PILongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -255,8 +264,8 @@ class ARMV4PILongThunk final : public ARMThunk { class ThumbV4PILongBXThunk final : public ThumbThunk { public: - ThumbV4PILongBXThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV4PILongBXThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 16; } void writeLong(uint8_t *buf) override; @@ -265,8 +274,8 @@ class ThumbV4PILongBXThunk final : public ThumbThunk { class ThumbV4PILongThunk final : public ThumbThunk { public: - ThumbV4PILongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV4PILongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 20; } void writeLong(uint8_t *buf) override; @@ -275,7 +284,8 @@ class ThumbV4PILongThunk final : public ThumbThunk { class ARMV4ABSLongBXThunk final : public ARMThunk { public: - ARMV4ABSLongBXThunk(Symbol &dest, int64_t addend) : ARMThunk(dest, addend) {} + ARMV4ABSLongBXThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ARMThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -284,8 +294,8 @@ class ARMV4ABSLongBXThunk final : public ARMThunk { class ThumbV4ABSLongBXThunk final : 
public ThumbThunk { public: - ThumbV4ABSLongBXThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV4ABSLongBXThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 12; } void writeLong(uint8_t *buf) override; @@ -294,8 +304,8 @@ class ThumbV4ABSLongBXThunk final : public ThumbThunk { class ThumbV4ABSLongThunk final : public ThumbThunk { public: - ThumbV4ABSLongThunk(Symbol &dest, int64_t addend) - : ThumbThunk(dest, addend) {} + ThumbV4ABSLongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : ThumbThunk(ctx, dest, addend) {} uint32_t sizeLong() override { return 16; } void writeLong(uint8_t *buf) override; @@ -306,7 +316,7 @@ class ThumbV4ABSLongThunk final : public ThumbThunk { // when their destination is out of range [0, 0x1ffff]. class AVRThunk : public Thunk { public: - AVRThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} + AVRThunk(Ctx &ctx, Symbol &dest, int64_t addend) : Thunk(ctx, dest, addend) {} uint32_t size() override { return 4; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -315,7 +325,7 @@ class AVRThunk : public Thunk { // MIPS LA25 thunk class MipsThunk final : public Thunk { public: - MipsThunk(Symbol &dest) : Thunk(dest, 0) {} + MipsThunk(Ctx &ctx, Symbol &dest) : Thunk(ctx, dest, 0) {} uint32_t size() override { return 16; } void writeTo(uint8_t *buf) override; @@ -326,7 +336,7 @@ class MipsThunk final : public Thunk { // microMIPS R2-R5 LA25 thunk class MicroMipsThunk final : public Thunk { public: - MicroMipsThunk(Symbol &dest) : Thunk(dest, 0) {} + MicroMipsThunk(Ctx &ctx, Symbol &dest) : Thunk(ctx, dest, 0) {} uint32_t size() override { return 14; } void writeTo(uint8_t *buf) override; @@ -337,7 +347,7 @@ class MicroMipsThunk final : public Thunk { // microMIPS R6 LA25 thunk class MicroMipsR6Thunk final : public Thunk { public: - MicroMipsR6Thunk(Symbol &dest) : Thunk(dest, 0) {} + MicroMipsR6Thunk(Ctx 
&ctx, Symbol &dest) : Thunk(ctx, dest, 0) {} uint32_t size() override { return 12; } void writeTo(uint8_t *buf) override; @@ -349,9 +359,9 @@ class PPC32PltCallStub final : public Thunk { public: // For R_PPC_PLTREL24, Thunk::addend records the addend which will be used to // decide the offsets in the call stub. - PPC32PltCallStub(const InputSection &isec, const Relocation &rel, + PPC32PltCallStub(Ctx &ctx, const InputSection &isec, const Relocation &rel, Symbol &dest) - : Thunk(dest, rel.addend), file(isec.file) {} + : Thunk(ctx, dest, rel.addend), file(isec.file) {} uint32_t size() override { return 16; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -364,7 +374,8 @@ class PPC32PltCallStub final : public Thunk { class PPC32LongThunk final : public Thunk { public: - PPC32LongThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} + PPC32LongThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : Thunk(ctx, dest, addend) {} uint32_t size() override { return ctx.arg.isPic ? 32 : 16; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -380,7 +391,7 @@ class PPC32LongThunk final : public Thunk { // 3) Transferring control to the target function through an indirect branch. class PPC64PltCallStub final : public Thunk { public: - PPC64PltCallStub(Symbol &dest) : Thunk(dest, 0) {} + PPC64PltCallStub(Ctx &ctx, Symbol &dest) : Thunk(ctx, dest, 0) {} uint32_t size() override { return 20; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -396,7 +407,8 @@ class PPC64PltCallStub final : public Thunk { // 2) Tail calls the callee. 
class PPC64R2SaveStub final : public Thunk { public: - PPC64R2SaveStub(Symbol &dest, int64_t addend) : Thunk(dest, addend) { + PPC64R2SaveStub(Ctx &ctx, Symbol &dest, int64_t addend) + : Thunk(ctx, dest, addend) { alignment = 16; } @@ -433,8 +445,8 @@ class PPC64R2SaveStub final : public Thunk { // set r12 to satisfy the requirement of the global entry point. class PPC64R12SetupStub final : public Thunk { public: - PPC64R12SetupStub(Symbol &dest, bool gotPlt) - : Thunk(dest, 0), gotPlt(gotPlt) { + PPC64R12SetupStub(Ctx &ctx, Symbol &dest, bool gotPlt) + : Thunk(ctx, dest, 0), gotPlt(gotPlt) { alignment = 16; } uint32_t size() override { return 32; } @@ -464,13 +476,14 @@ class PPC64LongBranchThunk : public Thunk { const Relocation &rel) const override; protected: - PPC64LongBranchThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} + PPC64LongBranchThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : Thunk(ctx, dest, addend) {} }; class PPC64PILongBranchThunk final : public PPC64LongBranchThunk { public: - PPC64PILongBranchThunk(Symbol &dest, int64_t addend) - : PPC64LongBranchThunk(dest, addend) { + PPC64PILongBranchThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : PPC64LongBranchThunk(ctx, dest, addend) { assert(!dest.isPreemptible); if (std::optional index = ctx.in.ppc64LongBranchTarget->addEntry(&dest, addend)) { @@ -485,8 +498,8 @@ class PPC64PILongBranchThunk final : public PPC64LongBranchThunk { class PPC64PDLongBranchThunk final : public PPC64LongBranchThunk { public: - PPC64PDLongBranchThunk(Symbol &dest, int64_t addend) - : PPC64LongBranchThunk(dest, addend) { + PPC64PDLongBranchThunk(Ctx &ctx, Symbol &dest, int64_t addend) + : PPC64LongBranchThunk(ctx, dest, addend) { ctx.in.ppc64LongBranchTarget->addEntry(&dest, addend); } }; @@ -1037,7 +1050,7 @@ InputSection *MicroMipsR6Thunk::getTargetInputSection() const { return dyn_cast(dr.section); } -void elf::writePPC32PltCallStub(uint8_t *buf, uint64_t gotPltVA, +void elf::writePPC32PltCallStub(Ctx &ctx, 
uint8_t *buf, uint64_t gotPltVA, const InputFile *file, int64_t addend) { if (!ctx.arg.isPic) { write32(buf + 0, 0x3d600000 | (gotPltVA + 0x8000) >> 16); // lis r11,ha @@ -1074,7 +1087,7 @@ void elf::writePPC32PltCallStub(uint8_t *buf, uint64_t gotPltVA, } void PPC32PltCallStub::writeTo(uint8_t *buf) { - writePPC32PltCallStub(buf, destination.getGotPltVA(), file, addend); + writePPC32PltCallStub(ctx, buf, destination.getGotPltVA(), file, addend); } void PPC32PltCallStub::addSymbols(ThunkSection &isec) { @@ -1199,7 +1212,8 @@ void PPC64R12SetupStub::writeTo(uint8_t *buf) { int64_t offset = (gotPlt ? destination.getGotPltVA() : destination.getVA()) - getThunkTargetSym()->getVA(); if (!isInt<34>(offset)) - reportRangeError(buf, offset, 34, destination, "R12 setup stub offset"); + reportRangeError(ctx, buf, offset, 34, destination, + "R12 setup stub offset"); int nextInstOffset; if (ctx.arg.power10Stubs) { @@ -1254,19 +1268,20 @@ bool PPC64LongBranchThunk::isCompatibleWith(const InputSection &isec, return rel.type == R_PPC64_REL24 || rel.type == R_PPC64_REL14; } -Thunk::Thunk(Symbol &d, int64_t a) : destination(d), addend(a), offset(0) { +Thunk::Thunk(Ctx &ctx, Symbol &d, int64_t a) + : ctx(ctx), destination(d), addend(a), offset(0) { destination.thunkAccessed = true; } Thunk::~Thunk() = default; -static Thunk *addThunkAArch64(RelType type, Symbol &s, int64_t a) { +static Thunk *addThunkAArch64(Ctx &ctx, RelType type, Symbol &s, int64_t a) { if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26 && type != R_AARCH64_PLT32) fatal("unrecognized relocation type"); if (ctx.arg.picThunk) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } // Creates a thunk for long branches or Thumb-ARM interworking. 
@@ -1277,7 +1292,7 @@ static Thunk *addThunkAArch64(RelType type, Symbol &s, int64_t a) { // // TODO: use B for short Thumb->Arm thunks instead of LDR (this doesn't work for // Arm->Thumb, as in Arm state no BX PC trick; it doesn't switch state). -static Thunk *addThunkArmv4(RelType reloc, Symbol &s, int64_t a) { +static Thunk *addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) { bool thumb_target = s.getVA(a) & 1; switch (reloc) { @@ -1287,21 +1302,21 @@ static Thunk *addThunkArmv4(RelType reloc, Symbol &s, int64_t a) { case R_ARM_CALL: if (ctx.arg.picThunk) { if (thumb_target) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } if (thumb_target) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); case R_ARM_THM_CALL: if (ctx.arg.picThunk) { if (thumb_target) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } if (thumb_target) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } fatal("relocation " + toString(reloc) + " to " + toString(s) + " not supported for Armv4 or Armv4T target"); @@ -1312,7 +1327,7 @@ static Thunk *addThunkArmv4(RelType reloc, Symbol &s, int64_t a) { // - MOVT and MOVW instructions cannot be used // - Only Thumb relocation that can generate a Thunk is a BL, this can always // be transformed into a BLX -static Thunk *addThunkArmv5v6(RelType reloc, Symbol &s, int64_t a) { +static Thunk *addThunkArmv5v6(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) { switch (reloc) { case R_ARM_PC24: case R_ARM_PLT32: @@ -1320,8 +1335,8 @@ static Thunk *addThunkArmv5v6(RelType reloc, Symbol &s, int64_t a) { case R_ARM_CALL: case R_ARM_THM_CALL: if (ctx.arg.picThunk) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } fatal("relocation " + toString(reloc) + " to " + toString(s) + " not supported for Armv5 or Armv6 targets"); @@ -1332,8 
+1347,8 @@ static Thunk *addThunkArmv5v6(RelType reloc, Symbol &s, int64_t a) { // - MOVT and MOVW instructions cannot be used. // - Only a limited number of instructions can access registers r8 and above // - No interworking support is needed (all Thumb). -static Thunk *addThunkV6M(const InputSection &isec, RelType reloc, Symbol &s, - int64_t a) { +static Thunk *addThunkV6M(Ctx &ctx, const InputSection &isec, RelType reloc, + Symbol &s, int64_t a) { const bool isPureCode = isec.getParent()->flags & SHF_ARM_PURECODE; switch (reloc) { case R_ARM_THM_JUMP19: @@ -1341,23 +1356,23 @@ static Thunk *addThunkV6M(const InputSection &isec, RelType reloc, Symbol &s, case R_ARM_THM_CALL: if (ctx.arg.isPic) { if (!isPureCode) - return make(s, a); + return make(ctx, s, a); fatal("relocation " + toString(reloc) + " to " + toString(s) + " not supported for Armv6-M targets for position independent" " and execute only code"); } if (isPureCode) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } fatal("relocation " + toString(reloc) + " to " + toString(s) + " not supported for Armv6-M targets"); } // Creates a thunk for Thumb-ARM interworking or branch range extension. -static Thunk *addThunkArm(const InputSection &isec, RelType reloc, Symbol &s, - int64_t a) { +static Thunk *addThunkArm(Ctx &ctx, const InputSection &isec, RelType reloc, + Symbol &s, int64_t a) { // Decide which Thunk is needed based on: // Available instruction set // - An Arm Thunk can only be used if Arm state is available. @@ -1377,10 +1392,10 @@ static Thunk *addThunkArm(const InputSection &isec, RelType reloc, Symbol &s, // architecture to flag. 
if (!ctx.arg.armHasMovtMovw) { if (ctx.arg.armJ1J2BranchEncoding) - return addThunkV6M(isec, reloc, s, a); + return addThunkV6M(ctx, isec, reloc, s, a); if (ctx.arg.armHasBlx) - return addThunkArmv5v6(reloc, s, a); - return addThunkArmv4(reloc, s, a); + return addThunkArmv5v6(ctx, reloc, s, a); + return addThunkArmv4(ctx, reloc, s, a); } switch (reloc) { @@ -1389,47 +1404,47 @@ static Thunk *addThunkArm(const InputSection &isec, RelType reloc, Symbol &s, case R_ARM_JUMP24: case R_ARM_CALL: if (ctx.arg.picThunk) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: case R_ARM_THM_CALL: if (ctx.arg.picThunk) - return make(s, a); - return make(s, a); + return make(ctx, s, a); + return make(ctx, s, a); } fatal("unrecognized relocation type"); } -static Thunk *addThunkAVR(RelType type, Symbol &s, int64_t a) { +static Thunk *addThunkAVR(Ctx &ctx, RelType type, Symbol &s, int64_t a) { switch (type) { case R_AVR_LO8_LDI_GS: case R_AVR_HI8_LDI_GS: - return make(s, a); + return make(ctx, s, a); default: fatal("unrecognized relocation type " + toString(type)); } } -static Thunk *addThunkMips(RelType type, Symbol &s) { +static Thunk *addThunkMips(Ctx &ctx, RelType type, Symbol &s) { if ((s.stOther & STO_MIPS_MICROMIPS) && isMipsR6()) - return make(s); + return make(ctx, s); if (s.stOther & STO_MIPS_MICROMIPS) - return make(s); - return make(s); + return make(ctx, s); + return make(ctx, s); } -static Thunk *addThunkPPC32(const InputSection &isec, const Relocation &rel, - Symbol &s) { +static Thunk *addThunkPPC32(Ctx &ctx, const InputSection &isec, + const Relocation &rel, Symbol &s) { assert((rel.type == R_PPC_LOCAL24PC || rel.type == R_PPC_REL24 || rel.type == R_PPC_PLTREL24) && "unexpected relocation type for thunk"); if (s.isInPlt()) - return make(isec, rel, s); - return make(s, rel.addend); + return make(ctx, isec, rel, s); + return make(ctx, s, rel.addend); } -static Thunk 
*addThunkPPC64(RelType type, Symbol &s, int64_t a) { +static Thunk *addThunkPPC64(Ctx &ctx, RelType type, Symbol &s, int64_t a) { assert((type == R_PPC64_REL14 || type == R_PPC64_REL24 || type == R_PPC64_REL24_NOTOC) && "unexpected relocation type for thunk"); @@ -1437,45 +1452,45 @@ static Thunk *addThunkPPC64(RelType type, Symbol &s, int64_t a) { // If we are emitting stubs for NOTOC relocations, we need to tell // the PLT resolver that there can be multiple TOCs. if (type == R_PPC64_REL24_NOTOC) - getPPC64TargetInfo()->ppc64DynamicSectionOpt = 0x2; + getPPC64TargetInfo(ctx)->ppc64DynamicSectionOpt = 0x2; if (s.isInPlt()) return type == R_PPC64_REL24_NOTOC - ? (Thunk *)make(s, /*gotPlt=*/true) - : (Thunk *)make(s); + ? (Thunk *)make(ctx, s, /*gotPlt=*/true) + : (Thunk *)make(ctx, s); // This check looks at the st_other bits of the callee. If the value is 1 // then the callee clobbers the TOC and we need an R2 save stub when RelType // is R_PPC64_REL14 or R_PPC64_REL24. if ((type == R_PPC64_REL14 || type == R_PPC64_REL24) && (s.stOther >> 5) == 1) - return make(s, a); + return make(ctx, s, a); if (type == R_PPC64_REL24_NOTOC) - return make(s, /*gotPlt=*/false); + return make(ctx, s, /*gotPlt=*/false); if (ctx.arg.picThunk) - return make(s, a); + return make(ctx, s, a); - return make(s, a); + return make(ctx, s, a); } -Thunk *elf::addThunk(const InputSection &isec, Relocation &rel) { +Thunk *elf::addThunk(Ctx &ctx, const InputSection &isec, Relocation &rel) { Symbol &s = *rel.sym; int64_t a = rel.addend; switch (ctx.arg.emachine) { case EM_AARCH64: - return addThunkAArch64(rel.type, s, a); + return addThunkAArch64(ctx, rel.type, s, a); case EM_ARM: - return addThunkArm(isec, rel.type, s, a); + return addThunkArm(ctx, isec, rel.type, s, a); case EM_AVR: - return addThunkAVR(rel.type, s, a); + return addThunkAVR(ctx, rel.type, s, a); case EM_MIPS: - return addThunkMips(rel.type, s); + return addThunkMips(ctx, rel.type, s); case EM_PPC: - return addThunkPPC32(isec, 
rel, s); + return addThunkPPC32(ctx, isec, rel, s); case EM_PPC64: - return addThunkPPC64(rel.type, s, a); + return addThunkPPC64(ctx, rel.type, s, a); default: llvm_unreachable("add Thunk only supported for ARM, AVR, Mips and PowerPC"); } diff --git a/lld/ELF/Thunks.h b/lld/ELF/Thunks.h index 12ddf08cadc09..678bc483986d5 100644 --- a/lld/ELF/Thunks.h +++ b/lld/ELF/Thunks.h @@ -13,6 +13,7 @@ #include "Relocations.h" namespace lld::elf { +struct Ctx; class Defined; class InputFile; class Symbol; @@ -28,7 +29,7 @@ class ThunkSection; // Thunks are assigned to synthetic ThunkSections class Thunk { public: - Thunk(Symbol &destination, int64_t addend); + Thunk(Ctx &, Symbol &destination, int64_t addend); virtual ~Thunk(); virtual uint32_t size() = 0; @@ -56,6 +57,7 @@ class Thunk { Defined *getThunkTargetSym() const { return syms[0]; } + Ctx &ctx; Symbol &destination; int64_t addend; llvm::SmallVector syms; @@ -67,9 +69,9 @@ class Thunk { // For a Relocation to symbol S create a Thunk to be added to a synthetic // ThunkSection. 
-Thunk *addThunk(const InputSection &isec, Relocation &rel); +Thunk *addThunk(Ctx &, const InputSection &isec, Relocation &rel); -void writePPC32PltCallStub(uint8_t *buf, uint64_t gotPltVA, +void writePPC32PltCallStub(Ctx &, uint8_t *buf, uint64_t gotPltVA, const InputFile *file, int64_t addend); void writePPC64LoadAndBranch(uint8_t *buf, int64_t offset); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index ce7cbc25d7eb0..2ad7de7324a84 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -294,7 +294,8 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) { } } -static OutputSection *findSection(StringRef name, unsigned partition = 1) { +static OutputSection *findSection(Ctx &ctx, StringRef name, + unsigned partition = 1) { for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) if (osd->osec.name == name && osd->osec.partition == partition) @@ -336,7 +337,7 @@ template void Writer::run() { // Handle --print-map(-M)/--Map and --cref. Dump them before checkSections() // because the files may be useful in case checkSections() or openFile() // fails, for example, due to an erroneous file size. - writeMapAndCref(); + writeMapAndCref(ctx); // Handle --print-memory-usage option. if (ctx.arg.printMemoryUsage) @@ -544,7 +545,7 @@ template void Writer::addSectionSymbols() { // // This function returns true if a section needs to be put into a // PT_GNU_RELRO segment. 
-static bool isRelroSection(const OutputSection *sec) { +static bool isRelroSection(Ctx &ctx, const OutputSection *sec) { if (!ctx.arg.zRelro) return false; if (sec->relro) @@ -648,7 +649,7 @@ enum RankFlags { RF_BSS = 1 << 7, }; -unsigned elf::getSectionRank(OutputSection &osec) { +unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) { unsigned rank = osec.partition * RF_PARTITION; // We want to put section specified by -T option first, so we @@ -713,7 +714,7 @@ unsigned elf::getSectionRank(OutputSection &osec) { // TLS sections directly before the other RELRO sections. if (!(osec.flags & SHF_TLS)) rank |= RF_NOT_TLS; - if (isRelroSection(&osec)) + if (isRelroSection(ctx, &osec)) osec.relro = true; else rank |= RF_NOT_RELRO; @@ -892,8 +893,8 @@ template void Writer::setReservedSymbolSections() { if (ctx.sym.bss) { // On RISC-V, set __bss_start to the start of .sbss if present. OutputSection *sbss = - ctx.arg.emachine == EM_RISCV ? findSection(".sbss") : nullptr; - ctx.sym.bss->section = sbss ? sbss : findSection(".bss"); + ctx.arg.emachine == EM_RISCV ? findSection(ctx, ".sbss") : nullptr; + ctx.sym.bss->section = sbss ? sbss : findSection(ctx, ".bss"); } // Setup MIPS _gp_disp/__gnu_local_gp symbols which should @@ -946,7 +947,7 @@ static bool shouldSkip(SectionCommand *cmd) { // characteristics with their neighbors as possible. For example, if // both are rw, or both are tls. static SmallVectorImpl::iterator -findOrphanPos(SmallVectorImpl::iterator b, +findOrphanPos(Ctx &ctx, SmallVectorImpl::iterator b, SmallVectorImpl::iterator e) { // Place non-alloc orphan sections at the end. This matches how we assign file // offsets to non-alloc sections. @@ -1028,7 +1029,8 @@ findOrphanPos(SmallVectorImpl::iterator b, } // Adds random priorities to sections not already in the map. 
-static void maybeShuffle(DenseMap &order) { +static void maybeShuffle(Ctx &ctx, + DenseMap &order) { if (ctx.arg.shuffleSections.empty()) return; @@ -1066,7 +1068,7 @@ static void maybeShuffle(DenseMap &order) { } // Builds section order for handling --symbol-ordering-file. -static DenseMap buildSectionOrder() { +static DenseMap buildSectionOrder(Ctx &ctx) { DenseMap sectionOrder; // Use the rarely used option --call-graph-ordering-file to sort sections. if (!ctx.arg.callGraphProfile.empty()) @@ -1125,7 +1127,7 @@ static DenseMap buildSectionOrder() { // Sorts the sections in ISD according to the provided section order. static void -sortISDBySectionOrder(InputSectionDescription *isd, +sortISDBySectionOrder(Ctx &ctx, InputSectionDescription *isd, const DenseMap &order, bool executableOutputSection) { SmallVector unorderedSections; @@ -1199,7 +1201,7 @@ sortISDBySectionOrder(InputSectionDescription *isd, isd->sections.push_back(isec); } -static void sortSection(OutputSection &osec, +static void sortSection(Ctx &ctx, OutputSection &osec, const DenseMap &order) { StringRef name = osec.name; @@ -1214,7 +1216,7 @@ static void sortSection(OutputSection &osec, if (!order.empty()) for (SectionCommand *b : osec.commands) if (auto *isd = dyn_cast(b)) - sortISDBySectionOrder(isd, order, osec.flags & SHF_EXECINSTR); + sortISDBySectionOrder(ctx, isd, order, osec.flags & SHF_EXECINSTR); if (ctx.script->hasSectionsCommand) return; @@ -1243,11 +1245,11 @@ static void sortSection(OutputSection &osec, // sorting for special input sections. This also handles --symbol-ordering-file. template void Writer::sortInputSections() { // Build the order once since it is expensive. 
- DenseMap order = buildSectionOrder(); - maybeShuffle(order); + DenseMap order = buildSectionOrder(ctx); + maybeShuffle(ctx, order); for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) - sortSection(osd->osec, order); + sortSection(ctx, osd->osec, order); } template void Writer::sortSections() { @@ -1264,7 +1266,7 @@ template void Writer::sortSections() { for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) - osd->osec.sortRank = getSectionRank(osd->osec); + osd->osec.sortRank = getSectionRank(ctx, osd->osec); if (!ctx.script->hasSectionsCommand) { // OutputDescs are mostly contiguous, but may be interleaved with // SymbolAssignments in the presence of INSERT commands. @@ -1348,7 +1350,7 @@ template void Writer::sortOrphanSections() { i = firstSectionOrDotAssignment; while (nonScriptI != e) { - auto pos = findOrphanPos(i, nonScriptI); + auto pos = findOrphanPos(ctx, i, nonScriptI); OutputSection *orphan = &cast(*nonScriptI)->osec; // As an optimization, find all sections with the same sort rank @@ -1435,7 +1437,7 @@ static void finalizeSynthetic(SyntheticSection *sec) { // in Writer::finalizeSections(). template void Writer::finalizeAddressDependentContent() { llvm::TimeTraceScope timeScope("Finalize address dependent content"); - ThunkCreator tc; + ThunkCreator tc(ctx); AArch64Err843419Patcher a64p; ARMErr657417Patcher a32p; ctx.script->assignAddresses(); @@ -1454,7 +1456,7 @@ template void Writer::finalizeAddressDependentContent() { // Converts call x@GDPLT to call __tls_get_addr if (ctx.arg.emachine == EM_HEXAGON) - hexagonTLSSymbolUpdate(ctx.outputSections); + hexagonTLSSymbolUpdate(ctx); uint32_t pass = 0, assignPasses = 0; for (;;) { @@ -1570,7 +1572,7 @@ template void Writer::finalizeAddressDependentContent() { // update symbol values and sizes associated with these sections. 
With basic // block sections, input sections can shrink when the jump instructions at // the end of the section are relaxed. -static void fixSymbolsAfterShrinking() { +static void fixSymbolsAfterShrinking(Ctx &ctx) { for (InputFile *File : ctx.objectFiles) { parallelForEach(File->getSymbols(), [&](Symbol *Sym) { auto *def = dyn_cast(Sym); @@ -1644,7 +1646,7 @@ template void Writer::optimizeBasicBlockJumps() { } } - fixSymbolsAfterShrinking(); + fixSymbolsAfterShrinking(ctx); for (OutputSection *osec : ctx.outputSections) for (InputSection *is : getInputSections(*osec, storage)) @@ -1709,9 +1711,9 @@ static void removeUnusedSyntheticSections(Ctx &ctx) { // Create output section objects and add them to OutputSections. template void Writer::finalizeSections() { if (!ctx.arg.relocatable) { - ctx.out.preinitArray = findSection(".preinit_array"); - ctx.out.initArray = findSection(".init_array"); - ctx.out.finiArray = findSection(".fini_array"); + ctx.out.preinitArray = findSection(ctx, ".preinit_array"); + ctx.out.initArray = findSection(ctx, ".init_array"); + ctx.out.finiArray = findSection(ctx, ".fini_array"); // The linker needs to define SECNAME_start, SECNAME_end and SECNAME_stop // symbols for sections, so that the runtime can get the start and end @@ -1741,7 +1743,7 @@ template void Writer::finalizeSections() { // st_shndx arbitrarily to 1 (ctx.out.elfHeader). if (ctx.arg.emachine == EM_RISCV) { if (!ctx.arg.shared) { - OutputSection *sec = findSection(".sdata"); + OutputSection *sec = findSection(ctx, ".sdata"); addOptionalRegular(ctx, "__global_pointer$", sec ? sec : ctx.out.elfHeader, 0x800, STV_DEFAULT); // Set riscvGlobalPointer to be used by the optional global pointer @@ -1804,9 +1806,9 @@ template void Writer::finalizeSections() { // that we can correctly decide if a dynamic relocation is needed. This is // called after processSymbolAssignments() because it needs to know whether // a linker-script-defined symbol is absolute. 
- scanRelocations(); - reportUndefinedSymbols(); - postScanRelocations(); + scanRelocations(ctx); + reportUndefinedSymbols(ctx); + postScanRelocations(ctx); if (ctx.in.plt && ctx.in.plt->isNeeded()) ctx.in.plt->addSymbols(); @@ -1975,7 +1977,7 @@ template void Writer::finalizeSections() { if (ctx.script->noCrossRefs.size()) { llvm::TimeTraceScope timeScope("Check NOCROSSREFS"); - checkNoCrossRefs(); + checkNoCrossRefs(ctx); } { @@ -2129,7 +2131,7 @@ template void Writer::addStartEndSymbols() { // As a special case, don't unnecessarily retain .ARM.exidx, which would // create an empty PT_ARM_EXIDX. - if (OutputSection *sec = findSection(".ARM.exidx")) + if (OutputSection *sec = findSection(ctx, ".ARM.exidx")) define("__exidx_start", "__exidx_end", sec); } @@ -2201,7 +2203,7 @@ SmallVector Writer::createPhdrs(Partition &part) { addHdr(PT_PHDR, PF_R)->add(part.programHeaders->getParent()); // PT_INTERP must be the second entry if exists. - if (OutputSection *cmd = findSection(".interp", partNo)) + if (OutputSection *cmd = findSection(ctx, ".interp", partNo)) addHdr(PT_INTERP, cmd->getPhdrFlags())->add(cmd); // Add the headers. We will remove them if they don't fit. @@ -2224,7 +2226,7 @@ SmallVector Writer::createPhdrs(Partition &part) { for (OutputSection *sec : ctx.outputSections) { if (sec->partition != partNo || !needsPtLoad(sec)) continue; - if (isRelroSection(sec)) { + if (isRelroSection(ctx, sec)) { inRelroPhdr = true; if (!relroEnd) relRo->add(sec); @@ -2318,17 +2320,17 @@ SmallVector Writer::createPhdrs(Partition &part) { if (ctx.arg.osabi == ELFOSABI_OPENBSD) { // PT_OPENBSD_MUTABLE makes the dynamic linker fill the segment with // zero data, like bss, but it can be treated differently. 
- if (OutputSection *cmd = findSection(".openbsd.mutable", partNo)) + if (OutputSection *cmd = findSection(ctx, ".openbsd.mutable", partNo)) addHdr(PT_OPENBSD_MUTABLE, cmd->getPhdrFlags())->add(cmd); // PT_OPENBSD_RANDOMIZE makes the dynamic linker fill the segment // with random data. - if (OutputSection *cmd = findSection(".openbsd.randomdata", partNo)) + if (OutputSection *cmd = findSection(ctx, ".openbsd.randomdata", partNo)) addHdr(PT_OPENBSD_RANDOMIZE, cmd->getPhdrFlags())->add(cmd); // PT_OPENBSD_SYSCALLS makes the kernel and dynamic linker register // system call sites. - if (OutputSection *cmd = findSection(".openbsd.syscalls", partNo)) + if (OutputSection *cmd = findSection(ctx, ".openbsd.syscalls", partNo)) addHdr(PT_OPENBSD_SYSCALLS, cmd->getPhdrFlags())->add(cmd); } @@ -2350,7 +2352,7 @@ SmallVector Writer::createPhdrs(Partition &part) { if (ctx.arg.zWxneeded) addHdr(PT_OPENBSD_WXNEEDED, PF_X); - if (OutputSection *cmd = findSection(".note.gnu.property", partNo)) + if (OutputSection *cmd = findSection(ctx, ".note.gnu.property", partNo)) addHdr(PT_GNU_PROPERTY, PF_R)->add(cmd); // Create one PT_NOTE per a group of contiguous SHT_NOTE sections with the @@ -2456,7 +2458,7 @@ template void Writer::fixSectionAlignments() { // Compute an in-file position for a given section. The file offset must be the // same with its virtual address modulo the page size, so that the loader can // load executables without any address adjustment. -static uint64_t computeFileOffset(OutputSection *os, uint64_t off) { +static uint64_t computeFileOffset(Ctx &ctx, OutputSection *os, uint64_t off) { // The first section in a PT_LOAD has to have congruent offset and address // modulo the maximum page size. 
if (os->ptLoad && os->ptLoad->firstSec == os) @@ -2519,7 +2521,7 @@ template void Writer::assignFileOffsets() { for (OutputSection *sec : ctx.outputSections) { if (!(sec->flags & SHF_ALLOC)) continue; - off = computeFileOffset(sec, off); + off = computeFileOffset(ctx, sec, off); sec->offset = off; if (sec->type != SHT_NOBITS) off += sec->size; @@ -2702,7 +2704,7 @@ template void Writer::checkSections() { // 3. the value of the symbol _start, if present; // 4. the number represented by the entry symbol, if it is a number; // 5. the address 0. -static uint64_t getEntryAddr() { +static uint64_t getEntryAddr(Ctx &ctx) { // Case 1, 2 or 3 if (Symbol *b = ctx.symtab->find(ctx.arg.entry)) return b->getVA(); @@ -2719,7 +2721,7 @@ static uint64_t getEntryAddr() { return 0; } -static uint16_t getELFType() { +static uint16_t getELFType(Ctx &ctx) { if (ctx.arg.isPic) return ET_DYN; if (ctx.arg.relocatable) @@ -2732,8 +2734,8 @@ template void Writer::writeHeader() { writePhdrs(ctx.bufferStart + sizeof(Elf_Ehdr), *ctx.mainPart); auto *eHdr = reinterpret_cast(ctx.bufferStart); - eHdr->e_type = getELFType(); - eHdr->e_entry = getEntryAddr(); + eHdr->e_type = getELFType(ctx); + eHdr->e_entry = getEntryAddr(ctx); // If -z nosectionheader is specified, omit the section header table. 
if (!ctx.in.shStrTab) @@ -2807,9 +2809,10 @@ template void Writer::writeSectionsBinary() { sec->writeTo(ctx.bufferStart + sec->offset, tg); } -static void fillTrap(uint8_t *i, uint8_t *end) { +static void fillTrap(std::array trapInstr, uint8_t *i, + uint8_t *end) { for (; i + 4 <= end; i += 4) - memcpy(i, &ctx.target->trapInstr, 4); + memcpy(i, trapInstr.data(), 4); } // Fill the last page of executable segments with trap instructions @@ -2824,6 +2827,7 @@ template void Writer::writeTrapInstr() { for (PhdrEntry *p : part.phdrs) if (p->p_type == PT_LOAD && (p->p_flags & PF_X)) fillTrap( + ctx.target->trapInstr, ctx.bufferStart + alignDown(p->firstSec->offset + p->p_filesz, 4), ctx.bufferStart + alignToPowerOf2(p->firstSec->offset + p->p_filesz, ctx.arg.maxPageSize)); diff --git a/lld/ELF/Writer.h b/lld/ELF/Writer.h index bf4783cc52f6b..3a82a4bb5f8bf 100644 --- a/lld/ELF/Writer.h +++ b/lld/ELF/Writer.h @@ -46,7 +46,7 @@ struct PhdrEntry { void addReservedSymbols(Ctx &ctx); bool includeInSymtab(const Symbol &b); -unsigned getSectionRank(OutputSection &osec); +unsigned getSectionRank(Ctx &, OutputSection &osec); template uint32_t calcMipsEFlags(); diff --git a/lld/test/ELF/x86-64-gotpc-no-relax-err.s b/lld/test/ELF/x86-64-gotpc-no-relax-err.s index 618dca47755f4..4280c8fd1dc97 100644 --- a/lld/test/ELF/x86-64-gotpc-no-relax-err.s +++ b/lld/test/ELF/x86-64-gotpc-no-relax-err.s @@ -7,15 +7,19 @@ ## `>>> defined in` for linker synthesized __stop_* symbols (there is no ## associated file or linker script line number). 
-# CHECK: error: {{.*}}:(.text+0x2): relocation R_X86_64_GOTPCRELX out of range: 2147483658 is not in [-2147483648, 2147483647]; references '__stop_data' +# CHECK: error: {{.*}}:(.text+0x2): relocation R_X86_64_GOTPCRELX out of range: 2147483666 is not in [-2147483648, 2147483647]; references '__stop_data' # CHECK-NEXT: >>> defined in # CHECK-EMPTY: -# CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out of range: 2147483651 is not in [-2147483648, 2147483647]; references '__stop_data' +# CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out of range: 2147483659 is not in [-2147483648, 2147483647]; references '__stop_data' +# CHECK-NEXT: >>> defined in +# CHECK-EMPTY: +# CHECK-NEXT: error: {{.*}}:(.text+0x11): relocation R_X86_64_REX2_GOTPCRELX out of range: 2147483651 is not in [-2147483648, 2147483647]; references '__stop_data' # CHECK-NEXT: >>> defined in #--- a.s movl __stop_data@GOTPCREL(%rip), %eax # out of range movq __stop_data@GOTPCREL(%rip), %rax # out of range + movq __stop_data@GOTPCREL(%rip), %r16 # out of range movq __stop_data@GOTPCREL(%rip), %rax # in range .section data,"aw",@progbits @@ -23,5 +27,5 @@ #--- lds SECTIONS { .text 0x200000 : { *(.text) } - .got 0x80200010 : { *(.got) } + .got 0x80200016 : { *(.got) } } diff --git a/lld/test/ELF/x86-64-gotpc-relax-nopic.s b/lld/test/ELF/x86-64-gotpc-relax-nopic.s index 7481904d16f1b..e3cd93d1d5796 100644 --- a/lld/test/ELF/x86-64-gotpc-relax-nopic.s +++ b/lld/test/ELF/x86-64-gotpc-relax-nopic.s @@ -10,30 +10,39 @@ # SYMRELOC: Symbols [ # SYMRELOC: Symbol { # SYMRELOC: Name: bar -# SYMRELOC-NEXT: Value: 0x203248 +# SYMRELOC-NEXT: Value: 0x203290 ## 2105751 = 0x202197 (bar) # DISASM: Disassembly of section .text: # DISASM-EMPTY: # DISASM-NEXT: <_start>: -# DISASM-NEXT: 2011c8: adcl {{.*}}(%rip), %eax # 0x202240 -# DISASM-NEXT: addl {{.*}}(%rip), %ebx # 0x202240 -# DISASM-NEXT: andl {{.*}}(%rip), %ecx # 0x202240 -# DISASM-NEXT: cmpl {{.*}}(%rip), %edx # 
0x202240 -# DISASM-NEXT: orl {{.*}}(%rip), %edi # 0x202240 -# DISASM-NEXT: sbbl {{.*}}(%rip), %esi # 0x202240 -# DISASM-NEXT: subl {{.*}}(%rip), %ebp # 0x202240 -# DISASM-NEXT: xorl $0x203248, %r8d -# DISASM-NEXT: testl $0x203248, %r15d -# DISASM-NEXT: 201200: adcq $0x203248, %rax -# DISASM-NEXT: addq $0x203248, %rbx -# DISASM-NEXT: andq $0x203248, %rcx -# DISASM-NEXT: cmpq $0x203248, %rdx -# DISASM-NEXT: orq $0x203248, %rdi -# DISASM-NEXT: sbbq $0x203248, %rsi -# DISASM-NEXT: subq $0x203248, %rbp -# DISASM-NEXT: xorq $0x203248, %r8 -# DISASM-NEXT: testq $0x203248, %r15 +# DISASM-NEXT: 2011c8: adcl {{.*}}(%rip), %eax # 0x202288 +# DISASM-NEXT: addl {{.*}}(%rip), %ebx # 0x202288 +# DISASM-NEXT: andl {{.*}}(%rip), %ecx # 0x202288 +# DISASM-NEXT: cmpl {{.*}}(%rip), %edx # 0x202288 +# DISASM-NEXT: orl {{.*}}(%rip), %edi # 0x202288 +# DISASM-NEXT: sbbl {{.*}}(%rip), %esi # 0x202288 +# DISASM-NEXT: subl {{.*}}(%rip), %ebp # 0x202288 +# DISASM-NEXT: xorl $0x203290, %r8d +# DISASM-NEXT: testl $0x203290, %r15d +# DISASM-NEXT: 201200: adcq $0x203290, %rax +# DISASM-NEXT: addq $0x203290, %rbx +# DISASM-NEXT: andq $0x203290, %rcx +# DISASM-NEXT: cmpq $0x203290, %rdx +# DISASM-NEXT: orq $0x203290, %rdi +# DISASM-NEXT: sbbq $0x203290, %rsi +# DISASM-NEXT: subq $0x203290, %rbp +# DISASM-NEXT: xorq $0x203290, %r8 +# DISASM-NEXT: testq $0x203290, %r15 +# DISASM-NEXT: 20123f: adcq $0x203290, %r16 +# DISASM-NEXT: addq $0x203290, %r17 +# DISASM-NEXT: andq $0x203290, %r18 +# DISASM-NEXT: cmpq $0x203290, %r19 +# DISASM-NEXT: orq $0x203290, %r20 +# DISASM-NEXT: sbbq $0x203290, %r21 +# DISASM-NEXT: subq $0x203290, %r22 +# DISASM-NEXT: xorq $0x203290, %r23 +# DISASM-NEXT: testq $0x203290, %r24 # RUN: ld.lld --hash-style=sysv -shared %t.o -o %t2 # RUN: llvm-readobj -S -r -d %t2 | FileCheck --check-prefix=SEC-PIC %s @@ -46,8 +55,8 @@ # SEC-PIC-NEXT: SHF_ALLOC # SEC-PIC-NEXT: SHF_WRITE # SEC-PIC-NEXT: ] -# SEC-PIC-NEXT: Address: 0x2380 -# SEC-PIC-NEXT: Offset: 0x380 +# SEC-PIC-NEXT: Address: 
0x23C8 +# SEC-PIC-NEXT: Offset: 0x3C8 # SEC-PIC-NEXT: Size: 8 # SEC-PIC-NEXT: Link: # SEC-PIC-NEXT: Info: @@ -57,7 +66,7 @@ # SEC-PIC: 0x000000006FFFFFF9 RELACOUNT 1 # SEC-PIC: Relocations [ # SEC-PIC-NEXT: Section ({{.*}}) .rela.dyn { -# SEC-PIC-NEXT: 0x2380 R_X86_64_RELATIVE - 0x3388 +# SEC-PIC-NEXT: 0x23C8 R_X86_64_RELATIVE - 0x33D0 # SEC-PIC-NEXT: } # SEC-PIC-NEXT: ] @@ -65,24 +74,33 @@ # DISASM-PIC: Disassembly of section .text: # DISASM-PIC-EMPTY: # DISASM-PIC-NEXT: <_start>: -# DISASM-PIC-NEXT: 1268: adcl {{.*}}(%rip), %eax # 0x2380 -# DISASM-PIC-NEXT: addl {{.*}}(%rip), %ebx # 0x2380 -# DISASM-PIC-NEXT: andl {{.*}}(%rip), %ecx # 0x2380 -# DISASM-PIC-NEXT: cmpl {{.*}}(%rip), %edx # 0x2380 -# DISASM-PIC-NEXT: orl {{.*}}(%rip), %edi # 0x2380 -# DISASM-PIC-NEXT: sbbl {{.*}}(%rip), %esi # 0x2380 -# DISASM-PIC-NEXT: subl {{.*}}(%rip), %ebp # 0x2380 -# DISASM-PIC-NEXT: xorl {{.*}}(%rip), %r8d # 0x2380 -# DISASM-PIC-NEXT: testl %r15d, {{.*}}(%rip) # 0x2380 -# DISASM-PIC-NEXT: 12a0: adcq {{.*}}(%rip), %rax # 0x2380 -# DISASM-PIC-NEXT: addq {{.*}}(%rip), %rbx # 0x2380 -# DISASM-PIC-NEXT: andq {{.*}}(%rip), %rcx # 0x2380 -# DISASM-PIC-NEXT: cmpq {{.*}}(%rip), %rdx # 0x2380 -# DISASM-PIC-NEXT: orq {{.*}}(%rip), %rdi # 0x2380 -# DISASM-PIC-NEXT: sbbq {{.*}}(%rip), %rsi # 0x2380 -# DISASM-PIC-NEXT: subq {{.*}}(%rip), %rbp # 0x2380 -# DISASM-PIC-NEXT: xorq {{.*}}(%rip), %r8 # 0x2380 -# DISASM-PIC-NEXT: testq %r15, {{.*}}(%rip) # 0x2380 +# DISASM-PIC-NEXT: 1268: adcl {{.*}}(%rip), %eax # 0x23c8 +# DISASM-PIC-NEXT: addl {{.*}}(%rip), %ebx # 0x23c8 +# DISASM-PIC-NEXT: andl {{.*}}(%rip), %ecx # 0x23c8 +# DISASM-PIC-NEXT: cmpl {{.*}}(%rip), %edx # 0x23c8 +# DISASM-PIC-NEXT: orl {{.*}}(%rip), %edi # 0x23c8 +# DISASM-PIC-NEXT: sbbl {{.*}}(%rip), %esi # 0x23c8 +# DISASM-PIC-NEXT: subl {{.*}}(%rip), %ebp # 0x23c8 +# DISASM-PIC-NEXT: xorl {{.*}}(%rip), %r8d # 0x23c8 +# DISASM-PIC-NEXT: testl %r15d, {{.*}}(%rip) # 0x23c8 +# DISASM-PIC-NEXT: 12a0: adcq {{.*}}(%rip), %rax # 0x23c8 +# 
DISASM-PIC-NEXT: addq {{.*}}(%rip), %rbx # 0x23c8 +# DISASM-PIC-NEXT: andq {{.*}}(%rip), %rcx # 0x23c8 +# DISASM-PIC-NEXT: cmpq {{.*}}(%rip), %rdx # 0x23c8 +# DISASM-PIC-NEXT: orq {{.*}}(%rip), %rdi # 0x23c8 +# DISASM-PIC-NEXT: sbbq {{.*}}(%rip), %rsi # 0x23c8 +# DISASM-PIC-NEXT: subq {{.*}}(%rip), %rbp # 0x23c8 +# DISASM-PIC-NEXT: xorq {{.*}}(%rip), %r8 # 0x23c8 +# DISASM-PIC-NEXT: testq %r15, {{.*}}(%rip) # 0x23c8 +# DISASM-PIC-NEXT: 12df: adcq {{.*}}(%rip), %r16 # 0x23c8 +# DISASM-PIC-NEXT: addq {{.*}}(%rip), %r17 # 0x23c8 +# DISASM-PIC-NEXT: andq {{.*}}(%rip), %r18 # 0x23c8 +# DISASM-PIC-NEXT: cmpq {{.*}}(%rip), %r19 # 0x23c8 +# DISASM-PIC-NEXT: orq {{.*}}(%rip), %r20 # 0x23c8 +# DISASM-PIC-NEXT: sbbq {{.*}}(%rip), %r21 # 0x23c8 +# DISASM-PIC-NEXT: subq {{.*}}(%rip), %r22 # 0x23c8 +# DISASM-PIC-NEXT: xorq {{.*}}(%rip), %r23 # 0x23c8 +# DISASM-PIC-NEXT: testq %r24, {{.*}}(%rip) # 0x23c8 .data .type bar, @object @@ -115,3 +133,14 @@ _start: subq bar@GOTPCREL(%rip), %rbp xorq bar@GOTPCREL(%rip), %r8 testq %r15, bar@GOTPCREL(%rip) + +## R_X86_64_REX2_GOTPCRELX + adcq bar@GOTPCREL(%rip), %r16 + addq bar@GOTPCREL(%rip), %r17 + andq bar@GOTPCREL(%rip), %r18 + cmpq bar@GOTPCREL(%rip), %r19 + orq bar@GOTPCREL(%rip), %r20 + sbbq bar@GOTPCREL(%rip), %r21 + subq bar@GOTPCREL(%rip), %r22 + xorq bar@GOTPCREL(%rip), %r23 + testq %r24, bar@GOTPCREL(%rip) diff --git a/lld/test/ELF/x86-64-gotpc-relax.s b/lld/test/ELF/x86-64-gotpc-relax.s index 5945bfc04a022..b1ff995b3fc21 100644 --- a/lld/test/ELF/x86-64-gotpc-relax.s +++ b/lld/test/ELF/x86-64-gotpc-relax.s @@ -1,5 +1,5 @@ # REQUIRES: x86 -## Test R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX GOT optimization. +## Test R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX GOT optimization. 
# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld %t.o -o %t1 --no-apply-dynamic-relocs @@ -15,16 +15,16 @@ ## In our implementation, .got is retained even if all GOT-generating relocations are optimized. # CHECK: Name Type Address Off Size ES Flg Lk Inf Al -# CHECK: .iplt PROGBITS 0000000000201280 000280 000010 00 AX 0 0 16 -# CHECK-NEXT: .got PROGBITS 0000000000202290 000290 000000 00 WA 0 0 8 +# CHECK: .iplt PROGBITS 00000000002012e0 0002e0 000010 00 AX 0 0 16 +# CHECK-NEXT: .got PROGBITS 00000000002022f0 0002f0 000000 00 WA 0 0 8 ## There is one R_X86_64_IRELATIVE relocations. # RELOC-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entry: # CHECK: Offset Info Type Symbol's Value Symbol's Name + Addend -# CHECK: 0000000000203290 0000000000000025 R_X86_64_IRELATIVE 2011e2 +# CHECK: 00000000002032f0 0000000000000025 R_X86_64_IRELATIVE 2011e2 # CHECK-LABEL: Hex dump of section '.got.plt': -# NOAPPLY-NEXT: 0x00203290 00000000 00000000 -# APPLY-NEXT: 0x00203290 e2112000 00000000 +# NOAPPLY-NEXT: 0x002032f0 00000000 00000000 +# APPLY-NEXT: 0x002032f0 e2112000 00000000 # 0x201173 + 7 - 10 = 0x201170 # 0x20117a + 7 - 17 = 0x201170 @@ -43,20 +43,20 @@ # DISASM-NEXT: leaq -17(%rip), %rax # DISASM-NEXT: leaq -23(%rip), %rax # DISASM-NEXT: leaq -30(%rip), %rax -# DISASM-NEXT: movq 8330(%rip), %rax -# DISASM-NEXT: movq 8323(%rip), %rax +# DISASM-NEXT: movq 8426(%rip), %rax +# DISASM-NEXT: movq 8419(%rip), %rax # DISASM-NEXT: leaq -52(%rip), %rax # DISASM-NEXT: leaq -59(%rip), %rax # DISASM-NEXT: leaq -65(%rip), %rax # DISASM-NEXT: leaq -72(%rip), %rax -# DISASM-NEXT: movq 8288(%rip), %rax -# DISASM-NEXT: movq 8281(%rip), %rax +# DISASM-NEXT: movq 8384(%rip), %rax +# DISASM-NEXT: movq 8377(%rip), %rax # DISASM-NEXT: callq 0x2011e0 # DISASM-NEXT: callq 0x2011e0 # DISASM-NEXT: callq 0x2011e1 # DISASM-NEXT: callq 0x2011e1 -# DISASM-NEXT: callq *8251(%rip) -# DISASM-NEXT: callq *8245(%rip) +# DISASM-NEXT: callq *8347(%rip) +# 
DISASM-NEXT: callq *8341(%rip) # DISASM-NEXT: jmp 0x2011e0 # DISASM-NEXT: nop # DISASM-NEXT: jmp 0x2011e0 @@ -65,13 +65,26 @@ # DISASM-NEXT: nop # DISASM-NEXT: jmp 0x2011e1 # DISASM-NEXT: nop -# DISASM-NEXT: jmpq *8215(%rip) -# DISASM-NEXT: jmpq *8209(%rip) +# DISASM-NEXT: jmpq *8311(%rip) +# DISASM-NEXT: jmpq *8305(%rip) +# DISASM-NEXT: leaq -167(%rip), %r16 +# DISASM-NEXT: leaq -175(%rip), %r16 +# DISASM-NEXT: leaq -182(%rip), %r16 +# DISASM-NEXT: leaq -190(%rip), %r16 +# DISASM-NEXT: movq 8265(%rip), %r16 +# DISASM-NEXT: movq 8257(%rip), %r16 +# DISASM-NEXT: leaq -215(%rip), %r16 +# DISASM-NEXT: leaq -223(%rip), %r16 +# DISASM-NEXT: leaq -230(%rip), %r16 +# DISASM-NEXT: leaq -238(%rip), %r16 +# DISASM-NEXT: movq 8217(%rip), %r16 +# DISASM-NEXT: movq 8209(%rip), %r16 # NORELAX-LABEL: <_start>: # NORELAX-COUNT-12: movq # NORELAX-COUNT-6: callq * # NORELAX-COUNT-6: jmpq * +# NORELAX-COUNT-12: movq .text .globl foo @@ -120,3 +133,16 @@ _start: jmp *hid@GOTPCREL(%rip) jmp *ifunc@GOTPCREL(%rip) jmp *ifunc@GOTPCREL(%rip) + + movq foo@GOTPCREL(%rip), %r16 + movq foo@GOTPCREL(%rip), %r16 + movq hid@GOTPCREL(%rip), %r16 + movq hid@GOTPCREL(%rip), %r16 + movq ifunc@GOTPCREL(%rip), %r16 + movq ifunc@GOTPCREL(%rip), %r16 + movq foo@GOTPCREL(%rip), %r16 + movq foo@GOTPCREL(%rip), %r16 + movq hid@GOTPCREL(%rip), %r16 + movq hid@GOTPCREL(%rip), %r16 + movq ifunc@GOTPCREL(%rip), %r16 + movq ifunc@GOTPCREL(%rip), %r16 diff --git a/lldb/docs/use/tutorial.rst b/lldb/docs/use/tutorial.rst index 00e7befdd087a..76e8ac4aeab89 100644 --- a/lldb/docs/use/tutorial.rst +++ b/lldb/docs/use/tutorial.rst @@ -536,6 +536,43 @@ This command will run the thread in the current frame until it reaches line 100 in this frame or stops if it leaves the current frame. This is a pretty close equivalent to GDB's ``until`` command. +One other useful thing to note about the lldb stepping commands is that they +are implemented as a stack of interruptible operations. Until the operation - +e.g. 
step to the next line - is completed, it will remain on the +stack. If the step over is interrupted and control returned to you, +any new stepping commands you issue won't replace the step-over, but instead +their operations will be pushed onto the stack after the original step over. +Then each of them will be retired as they are completed, finally returning to the +original step over operation. + +Suppose, for instance, you ``step-over`` a source line with a function call. +If there is a breakpoint in that function, hitting the breakpoint will interrupt +the step over. At that point, you will likely want to examine the state at +the breakpoint, maybe stepping around in that frame, or stepping into other +functions, running some expressions, etc. + +Because the original step-over has remained on the stack, when you've finished +your examinations, a simple ``continue`` will resume the original ``step-over`` +operation, and you will arrive at the end of your starting source line in the +original frame. + +This saves you from having to keep track of your original intention, and manually +issuing the requisite number of ``step-out`` commands to get back to the frame +you were stepping over. The stack maintains that information for you. + +Hand-called functions using the ``expr`` command are also implemented by +operations on this same stack. So if you are calling some code with the ``expr`` command, +and hit a breakpoint during the evaluation of that code, you can examine +the state where you stopped, and when you're satisfied, issue a +``continue`` to finish the expression evaluation operation and print the function +result. + +You can examine the state of the operations stack using the ``thread plan list`` +command, and if, for instance, you decide you don't actually want that outermost +next to continue running, you can remove it with the ``thread plan discard`` +command. 
If you are interested in following this process in more detail, the +``lldb step`` logging channel is useful source of information. + A process, by default, will share the LLDB terminal with the inferior process. When in this mode, much like when debugging with GDB, when the process is running anything you type will go to the ``STDIN`` of the inferior process. To diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h index 84ea9c0f772e1..6afa1c932ab05 100644 --- a/lldb/include/lldb/API/SBDebugger.h +++ b/lldb/include/lldb/API/SBDebugger.h @@ -304,6 +304,8 @@ class LLDB_API SBDebugger { bool GetUseColor() const; + bool SetShowInlineDiagnostics(bool); + bool SetUseSourceCache(bool use_source_cache); bool GetUseSourceCache() const; diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index a72c2596cc2c5..1d5f2fcc20626 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -364,6 +364,10 @@ class Debugger : public std::enable_shared_from_this, const std::string &GetInstanceName() { return m_instance_name; } + bool GetShowInlineDiagnostics() const; + + bool SetShowInlineDiagnostics(bool); + bool LoadPlugin(const FileSpec &spec, Status &error); void RunIOHandlers(); diff --git a/lldb/include/lldb/Expression/DiagnosticManager.h b/lldb/include/lldb/Expression/DiagnosticManager.h index d49b7c99b114f..b9a6421577781 100644 --- a/lldb/include/lldb/Expression/DiagnosticManager.h +++ b/lldb/include/lldb/Expression/DiagnosticManager.h @@ -12,6 +12,9 @@ #include "lldb/lldb-defines.h" #include "lldb/lldb-types.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/Status.h" + #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -20,6 +23,54 @@ namespace lldb_private { +/// A compiler-independent representation of a Diagnostic. 
Expression +/// evaluation failures often have more than one diagnostic that a UI +/// layer might want to render differently, for example to colorize +/// it. +/// +/// Running example: +/// (lldb) expr 1+foo +/// error: :1:3: use of undeclared identifier 'foo' +/// 1+foo +/// ^ +struct DiagnosticDetail { + struct SourceLocation { + FileSpec file; + unsigned line = 0; + uint16_t column = 0; + uint16_t length = 0; + bool hidden = false; + bool in_user_input = false; + }; + /// Contains {{}, 1, 3, 3, true} in the example above. + std::optional source_location; + /// Contains eSeverityError in the example above. + lldb::Severity severity = lldb::eSeverityInfo; + /// Contains "use of undeclared identifier 'x'" in the example above. + std::string message; + /// Contains the fully rendered error message. + std::string rendered; +}; + +/// An llvm::Error used to communicate diagnostics in Status. Multiple +/// diagnostics may be chained in an llvm::ErrorList. +class ExpressionError + : public llvm::ErrorInfo { + std::string m_message; + std::vector m_details; + +public: + static char ID; + using llvm::ErrorInfo::ErrorInfo; + ExpressionError(lldb::ExpressionResults result, std::string msg, + std::vector details = {}); + std::string message() const override; + llvm::ArrayRef GetDetails() const { return m_details; } + std::error_code convertToErrorCode() const override; + void log(llvm::raw_ostream &OS) const override; + std::unique_ptr Clone() const override; +}; + enum DiagnosticOrigin { eDiagnosticOriginUnknown = 0, eDiagnosticOriginLLDB, @@ -49,37 +100,28 @@ class Diagnostic { } } - Diagnostic(llvm::StringRef message, lldb::Severity severity, - DiagnosticOrigin origin, uint32_t compiler_id) - : m_message(message), m_severity(severity), m_origin(origin), - m_compiler_id(compiler_id) {} - - Diagnostic(const Diagnostic &rhs) - : m_message(rhs.m_message), m_severity(rhs.m_severity), - m_origin(rhs.m_origin), m_compiler_id(rhs.m_compiler_id) {} + Diagnostic(DiagnosticOrigin 
origin, uint32_t compiler_id, + DiagnosticDetail detail) + : m_origin(origin), m_compiler_id(compiler_id), m_detail(detail) {} virtual ~Diagnostic() = default; virtual bool HasFixIts() const { return false; } - lldb::Severity GetSeverity() const { return m_severity; } + lldb::Severity GetSeverity() const { return m_detail.severity; } uint32_t GetCompilerID() const { return m_compiler_id; } - llvm::StringRef GetMessage() const { return m_message; } + llvm::StringRef GetMessage() const { return m_detail.message; } + const DiagnosticDetail &GetDetail() const { return m_detail; } - void AppendMessage(llvm::StringRef message, - bool precede_with_newline = true) { - if (precede_with_newline) - m_message.push_back('\n'); - m_message += message; - } + void AppendMessage(llvm::StringRef message, bool precede_with_newline = true); protected: - std::string m_message; - lldb::Severity m_severity; DiagnosticOrigin m_origin; - uint32_t m_compiler_id; // Compiler-specific diagnostic ID + /// Compiler-specific diagnostic ID. + uint32_t m_compiler_id; + DiagnosticDetail m_detail; }; typedef std::vector> DiagnosticList; @@ -102,10 +144,7 @@ class DiagnosticManager { void AddDiagnostic(llvm::StringRef message, lldb::Severity severity, DiagnosticOrigin origin, - uint32_t compiler_id = LLDB_INVALID_COMPILER_ID) { - m_diagnostics.emplace_back( - std::make_unique(message, severity, origin, compiler_id)); - } + uint32_t compiler_id = LLDB_INVALID_COMPILER_ID); void AddDiagnostic(std::unique_ptr diagnostic) { if (diagnostic) @@ -130,6 +169,10 @@ class DiagnosticManager { m_diagnostics.back()->AppendMessage(str); } + /// Returns an \ref ExpressionError with \c arg as error code. 
+ llvm::Error GetAsError(lldb::ExpressionResults result, + llvm::Twine message = {}) const; + // Returns a string containing errors in this format: // // "error: error text\n diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h index 20c4769af9033..c5167e5e0ecb6 100644 --- a/lldb/include/lldb/Interpreter/CommandObject.h +++ b/lldb/include/lldb/Interpreter/CommandObject.h @@ -340,6 +340,13 @@ class CommandObject : public std::enable_shared_from_this { return false; } + /// Set the command input as it appeared in the terminal. This + /// is used to have errors refer directly to the original command. + void SetOriginalCommandString(std::string s) { m_original_command = s; } + + /// \param offset_in_command is on what column \c args_string + /// appears, if applicable. This enables diagnostics that refer back + /// to the user input. virtual void Execute(const char *args_string, CommandReturnObject &result) = 0; @@ -404,6 +411,7 @@ class CommandObject : public std::enable_shared_from_this { std::string m_cmd_help_short; std::string m_cmd_help_long; std::string m_cmd_syntax; + std::string m_original_command; Flags m_flags; std::vector m_arguments; lldb::CommandOverrideCallback m_deprecated_command_override_callback; diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index 084ce4afb8cef..3910c26d115a0 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -38,6 +38,7 @@ class CloneableError using llvm::ErrorInfo::ErrorInfo; CloneableError() : ErrorInfo() {} virtual std::unique_ptr Clone() const = 0; + virtual lldb::ErrorType GetErrorType() const = 0; static char ID; }; @@ -48,6 +49,7 @@ class CloneableECError using llvm::ErrorInfo::ErrorInfo; std::error_code convertToErrorCode() const override { return EC; } void log(llvm::raw_ostream &OS) const override { OS << EC.message(); } + lldb::ErrorType GetErrorType() const override; static char ID; 
protected: @@ -63,6 +65,7 @@ class MachKernelError MachKernelError(std::error_code ec) : ErrorInfo(ec) {} std::string message() const override; std::unique_ptr Clone() const override; + lldb::ErrorType GetErrorType() const override; static char ID; }; @@ -72,21 +75,18 @@ class Win32Error : public llvm::ErrorInfo { Win32Error(std::error_code ec, const llvm::Twine &msg = {}) : ErrorInfo(ec) {} std::string message() const override; std::unique_ptr Clone() const override; + lldb::ErrorType GetErrorType() const override; static char ID; }; -class ExpressionError - : public llvm::ErrorInfo { +class ExpressionErrorBase + : public llvm::ErrorInfo { public: - using llvm::ErrorInfo::ErrorInfo; - ExpressionError(std::error_code ec, std::string msg = {}) - : ErrorInfo(ec), m_string(msg) {} - std::unique_ptr Clone() const override; - std::string message() const override { return m_string; } + using llvm::ErrorInfo::ErrorInfo; + ExpressionErrorBase(std::error_code ec, std::string msg = {}) + : ErrorInfo(ec) {} + lldb::ErrorType GetErrorType() const override; static char ID; - -protected: - std::string m_string; }; /// \class Status Status.h "lldb/Utility/Status.h" An error handling class. @@ -160,9 +160,6 @@ class Status { return Status(llvm::formatv(format, std::forward(args)...)); } - static Status FromExpressionError(lldb::ExpressionResults result, - std::string msg); - /// Set the current error to errno. /// /// Update the error value to be \c errno and update the type to be \c @@ -175,8 +172,11 @@ class Status { /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. static Status FromError(llvm::Error error); - /// FIXME: Replace this with a takeError() method. + /// FIXME: Replace all uses with takeError() instead. llvm::Error ToError() const; + + llvm::Error takeError() { return std::move(m_error); } + /// Don't call this function in new code. Instead, redesign the API /// to use llvm::Expected instead of Status. 
Status Clone() const { return Status(ToError()); } diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 6b72994fc96af..47931f1c16f9a 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -1483,6 +1483,12 @@ bool SBDebugger::GetUseColor() const { return (m_opaque_sp ? m_opaque_sp->GetUseColor() : false); } +bool SBDebugger::SetShowInlineDiagnostics(bool value) { + LLDB_INSTRUMENT_VA(this, value); + + return (m_opaque_sp ? m_opaque_sp->SetShowInlineDiagnostics(value) : false); +} + bool SBDebugger::SetUseSourceCache(bool value) { LLDB_INSTRUMENT_VA(this, value); diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 8d7364052a006..35058a713aef8 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -264,9 +264,10 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, if (!m_user_expression_sp->Parse(diagnostics, exe_ctx, eExecutionPolicyOnlyWhenNeeded, true, false)) { - error = Status::FromErrorStringWithFormat( - "Couldn't parse conditional expression:\n%s", - diagnostics.GetString().c_str()); + error = Status::FromError( + diagnostics.GetAsError(lldb::eExpressionParseError, + "Couldn't parse conditional expression:")); + m_user_expression_sp.reset(); return true; } @@ -324,8 +325,8 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, } } else { ret = false; - error = Status::FromErrorStringWithFormat( - "Couldn't execute expression:\n%s", diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + lldb::eExpressionParseError, "Couldn't execute expression:")); } return ret; diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index 771194638e1b6..9722c85a79b78 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ 
b/lldb/source/Commands/CommandObjectExpression.cpp @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/StringRef.h" - #include "CommandObjectExpression.h" +#include "DiagnosticRendering.h" #include "lldb/Core/Debugger.h" +#include "lldb/Expression/DiagnosticManager.h" #include "lldb/Expression/ExpressionVariable.h" #include "lldb/Expression/REPL.h" #include "lldb/Expression/UserExpression.h" @@ -486,19 +486,34 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr, result.SetStatus(eReturnStatusSuccessFinishResult); } else { - const char *error_cstr = result_valobj_sp->GetError().AsCString(); - if (error_cstr && error_cstr[0]) { - const size_t error_cstr_len = strlen(error_cstr); - const bool ends_with_newline = error_cstr[error_cstr_len - 1] == '\n'; - if (strstr(error_cstr, "error:") != error_cstr) - error_stream.PutCString("error: "); - error_stream.Write(error_cstr, error_cstr_len); - if (!ends_with_newline) - error_stream.EOL(); + // Retrieve the diagnostics. + std::vector details; + llvm::consumeError(llvm::handleErrors( + result_valobj_sp->GetError().ToError(), + [&](ExpressionError &error) { details = error.GetDetails(); })); + // Find the position of the expression in the command. 
+ std::optional expr_pos; + size_t nchar = m_original_command.find(expr); + if (nchar != std::string::npos) + expr_pos = nchar + GetDebugger().GetPrompt().size(); + + if (!details.empty()) { + bool show_inline = + GetDebugger().GetShowInlineDiagnostics() && !expr.contains('\n'); + RenderDiagnosticDetails(error_stream, expr_pos, show_inline, details); } else { - error_stream.PutCString("error: unknown error\n"); + const char *error_cstr = result_valobj_sp->GetError().AsCString(); + llvm::StringRef error(error_cstr); + if (!error.empty()) { + if (!error.starts_with("error:")) + error_stream << "error: "; + error_stream << error; + if (!error.ends_with('\n')) + error_stream.EOL(); + } else { + error_stream << "error: unknown error\n"; + } } - result.SetStatus(eReturnStatusFailed); } } diff --git a/lldb/source/Commands/DiagnosticRendering.h b/lldb/source/Commands/DiagnosticRendering.h new file mode 100644 index 0000000000000..5fdd090253a82 --- /dev/null +++ b/lldb/source/Commands/DiagnosticRendering.h @@ -0,0 +1,133 @@ +//===-- DiagnosticRendering.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_COMMANDS_DIAGNOSTICRENDERING_H +#define LLDB_SOURCE_COMMANDS_DIAGNOSTICRENDERING_H + +#include "lldb/Expression/DiagnosticManager.h" +#include "lldb/Utility/Stream.h" +#include "llvm/Support/WithColor.h" + +namespace lldb_private { + +static llvm::raw_ostream &PrintSeverity(Stream &stream, + lldb::Severity severity) { + llvm::HighlightColor color; + llvm::StringRef text; + switch (severity) { + case lldb::eSeverityError: + color = llvm::HighlightColor::Error; + text = "error: "; + break; + case lldb::eSeverityWarning: + color = llvm::HighlightColor::Warning; + text = "warning: "; + break; + case lldb::eSeverityInfo: + color = llvm::HighlightColor::Remark; + text = "note: "; + break; + } + return llvm::WithColor(stream.AsRawOstream(), color, llvm::ColorMode::Enable) + << text; +} + +// Public for unittesting. +static void RenderDiagnosticDetails(Stream &stream, + std::optional offset_in_command, + bool show_inline, + llvm::ArrayRef details) { + if (details.empty()) + return; + + if (!offset_in_command) { + for (const DiagnosticDetail &detail : details) { + PrintSeverity(stream, detail.severity); + stream << detail.rendered << '\n'; + } + return; + } + + // Print a line with caret indicator(s) below the lldb prompt + command. 
+ const size_t padding = *offset_in_command; + stream << std::string(padding, ' '); + + size_t offset = 1; + std::vector remaining_details, other_details, + hidden_details; + for (const DiagnosticDetail &detail : details) { + if (!show_inline || !detail.source_location) { + other_details.push_back(detail); + continue; + } + if (detail.source_location->hidden) { + hidden_details.push_back(detail); + continue; + } + if (!detail.source_location->in_user_input) { + other_details.push_back(detail); + continue; + } + + auto &loc = *detail.source_location; + remaining_details.push_back(detail); + if (offset > loc.column) + continue; + stream << std::string(loc.column - offset, ' ') << '^'; + if (loc.length > 1) + stream << std::string(loc.length - 1, '~'); + offset = loc.column + 1; + } + stream << '\n'; + + // Work through each detail in reverse order using the vector/stack. + bool did_print = false; + for (auto detail = remaining_details.rbegin(); + detail != remaining_details.rend(); + ++detail, remaining_details.pop_back()) { + // Get the information to print this detail and remove it from the stack. + // Print all the lines for all the other messages first. + stream << std::string(padding, ' '); + size_t offset = 1; + for (auto &remaining_detail : + llvm::ArrayRef(remaining_details).drop_back(1)) { + uint16_t column = remaining_detail.source_location->column; + stream << std::string(column - offset, ' ') << "│"; + offset = column + 1; + } + + // Print the line connecting the ^ with the error message. + uint16_t column = detail->source_location->column; + if (offset <= column) + stream << std::string(column - offset, ' ') << "╰─ "; + + // Print a colorized string based on the message's severity type. + PrintSeverity(stream, detail->severity); + + // Finally, print the message and start a new line. + stream << detail->message << '\n'; + did_print = true; + } + + // Print the non-located details. 
+ for (const DiagnosticDetail &detail : other_details) { + PrintSeverity(stream, detail.severity); + stream << detail.rendered << '\n'; + did_print = true; + } + + // Print the hidden details as a last resort. + if (!did_print) + for (const DiagnosticDetail &detail : hidden_details) { + PrintSeverity(stream, detail.severity); + stream << detail.rendered << '\n'; + } +} + +} // namespace lldb_private +#endif diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index a6cb951187a04..e11aad2660b46 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -225,4 +225,8 @@ let Definition = "debugger" in { DefaultEnumValue<"eDWIMPrintVerbosityNone">, EnumValues<"OptionEnumValues(g_dwim_print_verbosities)">, Desc<"The verbosity level used by dwim-print.">; + def ShowInlineDiagnostics: Property<"show-inline-diagnostics", "Boolean">, + Global, + DefaultFalse, + Desc<"Controls whether diagnostics can refer directly to the command input, drawing arrows to it. 
If false, diagnostics will echo the input.">; } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 9bdc5a3949751..e6b9eedd89b4e 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -592,7 +592,18 @@ lldb::DWIMPrintVerbosity Debugger::GetDWIMPrintVerbosity() const { const uint32_t idx = ePropertyDWIMPrintVerbosity; return GetPropertyAtIndexAs( idx, static_cast( - g_debugger_properties[idx].default_uint_value)); + g_debugger_properties[idx].default_uint_value != 0)); +} + +bool Debugger::GetShowInlineDiagnostics() const { + const uint32_t idx = ePropertyShowInlineDiagnostics; + return GetPropertyAtIndexAs( + idx, g_debugger_properties[idx].default_uint_value); +} + +bool Debugger::SetShowInlineDiagnostics(bool b) { + const uint32_t idx = ePropertyShowInlineDiagnostics; + return SetPropertyAtIndex(idx, b); } #pragma mark Debugger diff --git a/lldb/source/Expression/DiagnosticManager.cpp b/lldb/source/Expression/DiagnosticManager.cpp index a8330138f3d53..7c67a0ce4aa02 100644 --- a/lldb/source/Expression/DiagnosticManager.cpp +++ b/lldb/source/Expression/DiagnosticManager.cpp @@ -14,24 +14,30 @@ #include "lldb/Utility/StreamString.h" using namespace lldb_private; +char ExpressionError::ID; -void DiagnosticManager::Dump(Log *log) { - if (!log) - return; - - std::string str = GetString(); - - // GetString() puts a separator after each diagnostic. We want to remove the - // last '\n' because log->PutCString will add one for us. - - if (str.size() && str.back() == '\n') { - str.pop_back(); +/// A std::error_code category for eErrorTypeExpression. 
+class ExpressionCategory : public std::error_category { + const char *name() const noexcept override { + return "LLDBExpressionCategory"; } - - log->PutCString(str.c_str()); + std::string message(int __ev) const override { + return ExpressionResultAsCString( + static_cast(__ev)); + }; +}; +ExpressionCategory &expression_category() { + static ExpressionCategory g_expression_category; + return g_expression_category; } -static const char *StringForSeverity(lldb::Severity severity) { +ExpressionError::ExpressionError(lldb::ExpressionResults result, + std::string msg, + std::vector details) + : ErrorInfo(std::error_code(result, expression_category())), m_message(msg), + m_details(details) {} + +static llvm::StringRef StringForSeverity(lldb::Severity severity) { switch (severity) { // this should be exhaustive case lldb::eSeverityError: @@ -44,9 +50,33 @@ static const char *StringForSeverity(lldb::Severity severity) { llvm_unreachable("switch needs another case for lldb::Severity enum"); } +std::string ExpressionError::message() const { + std::string str; + { + llvm::raw_string_ostream os(str); + if (!m_message.empty()) + os << m_message << '\n'; + for (const auto &detail : m_details) + os << StringForSeverity(detail.severity) << detail.rendered << '\n'; + } + return str; +} + +std::error_code ExpressionError::convertToErrorCode() const { + return llvm::inconvertibleErrorCode(); +} + +void ExpressionError::log(llvm::raw_ostream &OS) const { OS << message(); } + +std::unique_ptr ExpressionError::Clone() const { + return std::make_unique( + (lldb::ExpressionResults)convertToErrorCode().value(), m_message, + m_details); +} + std::string DiagnosticManager::GetString(char separator) { - std::string ret; - llvm::raw_string_ostream stream(ret); + std::string str; + llvm::raw_string_ostream stream(str); for (const auto &diagnostic : Diagnostics()) { llvm::StringRef severity = StringForSeverity(diagnostic->GetSeverity()); @@ -61,8 +91,39 @@ std::string 
DiagnosticManager::GetString(char separator) { stream << message.drop_front(severity_pos + severity.size()); stream << separator; } + return str; +} + +void DiagnosticManager::Dump(Log *log) { + if (!log) + return; - return ret; + std::string str = GetString(); + + // We want to remove the last '\n' because log->PutCString will add + // one for us. + + if (!str.empty() && str.back() == '\n') + str.pop_back(); + + log->PutString(str); +} + +llvm::Error DiagnosticManager::GetAsError(lldb::ExpressionResults result, + llvm::Twine message) const { + std::vector details; + for (const auto &diag : m_diagnostics) + details.push_back(diag->GetDetail()); + return llvm::make_error(result, message.str(), details); +} + +void DiagnosticManager::AddDiagnostic(llvm::StringRef message, + lldb::Severity severity, + DiagnosticOrigin origin, + uint32_t compiler_id) { + m_diagnostics.emplace_back(std::make_unique( + origin, compiler_id, + DiagnosticDetail{{}, severity, message.str(), message.str()})); } size_t DiagnosticManager::Printf(lldb::Severity severity, const char *format, @@ -85,3 +146,13 @@ void DiagnosticManager::PutString(lldb::Severity severity, return; AddDiagnostic(str, severity, eDiagnosticOriginLLDB); } + +void Diagnostic::AppendMessage(llvm::StringRef message, + bool precede_with_newline) { + if (precede_with_newline) { + m_detail.message.push_back('\n'); + m_detail.rendered.push_back('\n'); + } + m_detail.message += message; + m_detail.rendered += message; +} diff --git a/lldb/source/Expression/ExpressionParser.cpp b/lldb/source/Expression/ExpressionParser.cpp index 868556c1c58a5..1ba5e10d65d05 100644 --- a/lldb/source/Expression/ExpressionParser.cpp +++ b/lldb/source/Expression/ExpressionParser.cpp @@ -63,9 +63,8 @@ ExpressionParser::RunStaticInitializers(IRExecutionUnitSP &execution_unit_sp, exe_ctx, call_static_initializer, options, execution_errors); if (results != eExpressionCompleted) { - err = Status::FromErrorStringWithFormat( - "couldn't run static 
initializer: %s", - execution_errors.GetString().c_str()); + err = Status::FromError(execution_errors.GetAsError( + lldb::eExpressionSetupError, "couldn't run static initializer:")); return err; } } diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp index 872f6304f91ba..b3c81af24893d 100644 --- a/lldb/source/Expression/UserExpression.cpp +++ b/lldb/source/Expression/UserExpression.cpp @@ -328,18 +328,20 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, } if (!parse_success) { - std::string msg; - { - llvm::raw_string_ostream os(msg); - if (!diagnostic_manager.Diagnostics().empty()) - os << diagnostic_manager.GetString(); - else - os << "expression failed to parse (no further compiler diagnostics)"; - if (target->GetEnableNotifyAboutFixIts() && fixed_expression && - !fixed_expression->empty()) - os << "\nfixed expression suggested:\n " << *fixed_expression; + if (target->GetEnableNotifyAboutFixIts() && fixed_expression && + !fixed_expression->empty()) { + std::string fixit = + "fixed expression suggested:\n " + *fixed_expression; + diagnostic_manager.AddDiagnostic(fixit, lldb::eSeverityInfo, + eDiagnosticOriginLLDB); } - error = Status::FromExpressionError(execution_results, msg); + if (diagnostic_manager.Diagnostics().empty()) + error = Status::FromError(llvm::make_error( + execution_results, + "expression failed to parse (no further compiler diagnostics)")); + else + error = + Status::FromError(diagnostic_manager.GetAsError(execution_results)); } } @@ -351,18 +353,18 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, LLDB_LOG(log, "== [UserExpression::Evaluate] Expression may not run, but " "is not constant =="); - if (!diagnostic_manager.Diagnostics().size()) - error = Status::FromExpressionError( + if (diagnostic_manager.Diagnostics().empty()) + error = Status::FromError(llvm::make_error( lldb::eExpressionSetupError, - "expression needed to run but couldn't"); + "expression needed to run but couldn't")); } 
else if (execution_policy == eExecutionPolicyTopLevel) { error = Status(UserExpression::kNoResult, lldb::eErrorTypeGeneric); return lldb::eExpressionCompleted; } else { if (options.InvokeCancelCallback(lldb::eExpressionEvaluationExecution)) { - error = Status::FromExpressionError( + error = Status::FromError(llvm::make_error( lldb::eExpressionInterrupted, - "expression interrupted by callback before execution"); + "expression interrupted by callback before execution")); result_valobj_sp = ValueObjectConstResult::Create( exe_ctx.GetBestExecutionContextScope(), std::move(error)); return lldb::eExpressionInterrupted; @@ -380,12 +382,13 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, LLDB_LOG(log, "== [UserExpression::Evaluate] Execution completed " "abnormally =="); - if (!diagnostic_manager.Diagnostics().size()) - error = Status::FromExpressionError( - execution_results, "expression failed to execute, unknown error"); + if (diagnostic_manager.Diagnostics().empty()) + error = Status::FromError(llvm::make_error( + execution_results, + "expression failed to execute, unknown error")); else - error = Status::FromExpressionError(execution_results, - diagnostic_manager.GetString()); + error = Status::FromError( + diagnostic_manager.GetAsError(execution_results)); } else { if (expr_result) { result_valobj_sp = expr_result->GetValueObject(); @@ -407,9 +410,9 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, } if (options.InvokeCancelCallback(lldb::eExpressionEvaluationComplete)) { - error = Status::FromExpressionError( + error = Status::FromError(llvm::make_error( lldb::eExpressionInterrupted, - "expression interrupted by callback after complete"); + "expression interrupted by callback after complete")); return lldb::eExpressionInterrupted; } diff --git a/lldb/source/Expression/UtilityFunction.cpp b/lldb/source/Expression/UtilityFunction.cpp index 55ebfb8ef342e..97c226ae1c5f9 100644 --- a/lldb/source/Expression/UtilityFunction.cpp +++ 
b/lldb/source/Expression/UtilityFunction.cpp @@ -83,19 +83,19 @@ FunctionCaller *UtilityFunction::MakeFunctionCaller( m_caller_up.reset(process_sp->GetTarget().GetFunctionCallerForLanguage( Language().AsLanguageType(), return_type, impl_code_address, arg_value_list, name.c_str(), error)); - if (error.Fail()) { - + if (error.Fail()) return nullptr; - } + if (m_caller_up) { DiagnosticManager diagnostics; unsigned num_errors = m_caller_up->CompileFunction(thread_to_use_sp, diagnostics); if (num_errors) { - error = Status::FromErrorStringWithFormat( - "Error compiling %s caller function: \"%s\".", - m_function_name.c_str(), diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + lldb::eExpressionParseError, + "Error compiling " + m_function_name + " caller function:")); + m_caller_up.reset(); return nullptr; } @@ -104,9 +104,9 @@ FunctionCaller *UtilityFunction::MakeFunctionCaller( ExecutionContext exe_ctx(process_sp); if (!m_caller_up->WriteFunctionWrapper(exe_ctx, diagnostics)) { - error = Status::FromErrorStringWithFormat( - "Error inserting caller function for %s: \"%s\".", - m_function_name.c_str(), diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + lldb::eExpressionSetupError, + "Error inserting " + m_function_name + " caller function:")); m_caller_up.reset(); return nullptr; } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index acd592c3bd2db..d17aa6fec1f00 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1887,7 +1887,8 @@ bool CommandInterpreter::HandleCommand(const char *command_line, CommandReturnObject &result, bool force_repeat_command) { std::string command_string(command_line); - std::string original_command_string(command_line); + std::string original_command_string(command_string); + std::string real_original_command_string(command_string); Log *log = 
GetLog(LLDBLog::Commands); llvm::PrettyStackTraceFormat stack_trace("HandleCommand(command = \"%s\")", @@ -2076,6 +2077,7 @@ bool CommandInterpreter::HandleCommand(const char *command_line, } ElapsedTime elapsed(execute_time); + cmd_obj->SetOriginalCommandString(real_original_command_string); cmd_obj->Execute(remainder.c_str(), result); } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h index 21abd71cc34ee..c473df808ee8d 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h @@ -29,9 +29,8 @@ class ClangDiagnostic : public Diagnostic { return diag->getKind() == eDiagnosticOriginClang; } - ClangDiagnostic(llvm::StringRef message, lldb::Severity severity, - uint32_t compiler_id) - : Diagnostic(message, severity, eDiagnosticOriginClang, compiler_id) {} + ClangDiagnostic(DiagnosticDetail detail, uint32_t compiler_id) + : Diagnostic(eDiagnosticOriginClang, compiler_id, detail) {} ~ClangDiagnostic() override = default; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 4eeac372a2e65..9b056ea73a77f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -26,6 +26,7 @@ #include "clang/Frontend/FrontendActions.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/FrontendPluginRegistry.h" +#include "clang/Frontend/TextDiagnostic.h" #include "clang/Frontend/TextDiagnosticBuffer.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Lex/Preprocessor.h" @@ -161,7 +162,8 @@ static void AddAllFixIts(ClangDiagnostic *diag, const clang::Diagnostic &Info) { class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { public: - 
ClangDiagnosticManagerAdapter(DiagnosticOptions &opts) { + ClangDiagnosticManagerAdapter(DiagnosticOptions &opts, StringRef filename) + : m_filename(filename) { DiagnosticOptions *options = new DiagnosticOptions(opts); options->ShowPresumedLoc = true; options->ShowLevel = false; @@ -174,15 +176,22 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { m_manager = manager; } - /// Returns the last ClangDiagnostic message that the DiagnosticManager - /// received or a nullptr if the DiagnosticMangager hasn't seen any - /// Clang diagnostics yet. + /// Returns the last error ClangDiagnostic message that the + /// DiagnosticManager received or a nullptr. ClangDiagnostic *MaybeGetLastClangDiag() const { if (m_manager->Diagnostics().empty()) return nullptr; - lldb_private::Diagnostic *diag = m_manager->Diagnostics().back().get(); - ClangDiagnostic *clang_diag = dyn_cast(diag); - return clang_diag; + auto &diags = m_manager->Diagnostics(); + for (auto it = diags.rbegin(); it != diags.rend(); it++) { + lldb_private::Diagnostic *diag = it->get(); + if (ClangDiagnostic *clang_diag = dyn_cast(diag)) { + if (clang_diag->GetSeverity() == lldb::eSeverityWarning) + return nullptr; + if (clang_diag->GetSeverity() == lldb::eSeverityError) + return clang_diag; + } + } + return nullptr; } void HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, @@ -211,25 +220,20 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { m_output.clear(); m_passthrough->HandleDiagnostic(DiagLevel, Info); - lldb::Severity severity; - bool make_new_diagnostic = true; - + DiagnosticDetail detail; switch (DiagLevel) { case DiagnosticsEngine::Level::Fatal: case DiagnosticsEngine::Level::Error: - severity = lldb::eSeverityError; + detail.severity = lldb::eSeverityError; break; case DiagnosticsEngine::Level::Warning: - severity = lldb::eSeverityWarning; + detail.severity = lldb::eSeverityWarning; break; case DiagnosticsEngine::Level::Remark: case 
DiagnosticsEngine::Level::Ignored: - severity = lldb::eSeverityInfo; + detail.severity = lldb::eSeverityInfo; break; case DiagnosticsEngine::Level::Note: - m_manager->AppendMessageToDiagnostic(m_output); - make_new_diagnostic = false; - // 'note:' diagnostics for errors and warnings can also contain Fix-Its. // We add these Fix-Its to the last error diagnostic to make sure // that we later have all Fix-Its related to an 'error' diagnostic when @@ -247,24 +251,55 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { AddAllFixIts(clang_diag, Info); break; } - if (make_new_diagnostic) { // ClangDiagnostic messages are expected to have no whitespace/newlines // around them. std::string stripped_output = std::string(llvm::StringRef(m_output).trim()); - auto new_diagnostic = std::make_unique( - stripped_output, severity, Info.getID()); + // Translate the source location. + if (Info.hasSourceManager()) { + DiagnosticDetail::SourceLocation loc; + clang::SourceManager &sm = Info.getSourceManager(); + const clang::SourceLocation sloc = Info.getLocation(); + if (sloc.isValid()) { + const clang::FullSourceLoc fsloc(sloc, sm); + clang::PresumedLoc PLoc = fsloc.getPresumedLoc(true); + StringRef filename = + PLoc.isValid() ? PLoc.getFilename() : StringRef{}; + loc.file = FileSpec(filename); + loc.line = fsloc.getSpellingLineNumber(); + loc.column = fsloc.getSpellingColumnNumber(); + loc.in_user_input = filename == m_filename; + loc.hidden = filename.starts_with(" loc.column) + loc.length = end_col - loc.column; + break; + } + } + detail.source_location = loc; + } + } + llvm::SmallString<0> msg; + Info.FormatDiagnostic(msg); + detail.message = msg.str(); + detail.rendered = stripped_output; + auto new_diagnostic = + std::make_unique(detail, Info.getID()); // Don't store away warning fixits, since the compiler doesn't have // enough context in an expression for the warning to be useful. 
// FIXME: Should we try to filter out FixIts that apply to our generated // code, and not the user's expression? - if (severity == lldb::eSeverityError) + if (detail.severity == lldb::eSeverityError) AddAllFixIts(new_diagnostic.get(), Info); m_manager->AddDiagnostic(std::move(new_diagnostic)); - } } void BeginSourceFile(const LangOptions &LO, const Preprocessor *PP) override { @@ -280,6 +315,7 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { std::shared_ptr m_os; /// Output string filled by m_os. std::string m_output; + StringRef m_filename; }; /// Returns true if the SDK for the specified triple supports @@ -710,8 +746,8 @@ ClangExpressionParser::ClangExpressionParser( // 4. Set language options. SetupLangOpts(*m_compiler, *exe_scope, expr); - if (auto *clang_expr = dyn_cast(&m_expr); - clang_expr && clang_expr->DidImportCxxModules()) { + auto *clang_expr = dyn_cast(&m_expr); + if (clang_expr && clang_expr->DidImportCxxModules()) { LLDB_LOG(log, "Adding lang options for importing C++ modules"); SetupImportStdModuleLangOpts(*m_compiler, *target_sp); SetupModuleHeaderPaths(m_compiler.get(), m_include_directories, target_sp); @@ -738,9 +774,9 @@ ClangExpressionParser::ClangExpressionParser( m_compiler->getLangOpts()); // 5. Set up the diagnostic buffer for reporting errors - auto diag_mgr = new ClangDiagnosticManagerAdapter( - m_compiler->getDiagnostics().getDiagnosticOptions()); + m_compiler->getDiagnostics().getDiagnosticOptions(), + clang_expr ? clang_expr->GetFilename() : StringRef()); m_compiler->getDiagnostics().setClient(diag_mgr); // 6. 
Set up the source management objects inside the compiler @@ -1502,13 +1538,9 @@ lldb_private::Status ClangExpressionParser::DoPrepareForExecution( new ClangDynamicCheckerFunctions(); DiagnosticManager install_diags; - if (Error Err = dynamic_checkers->Install(install_diags, exe_ctx)) { - std::string ErrMsg = "couldn't install checkers: " + toString(std::move(Err)); - if (install_diags.Diagnostics().size()) - ErrMsg = ErrMsg + "\n" + install_diags.GetString().c_str(); - err = Status(ErrMsg); - return err; - } + if (Error Err = dynamic_checkers->Install(install_diags, exe_ctx)) + return Status::FromError(install_diags.GetAsError( + lldb::eExpressionSetupError, "couldn't install checkers:")); process->SetDynamicCheckers(dynamic_checkers); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h index 09604feea5dec..7c0c6a0147e2a 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h @@ -177,6 +177,8 @@ class ClangUserExpression : public LLVMUserExpression { /// Returns true iff this expression is using any imported C++ modules. bool DidImportCxxModules() const { return !m_imported_cpp_modules.empty(); } + llvm::StringRef GetFilename() const { return m_filename; } + private: /// Populate m_in_cplusplus_method and m_in_objectivec_method based on the /// environment. 
diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index 31315e46ca168..e9830c9f8722b 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -863,9 +863,9 @@ uint32_t PlatformPOSIX::DoLoadImage(lldb_private::Process *process, func_args_addr, arguments, diagnostics)) { - error = Status::FromErrorStringWithFormat( - "dlopen error: could not write function arguments: %s", - diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + lldb::eExpressionSetupError, + "dlopen error: could not write function arguments:")); return LLDB_INVALID_IMAGE_TOKEN; } @@ -906,9 +906,9 @@ uint32_t PlatformPOSIX::DoLoadImage(lldb_private::Process *process, ExpressionResults results = do_dlopen_function->ExecuteFunction( exe_ctx, &func_args_addr, options, diagnostics, return_value); if (results != eExpressionCompleted) { - error = Status::FromErrorStringWithFormat( - "dlopen error: failed executing dlopen wrapper function: %s", - diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + lldb::eExpressionSetupError, + "dlopen error: failed executing dlopen wrapper function:")); return LLDB_INVALID_IMAGE_TOKEN; } diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 7352d6f33f217..3936b8367fb83 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -341,9 +341,9 @@ uint32_t PlatformWindows::DoLoadImage(Process *process, diagnostics.Clear(); if (!invocation->WriteFunctionArguments(context, injected_parameters, parameters, diagnostics)) { - error = Status::FromErrorStringWithFormat( - "LoadLibrary error: unable to write function parameters: %s", - diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + 
eExpressionSetupError, + "LoadLibrary error: unable to write function parameters:")); return LLDB_INVALID_IMAGE_TOKEN; } @@ -384,9 +384,9 @@ uint32_t PlatformWindows::DoLoadImage(Process *process, invocation->ExecuteFunction(context, &injected_parameters, options, diagnostics, value); if (result != eExpressionCompleted) { - error = Status::FromErrorStringWithFormat( - "LoadLibrary error: failed to execute LoadLibrary helper: %s", - diagnostics.GetString().c_str()); + error = Status::FromError(diagnostics.GetAsError( + eExpressionSetupError, + "LoadLibrary error: failed to execute LoadLibrary helper:")); return LLDB_INVALID_IMAGE_TOKEN; } diff --git a/lldb/source/Plugins/TraceExporter/ctf/CommandObjectThreadTraceExportCTF.h b/lldb/source/Plugins/TraceExporter/ctf/CommandObjectThreadTraceExportCTF.h index 1a034e87cfb65..06834edf14ea1 100644 --- a/lldb/source/Plugins/TraceExporter/ctf/CommandObjectThreadTraceExportCTF.h +++ b/lldb/source/Plugins/TraceExporter/ctf/CommandObjectThreadTraceExportCTF.h @@ -48,7 +48,7 @@ class CommandObjectThreadTraceExportCTF : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } protected: - void DoExecute(Args &command, CommandReturnObject &result) override; + void DoExecute(Args &args, CommandReturnObject &result) override; CommandOptions m_options; }; diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 6123e5b9c2090..04395e37f0425 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -2680,7 +2680,8 @@ Target::CreateUtilityFunction(std::string expression, std::string name, DiagnosticManager diagnostics; if (!utility_fn->Install(diagnostics, exe_ctx)) - return llvm::createStringError(diagnostics.GetString()); + return diagnostics.GetAsError(lldb::eExpressionSetupError, + "Could not install utility function:"); return std::move(utility_fn); } diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 
7f73962c7fc9a..cf3772bc480ba 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -43,7 +43,7 @@ char CloneableError::ID; char CloneableECError::ID; char MachKernelError::ID; char Win32Error::ID; -char ExpressionError::ID; +char ExpressionErrorBase::ID; namespace { /// A std::error_code category for eErrorTypeGeneric. @@ -55,21 +55,6 @@ LLDBGenericCategory &lldb_generic_category() { static LLDBGenericCategory g_generic_category; return g_generic_category; } - -/// A std::error_code category for eErrorTypeExpression. -class ExpressionCategory : public std::error_category { - const char *name() const noexcept override { - return "LLDBExpressionCategory"; - } - std::string message(int __ev) const override { - return ExpressionResultAsCString( - static_cast(__ev)); - }; -}; -ExpressionCategory &expression_category() { - static ExpressionCategory g_expression_category; - return g_expression_category; -} } // namespace Status::Status() : m_error(llvm::Error::success()) {} @@ -132,12 +117,6 @@ Status Status::FromErrorStringWithFormat(const char *format, ...) { return Status(string); } -Status Status::FromExpressionError(lldb::ExpressionResults result, - std::string msg) { - return Status(llvm::make_error( - std::error_code(result, expression_category()), msg)); -} - /// Creates a deep copy of all known errors and converts all other /// errors to a new llvm::StringError. static llvm::Error CloneError(const llvm::Error &error) { @@ -211,10 +190,6 @@ std::unique_ptr Win32Error::Clone() const { return std::make_unique(convertToErrorCode()); } -std::unique_ptr ExpressionError::Clone() const { - return std::make_unique(convertToErrorCode(), message()); -} - // Get the error value as a NULL C string. The error string will be fetched and // cached on demand. The cached error string value will remain until the error // value is changed or cleared. 
@@ -257,26 +232,38 @@ Status::ValueType Status::GetError() const { return result; } -// Access the error type. +static ErrorType ErrorCodeToErrorType(std::error_code ec) { + if (ec.category() == std::generic_category()) + return eErrorTypePOSIX; + if (ec.category() == lldb_generic_category() || + ec == llvm::inconvertibleErrorCode()) + return eErrorTypeGeneric; + return eErrorTypeInvalid; +} + +ErrorType CloneableECError::GetErrorType() const { + return ErrorCodeToErrorType(EC); +} + +lldb::ErrorType MachKernelError::GetErrorType() const { + return lldb::eErrorTypeMachKernel; +} + +lldb::ErrorType Win32Error::GetErrorType() const { + return lldb::eErrorTypeWin32; +} + +lldb::ErrorType ExpressionErrorBase::GetErrorType() const { + return lldb::eErrorTypeExpression; +} + ErrorType Status::GetType() const { ErrorType result = eErrorTypeInvalid; llvm::visitErrors(m_error, [&](const llvm::ErrorInfoBase &error) { // Return the first only. if (result != eErrorTypeInvalid) return; - if (error.isA()) - result = eErrorTypeMachKernel; - else if (error.isA()) - result = eErrorTypeWin32; - else if (error.isA()) - result = eErrorTypeExpression; - else if (error.convertToErrorCode().category() == std::generic_category()) - result = eErrorTypePOSIX; - else if (error.convertToErrorCode().category() == lldb_generic_category() || - error.convertToErrorCode() == llvm::inconvertibleErrorCode()) - result = eErrorTypeGeneric; - else - result = eErrorTypeInvalid; + result = ErrorCodeToErrorType(error.convertToErrorCode()); }); return result; } diff --git a/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py b/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py index ddc1c3598480c..1687b617350d9 100644 --- a/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py +++ b/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py @@ -183,3 +183,54 @@ def test_source_locations_from_objc_modules(self): # The NSLog definition source line 
should be printed. Return value and # the first argument are probably stable enough that this test can check for them. self.assertIn("void NSLog(NSString *format", value.GetError().GetCString()) + + def test_command_expr_formatting(self): + """Test that the source and caret positions LLDB prints are correct""" + self.build() + + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// Break here", self.main_source_spec + ) + frame = thread.GetFrameAtIndex(0) + self.expect("settings set show-inline-diagnostics true") + + def check(input_ref): + self.expect(input_ref[0], error=True, substrs=input_ref[1:]) + + check( + [ + "expression -- a+b", + " ^ ^", + " │ ╰─ error: use of undeclared identifier 'b'", + " ╰─ error: use of undeclared identifier 'a'", + ] + ) + + check( + [ + "expr -- a", + " ^", + " ╰─ error: use of undeclared identifier 'a'", + ] + ) + check( + [ + "expr -i 0 -o 0 -- a", + " ^", + " ╰─ error: use of undeclared identifier 'a'", + ] + ) + + self.expect( + "expression --top-level -- template T FOO(T x) { return x/2;}" + ) + check( + [ + 'expression -- FOO("")', + " ^", + " ╰─ note: in instantiation of function template specialization 'FOO' requested here", + "error: b2", error=True, substrs=[ - "warning: :1:4: '<=>' is a single token in C++20; add a space to avoid a change in behavior" + "warning:", + "'<=>' is a single token in C++20; add a space to avoid a change in behavior", ], ) diff --git a/lldb/test/API/lang/mixed/TestMixedLanguages.py b/lldb/test/API/lang/mixed/TestMixedLanguages.py index 8b73254cce4a9..1637d59a5edcb 100644 --- a/lldb/test/API/lang/mixed/TestMixedLanguages.py +++ b/lldb/test/API/lang/mixed/TestMixedLanguages.py @@ -40,7 +40,7 @@ def cleanup(): self.runCmd("run") self.expect("thread backtrace", substrs=["`main", "lang=c"]) # Make sure evaluation of C++11 fails. 
- self.expect("expr foo != nullptr", error=True, startstr="error") + self.expect("expr foo != nullptr", error=True, substrs=["error"]) # Run to BP at foo (in foo.cpp) and test that the language is C++. self.runCmd("breakpoint set -n foo") diff --git a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py index 620b6e44fc852..36e302be2525b 100644 --- a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py +++ b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py @@ -21,7 +21,7 @@ def test(self): "expr @import LLDBTestModule", error=True, substrs=[ - "module.h:4:1: use of undeclared identifier 'syntax_error_for_lldb_to_find'", + "module.h:4:1: error: use of undeclared identifier 'syntax_error_for_lldb_to_find'", "syntax_error_for_lldb_to_find // comment that tests source printing", "could not build module 'LLDBTestModule'", ], diff --git a/lldb/test/Shell/Expr/TestObjCIDCast.test b/lldb/test/Shell/Expr/TestObjCIDCast.test index 0611171da09e2..19ca404643c1d 100644 --- a/lldb/test/Shell/Expr/TestObjCIDCast.test +++ b/lldb/test/Shell/Expr/TestObjCIDCast.test @@ -6,4 +6,4 @@ // RUN: 2>&1 | FileCheck %s // CHECK: (lldb) expression --language objc -- *(id)0x1 -// CHECK: error: Couldn't apply expression side effects : Couldn't dematerialize a result variable: couldn't read its memory +// CHECK: error:{{.*}}Couldn't apply expression side effects : Couldn't dematerialize a result variable: couldn't read its memory diff --git a/lldb/test/Shell/Expr/TestObjCInCXXContext.test b/lldb/test/Shell/Expr/TestObjCInCXXContext.test index 8537799bdeb67..f8cad5b58a1e5 100644 --- a/lldb/test/Shell/Expr/TestObjCInCXXContext.test +++ b/lldb/test/Shell/Expr/TestObjCInCXXContext.test @@ -18,4 +18,4 @@ // CHECK-NEXT: (NSString *){{.*}}= nil // CHECK: (lldb) expression NSString -// CHECK-NEXT: error:{{.*}} use of undeclared identifier 'NSString' +// CHECK: 
error:{{.*}}use of undeclared identifier 'NSString' diff --git a/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp b/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp index a249057282d89..8c16828690301 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp @@ -13,9 +13,7 @@ // CHECK: (lldb) expression d // CHECK: (D) $1 = {} // CHECK: (lldb) expression static_e_ref -// CHECK: error: {{.*}}incomplete type 'E' where a complete type is required -// CHECK: static_e_ref -// CHECK: ^ +// CHECK: error:{{.*}}incomplete type 'E' where a complete type is required // Complete base class. struct A { int x; A(); }; diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 14371da64f2f2..afb1a1ff95c3a 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -442,6 +442,7 @@ int Driver::MainLoop() { m_debugger.SetInputFileHandle(stdin, false); m_debugger.SetUseExternalEditor(m_option_data.m_use_external_editor); + m_debugger.SetShowInlineDiagnostics(true); struct winsize window_size; if ((isatty(STDIN_FILENO) != 0) && diff --git a/lldb/unittests/Expression/DiagnosticManagerTest.cpp b/lldb/unittests/Expression/DiagnosticManagerTest.cpp index 05fe7c164d681..7e04d4b023e4c 100644 --- a/lldb/unittests/Expression/DiagnosticManagerTest.cpp +++ b/lldb/unittests/Expression/DiagnosticManagerTest.cpp @@ -19,8 +19,8 @@ class FixItDiag : public Diagnostic { public: FixItDiag(llvm::StringRef msg, bool has_fixits) - : Diagnostic(msg, lldb::eSeverityError, - DiagnosticOrigin::eDiagnosticOriginLLDB, custom_diag_id), + : Diagnostic(DiagnosticOrigin::eDiagnosticOriginLLDB, custom_diag_id, + DiagnosticDetail{{}, lldb::eSeverityError, msg.str(), {}}), m_has_fixits(has_fixits) {} bool HasFixIts() const override { return m_has_fixits; } }; @@ -30,8 +30,8 @@ namespace { class TextDiag : public Diagnostic { public: TextDiag(llvm::StringRef msg, 
lldb::Severity severity) - : Diagnostic(msg, severity, DiagnosticOrigin::eDiagnosticOriginLLDB, - custom_diag_id) {} + : Diagnostic(DiagnosticOrigin::eDiagnosticOriginLLDB, custom_diag_id, + DiagnosticDetail{{}, severity, msg.str(), msg.str()}) {} }; } // namespace @@ -42,8 +42,8 @@ TEST(DiagnosticManagerTest, AddDiagnostic) { std::string msg = "foo bar has happened"; lldb::Severity severity = lldb::eSeverityError; DiagnosticOrigin origin = DiagnosticOrigin::eDiagnosticOriginLLDB; - auto diag = - std::make_unique(msg, severity, origin, custom_diag_id); + auto diag = std::make_unique( + origin, custom_diag_id, DiagnosticDetail{{}, severity, msg, {}}); mgr.AddDiagnostic(std::move(diag)); EXPECT_EQ(1U, mgr.Diagnostics().size()); const Diagnostic *got = mgr.Diagnostics().front().get(); @@ -72,18 +72,25 @@ TEST(DiagnosticManagerTest, HasFixits) { EXPECT_TRUE(mgr.HasFixIts()); } +static std::string toString(DiagnosticManager &mgr) { + // The error code doesn't really matter since we just convert the + // diagnostics to a string. + auto result = lldb::eExpressionCompleted; + return llvm::toString(mgr.GetAsError(result)); +} + TEST(DiagnosticManagerTest, GetStringNoDiags) { DiagnosticManager mgr; - EXPECT_EQ("", mgr.GetString()); + EXPECT_EQ("", toString(mgr)); std::unique_ptr empty; mgr.AddDiagnostic(std::move(empty)); - EXPECT_EQ("", mgr.GetString()); + EXPECT_EQ("", toString(mgr)); } TEST(DiagnosticManagerTest, GetStringBasic) { DiagnosticManager mgr; mgr.AddDiagnostic(std::make_unique("abc", lldb::eSeverityError)); - EXPECT_EQ("error: abc\n", mgr.GetString()); + EXPECT_EQ("error: abc\n", toString(mgr)); } TEST(DiagnosticManagerTest, GetStringMultiline) { @@ -91,15 +98,15 @@ TEST(DiagnosticManagerTest, GetStringMultiline) { // Multiline diagnostics should only get one severity label. 
mgr.AddDiagnostic(std::make_unique("b\nc", lldb::eSeverityError)); - EXPECT_EQ("error: b\nc\n", mgr.GetString()); + EXPECT_EQ("error: b\nc\n", toString(mgr)); } TEST(DiagnosticManagerTest, GetStringMultipleDiags) { DiagnosticManager mgr; mgr.AddDiagnostic(std::make_unique("abc", lldb::eSeverityError)); - EXPECT_EQ("error: abc\n", mgr.GetString()); + EXPECT_EQ("error: abc\n", toString(mgr)); mgr.AddDiagnostic(std::make_unique("def", lldb::eSeverityError)); - EXPECT_EQ("error: abc\nerror: def\n", mgr.GetString()); + EXPECT_EQ("error: abc\nerror: def\n", toString(mgr)); } TEST(DiagnosticManagerTest, GetStringSeverityLabels) { @@ -110,7 +117,7 @@ TEST(DiagnosticManagerTest, GetStringSeverityLabels) { mgr.AddDiagnostic(std::make_unique("bar", lldb::eSeverityWarning)); // Remarks have no labels. mgr.AddDiagnostic(std::make_unique("baz", lldb::eSeverityInfo)); - EXPECT_EQ("error: foo\nwarning: bar\nbaz\n", mgr.GetString()); + EXPECT_EQ("error: foo\nwarning: bar\nbaz\n", toString(mgr)); } TEST(DiagnosticManagerTest, GetStringPreserveOrder) { @@ -120,7 +127,7 @@ TEST(DiagnosticManagerTest, GetStringPreserveOrder) { mgr.AddDiagnostic(std::make_unique("baz", lldb::eSeverityInfo)); mgr.AddDiagnostic(std::make_unique("bar", lldb::eSeverityWarning)); mgr.AddDiagnostic(std::make_unique("foo", lldb::eSeverityError)); - EXPECT_EQ("baz\nwarning: bar\nerror: foo\n", mgr.GetString()); + EXPECT_EQ("baz\nwarning: bar\nerror: foo\n", toString(mgr)); } TEST(DiagnosticManagerTest, AppendMessageNoDiag) { @@ -139,7 +146,7 @@ TEST(DiagnosticManagerTest, AppendMessageAttachToLastDiag) { // This should append to 'bar' and not to 'foo'. 
mgr.AppendMessageToDiagnostic("message text"); - EXPECT_EQ("error: foo\nerror: bar\nmessage text\n", mgr.GetString()); + EXPECT_EQ("error: foo\nerror: bar\nmessage text\n", toString(mgr)); } TEST(DiagnosticManagerTest, AppendMessageSubsequentDiags) { @@ -150,7 +157,7 @@ TEST(DiagnosticManagerTest, AppendMessageSubsequentDiags) { // Pushing another diag after the message should work fine. mgr.AddDiagnostic(std::make_unique("foo", lldb::eSeverityError)); - EXPECT_EQ("error: bar\nmessage text\nerror: foo\n", mgr.GetString()); + EXPECT_EQ("error: bar\nmessage text\nerror: foo\n", toString(mgr)); } TEST(DiagnosticManagerTest, PutString) { @@ -159,7 +166,7 @@ TEST(DiagnosticManagerTest, PutString) { mgr.PutString(lldb::eSeverityError, "foo"); EXPECT_EQ(1U, mgr.Diagnostics().size()); EXPECT_EQ(eDiagnosticOriginLLDB, mgr.Diagnostics().front()->getKind()); - EXPECT_EQ("error: foo\n", mgr.GetString()); + EXPECT_EQ("error: foo\n", toString(mgr)); } TEST(DiagnosticManagerTest, PutStringMultiple) { @@ -169,7 +176,7 @@ TEST(DiagnosticManagerTest, PutStringMultiple) { mgr.PutString(lldb::eSeverityError, "foo"); mgr.PutString(lldb::eSeverityError, "bar"); EXPECT_EQ(2U, mgr.Diagnostics().size()); - EXPECT_EQ("error: foo\nerror: bar\n", mgr.GetString()); + EXPECT_EQ("error: foo\nerror: bar\n", toString(mgr)); } TEST(DiagnosticManagerTest, PutStringSeverities) { @@ -180,7 +187,7 @@ TEST(DiagnosticManagerTest, PutStringSeverities) { mgr.PutString(lldb::eSeverityError, "foo"); mgr.PutString(lldb::eSeverityWarning, "bar"); EXPECT_EQ(2U, mgr.Diagnostics().size()); - EXPECT_EQ("error: foo\nwarning: bar\n", mgr.GetString()); + EXPECT_EQ("error: foo\nwarning: bar\n", toString(mgr)); } TEST(DiagnosticManagerTest, FixedExpression) { @@ -197,3 +204,13 @@ TEST(DiagnosticManagerTest, FixedExpression) { mgr.SetFixedExpression("bar"); EXPECT_EQ("bar", mgr.GetFixedExpression()); } + +TEST(DiagnosticManagerTest, StatusConversion) { + DiagnosticManager mgr; + mgr.AddDiagnostic(std::make_unique("abc", 
lldb::eSeverityError)); + mgr.AddDiagnostic(std::make_unique("def", lldb::eSeverityWarning)); + Status status = + Status::FromError(mgr.GetAsError(lldb::eExpressionParseError)); + EXPECT_EQ(std::string("error: abc\nwarning: def\n"), + std::string(status.AsCString())); +} diff --git a/lldb/unittests/Interpreter/CMakeLists.txt b/lldb/unittests/Interpreter/CMakeLists.txt index 54cea995084d3..f7d639f50f5bf 100644 --- a/lldb/unittests/Interpreter/CMakeLists.txt +++ b/lldb/unittests/Interpreter/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_unittest(InterpreterTests TestCommandPaths.cpp + TestCommandObjectExpression.cpp TestCompletion.cpp TestOptionArgParser.cpp TestOptions.cpp @@ -8,6 +9,7 @@ add_lldb_unittest(InterpreterTests TestRegexCommand.cpp LINK_LIBS + lldbCommands lldbCore lldbHost lldbTarget diff --git a/lldb/unittests/Interpreter/TestCommandObjectExpression.cpp b/lldb/unittests/Interpreter/TestCommandObjectExpression.cpp new file mode 100644 index 0000000000000..9e3417b542892 --- /dev/null +++ b/lldb/unittests/Interpreter/TestCommandObjectExpression.cpp @@ -0,0 +1,27 @@ +#include "../../source/Commands/DiagnosticRendering.h" +#include "lldb/Utility/StreamString.h" +#include "gtest/gtest.h" + +using namespace lldb_private; +using namespace lldb; +using llvm::StringRef; +namespace { +class ErrorDisplayTest : public ::testing::Test {}; +} // namespace + +static std::string Render(std::vector details) { + StreamString stream; + RenderDiagnosticDetails(stream, 0, true, details); + return stream.GetData(); +} + +TEST_F(ErrorDisplayTest, RenderStatus) { + DiagnosticDetail::SourceLocation inline_loc; + inline_loc.in_user_input = true; + { + std::string result = + Render({DiagnosticDetail{inline_loc, eSeverityError, "foo", ""}}); + ASSERT_TRUE(StringRef(result).contains("error:")); + ASSERT_TRUE(StringRef(result).contains("foo")); + } +} diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index d4601d5f53d07..8f7ab1a376899 100644 --- 
a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -18,7 +18,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/Module.h" #include "llvm/Support/SourceMgr.h" #include #include diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 9e11b13c101d4..38300863f7889 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1757,6 +1757,55 @@ As part of the AMDGPU MC layer, AMDGPU provides the following target specific =================== ================= ======================================================== +Function Resource Usage +----------------------- + +A function's resource usage depends on each of its callees' resource usage. The +expressions used to denote resource usage reflect this by propagating each +callees' equivalent expressions. Said expressions are emitted as symbols by the +compiler when compiling to either assembly or object format and should not be +overwritten or redefined. + +The following describes all emitted function resource usage symbols: + + .. 
table:: Function Resource Usage: + :name: function-usage-table + + ===================================== ========= ========================================= =============================================================================== + Symbol Type Description Example + ===================================== ========= ========================================= =============================================================================== + .num_vgpr Integer Number of VGPRs used by , .set foo.num_vgpr, max(32, bar.num_vgpr, baz.num_vgpr) + worst case of itself and its callees' + VGPR use + .num_agpr Integer Number of AGPRs used by , .set foo.num_agpr, max(35, bar.num_agpr) + worst case of itself and its callees' + AGPR use + .numbered_sgpr Integer Number of SGPRs used by , .set foo.num_sgpr, 21 + worst case of itself and its callees' + SGPR use (without any of the implicitly + used SGPRs) + .private_seg_size Integer Total stack size required for .set foo.private_seg_size, 16+max(bar.private_seg_size, baz.private_seg_size) + , expression is the + locally used stack size + the worst case + callee + .uses_vcc Bool Whether , or any of its .set foo.uses_vcc, or(0, bar.uses_vcc) + callees, uses vcc + .uses_flat_scratch Bool Whether , or any of its .set foo.uses_flat_scratch, 1 + callees, uses flat scratch or not + .has_dyn_sized_stack Bool Whether , or any of its .set foo.has_dyn_sized_stack, 1 + callees, is dynamically sized + .has_recursion Bool Whether , or any of its .set foo.has_recursion, 0 + callees, contains recursion + .has_indirect_call Bool Whether , or any of its .set foo.has_indirect_call, max(0, bar.has_indirect_call) + callees, contains an indirect call + ===================================== ========= ========================================= =============================================================================== + +Futhermore, three symbols are additionally emitted describing the compilation +unit's worst case (i.e, maxima) ``num_vgpr``, 
``num_agpr``, and +``numbered_sgpr`` which may be referenced and used by the aforementioned +symbolic expressions. These three symbols are ``amdgcn.max_num_vgpr``, +``amdgcn.max_num_agpr``, and ``amdgcn.max_num_sgpr``. + .. _amdgpu-elf-code-object: ELF Code Object diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 3a566bbac3623..8b0b05c0ea424 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -127,69 +127,6 @@ Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda`` NVPTX Intrinsics ================ -Address Space Conversion ------------------------- - -'``llvm.nvvm.ptr.*.to.gen``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) - declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) - declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) - declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.*.to.gen``' intrinsics convert a pointer in a non-generic -address space to a generic address space pointer. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid generic address space -pointer. - - -'``llvm.nvvm.ptr.gen.to.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) - declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) - declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.gen.to.*``' intrinsics convert a pointer in the generic -address space to a pointer in the target address space. 
Note that these -intrinsics are only useful if the address space of the target address space of -the pointer is known. It is not legal to use address space conversion -intrinsics to convert a pointer from one non-generic address space to another -non-generic address space. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid pointer in the target -non-generic address space. - - Reading PTX Special Registers ----------------------------- diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index ec5c0e7dbbd65..28dc270ca368d 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -2795,7 +2795,7 @@ void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn); /** * Obtain the intrinsic ID number which matches the given function name. * - * @see llvm::Function::lookupIntrinsicID() + * @see llvm::Intrinsic::lookupIntrinsicID() */ unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen); diff --git a/llvm/include/llvm/Analysis/EHUtils.h b/llvm/include/llvm/Analysis/EHUtils.h index 3ad0878bd64f8..aaf2882d697d1 100644 --- a/llvm/include/llvm/Analysis/EHUtils.h +++ b/llvm/include/llvm/Analysis/EHUtils.h @@ -24,10 +24,8 @@ static void computeEHOnlyBlocks(FunctionT &F, DenseSet &EHBlocks) { DenseMap Statuses; auto GetStatus = [&](BlockT *BB) { - if (Statuses.contains(BB)) - return Statuses[BB]; - else - return Unknown; + auto It = Statuses.find(BB); + return It != Statuses.end() ? It->second : Unknown; }; auto CheckPredecessors = [&](BlockT *BB, Status Stat) { diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 68b860725752d..179a2c38d9d3c 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1125,15 +1125,10 @@ class ScalarEvolution { // Not taken either exactly ConstantMaxNotTaken or zero times bool MaxOrZero = false; - /// A set of predicate guards for this ExitLimit. 
The result is only valid - /// if all of the predicates in \c Predicates evaluate to 'true' at + /// A vector of predicate guards for this ExitLimit. The result is only + /// valid if all of the predicates in \c Predicates evaluate to 'true' at /// run-time. - SmallPtrSet Predicates; - - void addPredicate(const SCEVPredicate *P) { - assert(!isa(P) && "Only add leaf predicates here!"); - Predicates.insert(P); - } + SmallVector Predicates; /// Construct either an exact exit limit from a constant, or an unknown /// one from a SCEVCouldNotCompute. No other types of SCEVs are allowed @@ -1142,12 +1137,11 @@ class ScalarEvolution { ExitLimit(const SCEV *E, const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, - ArrayRef *> - PredSetList = {}); + ArrayRef> PredLists = {}); ExitLimit(const SCEV *E, const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, - const SmallPtrSetImpl &PredSet); + ArrayRef PredList); /// Test whether this ExitLimit contains any computed information, or /// whether it's all SCEVCouldNotCompute values. @@ -1297,7 +1291,7 @@ class ScalarEvolution { /// adding additional predicates to \p Preds as required. const SCEVAddRecExpr *convertSCEVToAddRecWithPredicates( const SCEV *S, const Loop *L, - SmallPtrSetImpl &Preds); + SmallVectorImpl &Preds); /// Compute \p LHS - \p RHS and returns the result as an APInt if it is a /// constant, and std::nullopt if it isn't. 
@@ -1489,12 +1483,13 @@ class ScalarEvolution { const SCEV *ExactNotTaken; const SCEV *ConstantMaxNotTaken; const SCEV *SymbolicMaxNotTaken; - SmallPtrSet Predicates; + SmallVector Predicates; - explicit ExitNotTakenInfo( - PoisoningVH ExitingBlock, const SCEV *ExactNotTaken, - const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, - const SmallPtrSet &Predicates) + explicit ExitNotTakenInfo(PoisoningVH ExitingBlock, + const SCEV *ExactNotTaken, + const SCEV *ConstantMaxNotTaken, + const SCEV *SymbolicMaxNotTaken, + ArrayRef Predicates) : ExitingBlock(ExitingBlock), ExactNotTaken(ExactNotTaken), ConstantMaxNotTaken(ConstantMaxNotTaken), SymbolicMaxNotTaken(SymbolicMaxNotTaken), Predicates(Predicates) {} diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index cb62c86b502c1..c36a346c1b2e0 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1555,7 +1555,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Assume that target intrinsics are cheap. Intrinsic::ID IID = ICA.getID(); - if (Function::isTargetIntrinsic(IID)) + if (Intrinsic::isTargetIntrinsic(IID)) return TargetTransformInfo::TCC_Basic; if (ICA.isTypeBasedOnly()) diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h index 3cbc35400181d..f3c4cc8d0511d 100644 --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -275,7 +275,7 @@ class FastISel { /// This is a wrapper around getRegForValue that also takes care of /// truncating or sign-extending the given getelementptr index value. - Register getRegForGEPIndex(const Value *Idx); + Register getRegForGEPIndex(MVT PtrVT, const Value *Idx); /// We're checking to see if we can fold \p LI into \p FoldInst. 
Note /// that we could have a sequence where multiple LLVM IR instructions are diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h index b749d9ef588ab..0283fdded7f43 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h @@ -318,11 +318,8 @@ class LegacyLegalizerInfo { const unsigned AddressSpace, const SizeAndActionsVec &SizeAndActions) { const unsigned OpcodeIdx = Opcode - FirstOp; - if (AddrSpace2PointerActions[OpcodeIdx].find(AddressSpace) == - AddrSpace2PointerActions[OpcodeIdx].end()) - AddrSpace2PointerActions[OpcodeIdx][AddressSpace] = {{}}; SmallVector &Actions = - AddrSpace2PointerActions[OpcodeIdx].find(AddressSpace)->second; + AddrSpace2PointerActions[OpcodeIdx][AddressSpace]; setActions(TypeIndex, Actions, SizeAndActions); } @@ -347,11 +344,8 @@ class LegacyLegalizerInfo { const unsigned ElementSize, const SizeAndActionsVec &SizeAndActions) { const unsigned OpcodeIdx = Opcode - FirstOp; - if (NumElements2Actions[OpcodeIdx].find(ElementSize) == - NumElements2Actions[OpcodeIdx].end()) - NumElements2Actions[OpcodeIdx][ElementSize] = {{}}; SmallVector &Actions = - NumElements2Actions[OpcodeIdx].find(ElementSize)->second; + NumElements2Actions[OpcodeIdx][ElementSize]; setActions(TypeIndex, Actions, SizeAndActions); } diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 2367d8d04787d..7a2c23c13a3ce 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -184,6 +184,8 @@ class MachineRegisterInfo { TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg); } + const MachineFunction &getMF() const { return *MF; } + //===--------------------------------------------------------------------===// // Function State 
//===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index ec652f448f0f6..d6a1f064ec0a5 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -65,6 +65,50 @@ struct PointerLikeTypeTraits { } }; +// The storage for all reaching definitions. +class MBBReachingDefsInfo { +public: + void init(unsigned NumBlockIDs) { AllReachingDefs.resize(NumBlockIDs); } + + unsigned numBlockIDs() const { return AllReachingDefs.size(); } + + void startBasicBlock(unsigned MBBNumber, unsigned NumRegUnits) { + AllReachingDefs[MBBNumber].resize(NumRegUnits); + } + + void append(unsigned MBBNumber, unsigned Unit, int Def) { + AllReachingDefs[MBBNumber][Unit].push_back(Def); + } + + void prepend(unsigned MBBNumber, unsigned Unit, int Def) { + auto &Defs = AllReachingDefs[MBBNumber][Unit]; + Defs.insert(Defs.begin(), Def); + } + + void replaceFront(unsigned MBBNumber, unsigned Unit, int Def) { + assert(!AllReachingDefs[MBBNumber][Unit].empty()); + *AllReachingDefs[MBBNumber][Unit].begin() = Def; + } + + void clear() { AllReachingDefs.clear(); } + + ArrayRef defs(unsigned MBBNumber, unsigned Unit) const { + if (AllReachingDefs[MBBNumber].empty()) + // Block IDs are not necessarily dense. + return ArrayRef(); + return AllReachingDefs[MBBNumber][Unit]; + } + +private: + /// All reaching defs of a given RegUnit for a given MBB. + using MBBRegUnitDefs = TinyPtrVector; + /// All reaching defs of all reg units for a given MBB + using MBBDefsInfo = std::vector; + + /// All reaching defs of all reg units for all MBBs + SmallVector AllReachingDefs; +}; + /// This class provides the reaching def analysis. class ReachingDefAnalysis : public MachineFunctionPass { private: @@ -93,12 +137,6 @@ class ReachingDefAnalysis : public MachineFunctionPass { /// their basic blocks. 
DenseMap InstIds; - /// All reaching defs of a given RegUnit for a given MBB. - using MBBRegUnitDefs = TinyPtrVector; - /// All reaching defs of all reg units for a given MBB - using MBBDefsInfo = std::vector; - /// All reaching defs of all reg units for a all MBBs - using MBBReachingDefsInfo = SmallVector; MBBReachingDefsInfo MBBReachingDefs; /// Default values are 'nothing happened a long time ago'. diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h index b00f68c3be84e..9fb1b3f80e2ff 100644 --- a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h +++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h @@ -42,15 +42,15 @@ class CachedPathResolver { // If the ParentPath has not yet been resolved, resolve and cache it for // future look-ups. - if (!ResolvedPaths.count(ParentPath)) { + auto [It, Inserted] = ResolvedPaths.try_emplace(ParentPath); + if (Inserted) { SmallString<256> RealPath; sys::fs::real_path(ParentPath, RealPath); - ResolvedPaths.insert( - {ParentPath, std::string(RealPath.c_str(), RealPath.size())}); + It->second = std::string(RealPath); } // Join the file name again with the resolved path. 
- SmallString<256> ResolvedPath(ResolvedPaths[ParentPath]); + SmallString<256> ResolvedPath(It->second); sys::path::append(ResolvedPath, FileName); return StringPool.internString(ResolvedPath); } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h index 2c5b110492696..40b85e3272010 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h @@ -21,6 +21,7 @@ #include #include +#include #include namespace llvm { @@ -31,25 +32,37 @@ struct ELFPerObjectSectionsToRegister { ExecutorAddrRange ThreadDataSection; }; -struct ELFNixJITDylibInitializers { - using SectionList = std::vector; +using ELFNixJITDylibDepInfo = std::vector; +using ELFNixJITDylibDepInfoMap = + std::vector>; - ELFNixJITDylibInitializers(std::string Name, ExecutorAddr DSOHandleAddress) - : Name(std::move(Name)), DSOHandleAddress(std::move(DSOHandleAddress)) {} - - std::string Name; - ExecutorAddr DSOHandleAddress; - - StringMap InitSections; +struct RuntimeFunction { + RuntimeFunction(SymbolStringPtr Name) : Name(std::move(Name)) {} + SymbolStringPtr Name; + ExecutorAddr Addr; }; -class ELFNixJITDylibDeinitializers {}; +struct FunctionPairKeyHash { + std::size_t + operator()(const std::pair &key) const { + return std::hash()(key.first->Addr.toPtr()) ^ + std::hash()(key.second->Addr.toPtr()); + } +}; -using ELFNixJITDylibInitializerSequence = - std::vector; +struct FunctionPairKeyEqual { + std::size_t + operator()(const std::pair &lhs, + const std::pair &rhs) const { + return lhs.first == rhs.first && lhs.second == rhs.second; + } +}; -using ELFNixJITDylibDeinitializerSequence = - std::vector; +using DeferredRuntimeFnMap = std::unordered_map< + std::pair, + SmallVector>, + FunctionPairKeyHash, FunctionPairKeyEqual>; /// Mediates between ELFNix initialization and ExecutionSession state. 
class ELFNixPlatform : public Platform { @@ -126,6 +139,23 @@ class ELFNixPlatform : public Platform { standardRuntimeUtilityAliases(); private: + // Data needed for bootstrap only. + struct BootstrapInfo { + std::mutex Mutex; + std::condition_variable CV; + size_t ActiveGraphs = 0; + ExecutorAddr ELFNixHeaderAddr; + DeferredRuntimeFnMap DeferredRTFnMap; + + void addArgumentsToRTFnMap( + RuntimeFunction *func1, RuntimeFunction *func2, + const shared::WrapperFunctionCall::ArgDataBufferType &arg1, + const shared::WrapperFunctionCall::ArgDataBufferType &arg2) { + auto &argList = DeferredRTFnMap[std::make_pair(func1, func2)]; + argList.emplace_back(arg1, arg2); + } + }; + // The ELFNixPlatformPlugin scans/modifies LinkGraphs to support ELF // platform features including initializers, exceptions, TLV, and language // runtime registration. @@ -151,19 +181,22 @@ class ELFNixPlatform : public Platform { ResourceKey SrcKey) override {} private: - void addInitializerSupportPasses(MaterializationResponsibility &MR, - jitlink::PassConfiguration &Config); + Error bootstrapPipelineStart(jitlink::LinkGraph &G); + Error bootstrapPipelineRecordRuntimeFunctions(jitlink::LinkGraph &G); + Error bootstrapPipelineEnd(jitlink::LinkGraph &G); void addDSOHandleSupportPasses(MaterializationResponsibility &MR, jitlink::PassConfiguration &Config); void addEHAndTLVSupportPasses(MaterializationResponsibility &MR, - jitlink::PassConfiguration &Config); + jitlink::PassConfiguration &Config, + bool IsBootstrapping); Error preserveInitSections(jitlink::LinkGraph &G, MaterializationResponsibility &MR); - Error registerInitSections(jitlink::LinkGraph &G, JITDylib &JD); + Error registerInitSections(jitlink::LinkGraph &G, JITDylib &JD, + bool IsBootstrapping); Error fixTLVSectionsAndEdges(jitlink::LinkGraph &G, JITDylib &JD); @@ -171,11 +204,8 @@ class ELFNixPlatform : public Platform { ELFNixPlatform &MP; }; - using SendInitializerSequenceFn = - unique_function)>; - - using SendDeinitializerSequenceFn 
= - unique_function)>; + using PushInitializersSendResultFn = + unique_function)>; using SendSymbolAddressFn = unique_function)>; @@ -189,53 +219,58 @@ class ELFNixPlatform : public Platform { // Associate ELFNixPlatform JIT-side runtime support functions with handlers. Error associateRuntimeSupportFunctions(JITDylib &PlatformJD); - void getInitializersBuildSequencePhase(SendInitializerSequenceFn SendResult, - JITDylib &JD, - std::vector DFSLinkOrder); + void pushInitializersLoop(PushInitializersSendResultFn SendResult, + JITDylibSP JD); - void getInitializersLookupPhase(SendInitializerSequenceFn SendResult, - JITDylib &JD); - - void rt_getInitializers(SendInitializerSequenceFn SendResult, - StringRef JDName); - - void rt_getDeinitializers(SendDeinitializerSequenceFn SendResult, - ExecutorAddr Handle); + void rt_recordInitializers(PushInitializersSendResultFn SendResult, + ExecutorAddr JDHeader); void rt_lookupSymbol(SendSymbolAddressFn SendResult, ExecutorAddr Handle, StringRef SymbolName); - // Records the addresses of runtime symbols used by the platform. 
- Error bootstrapELFNixRuntime(JITDylib &PlatformJD); - - Error registerInitInfo(JITDylib &JD, - ArrayRef InitSections); - - Error registerPerObjectSections(const ELFPerObjectSectionsToRegister &POSR); + Error registerPerObjectSections(jitlink::LinkGraph &G, + const ELFPerObjectSectionsToRegister &POSR, + bool IsBootstrapping); Expected createPThreadKey(); ExecutionSession &ES; + JITDylib &PlatformJD; ObjectLinkingLayer &ObjLinkingLayer; SymbolStringPtr DSOHandleSymbol; - std::atomic RuntimeBootstrapped{false}; - ExecutorAddr orc_rt_elfnix_platform_bootstrap; - ExecutorAddr orc_rt_elfnix_platform_shutdown; - ExecutorAddr orc_rt_elfnix_register_object_sections; - ExecutorAddr orc_rt_elfnix_create_pthread_key; + RuntimeFunction PlatformBootstrap{ + ES.intern("__orc_rt_elfnix_platform_bootstrap")}; + RuntimeFunction PlatformShutdown{ + ES.intern("__orc_rt_elfnix_platform_shutdown")}; + RuntimeFunction RegisterJITDylib{ + ES.intern("__orc_rt_elfnix_register_jitdylib")}; + RuntimeFunction DeregisterJITDylib{ + ES.intern("__orc_rt_elfnix_deregister_jitdylib")}; + RuntimeFunction RegisterObjectSections{ + ES.intern("__orc_rt_elfnix_register_object_sections")}; + RuntimeFunction DeregisterObjectSections{ + ES.intern("__orc_rt_elfnix_deregister_object_sections")}; + RuntimeFunction RegisterInitSections{ + ES.intern("__orc_rt_elfnix_register_init_sections")}; + RuntimeFunction DeregisterInitSections{ + ES.intern("__orc_rt_elfnix_deregister_init_sections")}; + RuntimeFunction CreatePThreadKey{ + ES.intern("__orc_rt_elfnix_create_pthread_key")}; DenseMap RegisteredInitSymbols; // InitSeqs gets its own mutex to avoid locking the whole session when // aggregating data from the jitlink. 
std::mutex PlatformMutex; - DenseMap InitSeqs; std::vector BootstrapPOSRs; DenseMap HandleAddrToJITDylib; + DenseMap JITDylibToHandleAddr; DenseMap JITDylibToPThreadKey; + + std::atomic Bootstrap; }; namespace shared { @@ -266,63 +301,11 @@ class SPSSerializationTraits>; - -using SPSELFNixJITDylibInitializers = - SPSTuple; - -using SPSELFNixJITDylibInitializerSequence = - SPSSequence; - -/// Serialization traits for ELFNixJITDylibInitializers. -template <> -class SPSSerializationTraits { -public: - static size_t size(const ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::size( - MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } - - static bool serialize(SPSOutputBuffer &OB, - const ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::serialize( - OB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } - - static bool deserialize(SPSInputBuffer &IB, - ELFNixJITDylibInitializers &MOJDIs) { - return SPSELFNixJITDylibInitializers::AsArgList::deserialize( - IB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections); - } -}; - -using SPSELFJITDylibDeinitializers = SPSEmpty; - -using SPSELFJITDylibDeinitializerSequence = - SPSSequence; - -template <> -class SPSSerializationTraits { -public: - static size_t size(const ELFNixJITDylibDeinitializers &MOJDDs) { return 0; } - - static bool serialize(SPSOutputBuffer &OB, - const ELFNixJITDylibDeinitializers &MOJDDs) { - return true; - } - - static bool deserialize(SPSInputBuffer &IB, - ELFNixJITDylibDeinitializers &MOJDDs) { - MOJDDs = ELFNixJITDylibDeinitializers(); - return true; - } -}; +using SPSELFNixJITDylibDepInfoMap = + SPSSequence>>; } // end namespace shared } // end namespace orc } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H +#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H \ No newline at end of file diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h 
index a4d55285380b0..fec876eaafc86 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -255,10 +255,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { /// returns Intrinsic::not_intrinsic! bool isIntrinsic() const { return HasLLVMReservedName; } - /// isTargetIntrinsic - Returns true if IID is an intrinsic specific to a - /// certain target. If it is a generic intrinsic false is returned. - static bool isTargetIntrinsic(Intrinsic::ID IID); - /// isTargetIntrinsic - Returns true if this function is an intrinsic and the /// intrinsic is specific to a certain target. If this is not an intrinsic /// or a generic intrinsic, false is returned. @@ -269,8 +265,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { /// getIntrinsicID() returns Intrinsic::not_intrinsic. bool isConstrainedFPIntrinsic() const; - static Intrinsic::ID lookupIntrinsicID(StringRef Name); - /// Update internal caches that depend on the function name (such as the /// intrinsic ID and libcall cache). /// Note, this method does not need to be called directly, as it is called diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 0ec7e47812af4..95df3f2cd654a 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -78,6 +78,12 @@ namespace Intrinsic { /// Returns true if the intrinsic can be overloaded. bool isOverloaded(ID id); + /// isTargetIntrinsic - Returns true if IID is an intrinsic specific to a + /// certain target. If it is a generic intrinsic false is returned. + bool isTargetIntrinsic(ID IID); + + ID lookupIntrinsicID(StringRef Name); + /// Return the attributes for an intrinsic. 
AttributeList getAttributes(LLVMContext &C, ID id); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 2d8ce66f53ba8..eda2f69dd230c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3769,6 +3769,12 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg>, ImmArg>, IntrReadMem]>; + + def int_aarch64_sme_luti4_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + // // Register scaling @@ -3794,6 +3800,7 @@ let TargetPrefix = "aarch64" in { [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index aa5294f5f9c90..7b8ffe417fccd 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -30,10 +30,18 @@ // * llvm.nvvm.max.ui --> select(x ule y, x, y) // * llvm.nvvm.max.ull --> ibid. // * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32 -// * llvm.nvvm.bitcast.f2i --> bitcast -// * llvm.nvvm.bitcast.i2f --> ibid. -// * llvm.nvvm.bitcast.d2ll --> ibid. -// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.bitcast.f2i --> bitcast +// * llvm.nvvm.bitcast.i2f --> ibid. +// * llvm.nvvm.bitcast.d2ll --> ibid. +// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.ptr.gen.to.global --> addrspacecast +// * llvm.nvvm.ptr.gen.to.shared --> ibid. +// * llvm.nvvm.ptr.gen.to.constant --> ibid. +// * llvm.nvvm.ptr.gen.to.local --> ibid. 
+// * llvm.nvvm.ptr.global.to.gen --> ibid. +// * llvm.nvvm.ptr.shared.to.gen --> ibid. +// * llvm.nvvm.ptr.constant.to.gen --> ibid. +// * llvm.nvvm.ptr.local.to.gen --> ibid. def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -1602,40 +1610,6 @@ def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>], "llvm.nvvm.ldg.global.p">; -// Use for generic pointers -// - These intrinsics are used to convert address spaces. -// - The input pointer and output pointer must have the same type, except for -// the address-space. (This restriction is not enforced here as there is -// currently no way to describe it). -// - This complements the llvm bitcast, which can be used to cast one type -// of pointer to another type of pointer, while the address space remains -// the same. -def int_nvvm_ptr_local_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.local.to.gen">; -def int_nvvm_ptr_shared_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.shared.to.gen">; -def int_nvvm_ptr_global_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.global.to.gen">; -def int_nvvm_ptr_constant_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.constant.to.gen">; - -def int_nvvm_ptr_gen_to_global: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.global">; -def int_nvvm_ptr_gen_to_shared: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.shared">; -def int_nvvm_ptr_gen_to_local: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, 
IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.local">; -def int_nvvm_ptr_gen_to_constant: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.constant">; - // Used in nvvm internally to help address space opt and ptx code generation // This is for params that are passed to kernel functions by pointer by-val. def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 80eedc52bc324..fa9c744294a66 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -171,7 +171,7 @@ class PreservedCFGCheckerInstrumentation { FunctionAnalysisManager::Invalidator &); }; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS SmallVector PassStack; #endif diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h index bc0e3d8849237..e35d23be6619f 100644 --- a/llvm/include/llvm/SandboxIR/Constant.h +++ b/llvm/include/llvm/SandboxIR/Constant.h @@ -17,6 +17,8 @@ #include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/SandboxIR/Argument.h" +#include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Context.h" #include "llvm/SandboxIR/Type.h" #include "llvm/SandboxIR/User.h" diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index 092b791bc2acb..acfffd9ccd4a7 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -25,12 +25,9 @@ class Context { LLVMContext &LLVMCtx; friend class Type; // For LLVMCtx. friend class PointerType; // For LLVMCtx. - friend class CmpInst; // For LLVMCtx. TODO: cleanup when sandboxir::VectorType - // is complete - friend class IntegerType; // For LLVMCtx. - friend class StructType; // For LLVMCtx. 
- friend class ::llvm::TargetExtType; // For LLVMCtx. - friend class Region; // For LLVMCtx. + friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. + friend class Region; // For LLVMCtx. Tracker IRTracker; @@ -159,6 +156,7 @@ class Context { public: Context(LLVMContext &LLVMCtx); + ~Context(); Tracker &getTracker() { return IRTracker; } /// Convenience function for `getTracker().save()` diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/Instruction.h similarity index 94% rename from llvm/include/llvm/SandboxIR/SandboxIR.h rename to llvm/include/llvm/SandboxIR/Instruction.h index 2376450d19011..f5f5bb5c4443c 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -1,200 +1,26 @@ -//===- SandboxIR.h ----------------------------------------------*- C++ -*-===// +//===- Instruction.h --------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// Sandbox IR is a lightweight overlay transactional IR on top of LLVM IR. -// Features: -// - You can save/rollback the state of the IR at any time. -// - Any changes made to Sandbox IR will automatically update the underlying -// LLVM IR so both IRs are always in sync. -// - Feels like LLVM IR, similar API. 
-// -// SandboxIR forms a class hierarchy that resembles that of LLVM IR -// but is in the `sandboxir` namespace: -// -// namespace sandboxir { -// -// Value -+- Argument -// | -// +- BasicBlock -// | -// +- User ------+- Constant ------ Function -// | -// +- Instruction -+- BinaryOperator -// | -// +- BranchInst -// | -// +- CastInst --------+- AddrSpaceCastInst -// | | -// | +- BitCastInst -// | | -// | +- FPExtInst -// | | -// | +- FPToSIInst -// | | -// | +- FPToUIInst -// | | -// | +- FPTruncInst -// | | -// | +- IntToPtrInst -// | | -// | +- PtrToIntInst -// | | -// | +- SExtInst -// | | -// | +- SIToFPInst -// | | -// | +- TruncInst -// | | -// | +- UIToFPInst -// | | -// | +- ZExtInst -// | -// +- CallBase --------+- CallBrInst -// | | -// | +- CallInst -// | | -// | +- InvokeInst -// | -// +- CmpInst ---------+- ICmpInst -// | | -// | +- FCmpInst -// | -// +- ExtractElementInst -// | -// +- GetElementPtrInst -// | -// +- InsertElementInst -// | -// +- OpaqueInst -// | -// +- PHINode -// | -// +- ReturnInst -// | -// +- SelectInst -// | -// +- ShuffleVectorInst -// | -// +- ExtractValueInst -// | -// +- InsertValueInst -// | -// +- StoreInst -// | -// +- UnaryInstruction -+- LoadInst -// | | -// | +- CastInst -// | -// +- UnaryOperator -// | -// +- UnreachableInst -// -// Use -// -// } // namespace sandboxir -// -#ifndef LLVM_SANDBOXIR_SANDBOXIR_H -#define LLVM_SANDBOXIR_SANDBOXIR_H +#ifndef LLVM_SANDBOXIR_INSTRUCTION_H +#define LLVM_SANDBOXIR_INSTRUCTION_H -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/SandboxIR/Argument.h" #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Constant.h" -#include "llvm/SandboxIR/Context.h" -#include "llvm/SandboxIR/Module.h" -#include "llvm/SandboxIR/Tracker.h" -#include 
"llvm/SandboxIR/Type.h" -#include "llvm/SandboxIR/Use.h" #include "llvm/SandboxIR/User.h" -#include "llvm/SandboxIR/Value.h" -#include "llvm/Support/raw_ostream.h" -#include - -namespace llvm { - -namespace sandboxir { - -class BasicBlock; -class ConstantInt; -class ConstantFP; -class ConstantAggregateZero; -class ConstantPointerNull; -class PoisonValue; -class BlockAddress; -class DSOLocalEquivalent; -class ConstantTokenNone; -class GlobalValue; -class GlobalObject; -class GlobalIFunc; -class GlobalVariable; -class GlobalAlias; -class NoCFIValue; -class ConstantPtrAuth; -class ConstantExpr; -class Context; -class Function; -class Module; -class Instruction; -class VAArgInst; -class FreezeInst; -class FenceInst; -class SelectInst; -class ExtractElementInst; -class InsertElementInst; -class ShuffleVectorInst; -class ExtractValueInst; -class InsertValueInst; -class BranchInst; -class UnaryInstruction; -class LoadInst; -class ReturnInst; -class StoreInst; -class User; -class UnreachableInst; -class Value; -class CallBase; -class CallInst; -class InvokeInst; -class CallBrInst; -class LandingPadInst; -class FuncletPadInst; -class CatchPadInst; -class CleanupPadInst; -class CatchReturnInst; -class CleanupReturnInst; -class GetElementPtrInst; -class CastInst; -class PossiblyNonNegInst; -class PtrToIntInst; -class BitCastInst; -class AllocaInst; -class ResumeInst; -class CatchSwitchInst; -class SwitchInst; -class UnaryOperator; -class BinaryOperator; -class PossiblyDisjointInst; -class AtomicRMWInst; -class AtomicCmpXchgInst; -class CmpInst; -class ICmpInst; -class FCmpInst; + +namespace llvm::sandboxir { /// A sandboxir::User with operands, opcode and linked with previous/next /// instructions in an instruction list. 
-class Instruction : public sandboxir::User { +class Instruction : public User { public: enum class Opcode { #define OP(OPC) OPC, @@ -206,7 +32,7 @@ class Instruction : public sandboxir::User { protected: Instruction(ClassID ID, Opcode Opc, llvm::Instruction *I, sandboxir::Context &SBCtx) - : sandboxir::User(ID, I, SBCtx), Opc(Opc) {} + : User(ID, I, SBCtx), Opc(Opc) {} Opcode Opc; @@ -2910,7 +2736,6 @@ class OpaqueInst : public SingleLLVMInstructionImpl { } }; -} // namespace sandboxir -} // namespace llvm +} // namespace llvm::sandboxir -#endif // LLVM_SANDBOXIR_SANDBOXIR_H +#endif // LLVM_SANDBOXIR_INSTRUCTION_H diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index 884f1324df782..67411f3fb741d 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -13,7 +13,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/raw_ostream.h" namespace llvm::sandboxir { diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 829c9f3c72125..94ea4652c72c8 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -31,6 +31,7 @@ class IntegerType; class FunctionType; class ArrayType; class StructType; +class TargetExtType; class Module; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; diff --git a/llvm/include/llvm/SandboxIR/Utils.h b/llvm/include/llvm/SandboxIR/Utils.h index 4e8a175f54705..17fc837f555b8 100644 --- a/llvm/include/llvm/SandboxIR/Utils.h +++ b/llvm/include/llvm/SandboxIR/Utils.h @@ -12,8 +12,12 @@ #ifndef LLVM_SANDBOXIR_UTILS_H #define LLVM_SANDBOXIR_UTILS_H +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include 
"llvm/Analysis/ValueTracking.h" +#include "llvm/SandboxIR/Instruction.h" +#include namespace llvm::sandboxir { @@ -57,6 +61,38 @@ class Utils { memoryLocationGetOrNone(const Instruction *I) { return llvm::MemoryLocation::getOrNone(cast(I->Val)); } + + /// \Returns the gap between the memory locations accessed by \p I0 and + /// \p I1 in bytes. + template + static std::optional + getPointerDiffInBytes(LoadOrStoreT *I0, LoadOrStoreT *I1, ScalarEvolution &SE, + const DataLayout &DL) { + static_assert(std::is_same_v || + std::is_same_v, + "Expected sandboxir::Load or sandboxir::Store!"); + llvm::Value *Opnd0 = I0->getPointerOperand()->Val; + llvm::Value *Opnd1 = I1->getPointerOperand()->Val; + llvm::Value *Ptr0 = getUnderlyingObject(Opnd0); + llvm::Value *Ptr1 = getUnderlyingObject(Opnd1); + if (Ptr0 != Ptr1) + return false; + llvm::Type *ElemTy = llvm::Type::getInt8Ty(SE.getContext()); + return getPointersDiff(ElemTy, Opnd0, ElemTy, Opnd1, DL, SE, + /*StrictCheck=*/false, /*CheckType=*/false); + } + + /// \Returns true if \p I0 accesses a memory location lower than \p I1. + /// Returns false if the difference cannot be determined, if the memory + /// locations are equal, or if I1 accesses a memory location greater than I0. + template + static bool atLowerAddress(LoadOrStoreT *I0, LoadOrStoreT *I1, + ScalarEvolution &SE, const DataLayout &DL) { + auto Diff = getPointerDiffInBytes(I0, I1, SE, DL); + if (!Diff) + return false; + return *Diff > 0; + } }; } // namespace llvm::sandboxir diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 49bd9be82b0df..e7d516f38156c 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -22,6 +22,11 @@ namespace llvm::sandboxir { class Context; class FuncletPadInst; class Type; +class GlobalValue; +class GlobalObject; +class Module; +class UnaryInstruction; +class CmpInst; /// Iterator for the `Use` edges of a Value's users. 
/// \Returns a `Use` when dereferenced. diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h index 2e21bdc9fce2d..9aab5ec60f4a2 100644 --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -640,7 +640,7 @@ struct SemiNCAInfo { Bucket; SmallDenseSet Visited; SmallVector Affected; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS SmallVector VisitedUnaffected; #endif }; @@ -915,7 +915,7 @@ struct SemiNCAInfo { LLVM_DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> " << BlockNamePrinter(To) << "\n"); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Ensure that the edge was in fact deleted from the CFG before informing // the DomTree about it. // The check is O(N), so run it only in debug configuration. diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 921fe94553951..b32f07e6427e8 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1341,6 +1341,9 @@ struct InformationCache { const ArrayRef getIndirectlyCallableFunctions(Attributor &A) const; + /// Return the flat address space if the associated target has. + std::optional getFlatAddressSpace() const; + private: struct FunctionInfo { ~FunctionInfo(); @@ -6267,11 +6270,12 @@ struct AAAddressSpace : public StateWrapper { return (AA->getIdAddr() == &ID); } - // No address space which indicates the associated value is dead. - static const uint32_t NoAddressSpace = ~0U; - /// Unique ID (due to the unique address) static const char ID; + +protected: + // Invalid address space which indicates the associated value is dead. 
+ static const uint32_t InvalidAddressSpace = ~0U; }; struct AAAllocationInfo : public StateWrapper { diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h index db479f55d9b03..3858be05c61fa 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -256,7 +256,7 @@ class LPMUpdater { } void setParentLoop(Loop *L) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS ParentL = L; #endif } @@ -347,7 +347,7 @@ class LPMUpdater { const bool LoopNestMode; bool LoopNestChanged; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // In debug builds we also track the parent loop to implement asserts even in // the face of loop deletion. Loop *ParentL; diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h index 8ed03d7f3ddbf..e5f78ac228683 100644 --- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h +++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h @@ -26,6 +26,7 @@ class AssumptionCache; class CallBase; class CallInst; class DominatorTree; +class EarliestEscapeInfo; class Function; class Instruction; class LoadInst; @@ -48,6 +49,7 @@ class MemCpyOptPass : public PassInfoMixin { PostDominatorTree *PDT = nullptr; MemorySSA *MSSA = nullptr; MemorySSAUpdater *MSSAU = nullptr; + EarliestEscapeInfo *EEI = nullptr; public: MemCpyOptPass() = default; diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index a761859465210..4b3d6fbed8362 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -224,9 +224,9 @@ bool promoteLoopAccessesToScalars( bool AllowSpeculation, bool HasReadsOutsideSet); /// Does a BFS from a given node to all of its children inside a given loop. 
-/// The returned vector of nodes includes the starting point. -SmallVector collectChildrenInLoop(DomTreeNode *N, - const Loop *CurLoop); +/// The returned vector of basic blocks includes the starting point. +SmallVector +collectChildrenInLoop(DominatorTree *DT, DomTreeNode *N, const Loop *CurLoop); /// Returns the instructions that use values defined in the loop. SmallVector findDefsUsedOutsideOfLoop(Loop *L); diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h index b4ea1ad840f9d..7231e45fe8eb7 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -247,9 +247,10 @@ FlowFunction SampleProfileInference::createFlowFunction( // Create FlowBlocks for (const auto *BB : BasicBlocks) { FlowBlock Block; - if (SampleBlockWeights.contains(BB)) { + auto It = SampleBlockWeights.find(BB); + if (It != SampleBlockWeights.end()) { Block.HasUnknownWeight = false; - Block.Weight = SampleBlockWeights[BB]; + Block.Weight = It->second; } else { Block.HasUnknownWeight = true; Block.Weight = 0; diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 468b50092efcf..62c1e15a9a60e 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -167,7 +167,7 @@ class SCEVExpander : public SCEVVisitor { /// consistent when instructions are moved. 
SmallVector InsertPointGuards; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS const char *DebugType; #endif @@ -183,7 +183,7 @@ class SCEVExpander : public SCEVVisitor { Builder(se.getContext(), InstSimplifyFolder(DL), IRBuilderCallbackInserter( [this](Instruction *I) { rememberInstruction(I); })) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS DebugType = ""; #endif } @@ -193,7 +193,7 @@ class SCEVExpander : public SCEVVisitor { assert(InsertPointGuards.empty()); } -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS void setDebugType(const char *s) { DebugType = s; } #endif diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index 5437853c366ae..7f6e6d11e5f53 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -24,7 +24,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h" namespace llvm::sandboxir { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h index 5c40d1eb28c7a..d088c6c556f3a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h @@ -20,7 +20,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_INSTRINTERVAL_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_INSTRINTERVAL_H -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/ADT/ArrayRef.h" #include namespace llvm::sandboxir { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h 
b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index 78c1c0e4c0464..50fa56c5b2194 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -12,11 +12,12 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_LEGALITY_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_LEGALITY_H -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/ADT/ArrayRef.h" namespace llvm::sandboxir { class LegalityAnalysis; +class Value; enum class LegalityResultID { Widen, ///> Vectorize by combining scalars to a vector. diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 99582e3e0e023..a2108f07c28e5 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -13,8 +13,8 @@ #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_BOTTOMUPVEC_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" -#include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index d90bb213f4208..20cdbb6320322 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -892,7 +892,10 @@ bool llvm::isWritableObject(const Value *Object, return true; if (auto *A = dyn_cast(Object)) { - if (A->hasAttribute(Attribute::Writable)) { + // Also require noalias, otherwise writability at function entry cannot be + // generalized to writability at other program points, even if the pointer + // does not escape. 
+ if (A->hasAttribute(Attribute::Writable) && A->hasNoAliasAttr()) { ExplicitlyDereferenceableOnly = true; return true; } diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 4b65fa0ae41b2..d2c329ba748e5 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -504,8 +504,9 @@ class CallAnalyzer : public InstVisitor { InlineResult analyze(); std::optional getSimplifiedValue(Instruction *I) { - if (SimplifiedValues.contains(I)) - return SimplifiedValues[I]; + auto It = SimplifiedValues.find(I); + if (It != SimplifiedValues.end()) + return It->second; return std::nullopt; } @@ -1129,8 +1130,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { void print(raw_ostream &OS); std::optional getCostDetails(const Instruction *I) { - if (InstructionCostDetailMap.contains(I)) - return InstructionCostDetailMap[I]; + auto It = InstructionCostDetailMap.find(I); + if (It != InstructionCostDetailMap.end()) + return It->second; return std::nullopt; } diff --git a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp index 2ca02eb174171..6db5737ef4268 100644 --- a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp +++ b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp @@ -114,7 +114,7 @@ std::unique_ptr ReplayInlineAdvisor::getAdviceImpl(CallBase &CB) { // Replay decision, if it has one auto Iter = InlineSitesFromRemarks.find(Combined); if (Iter != InlineSitesFromRemarks.end()) { - if (InlineSitesFromRemarks[Combined]) { + if (Iter->second) { LLVM_DEBUG(dbgs() << "Replay Inliner: Inlined " << Callee << " @ " << CallSiteLoc << "\n"); return std::make_unique( diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 09e5c080c19cf..c939270ed39a6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8693,12 +8693,12 @@ bool ScalarEvolution::BackedgeTakenInfo::isConstantMaxOrZero( } 
ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) - : ExitLimit(E, E, E, false, {}) {} + : ExitLimit(E, E, E, false) {} ScalarEvolution::ExitLimit::ExitLimit( const SCEV *E, const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, - ArrayRef *> PredSetList) + ArrayRef> PredLists) : ExactNotTaken(E), ConstantMaxNotTaken(ConstantMaxNotTaken), SymbolicMaxNotTaken(SymbolicMaxNotTaken), MaxOrZero(MaxOrZero) { // If we prove the max count is zero, so is the symbolic bound. This happens @@ -8721,9 +8721,15 @@ ScalarEvolution::ExitLimit::ExitLimit( assert((isa(ConstantMaxNotTaken) || isa(ConstantMaxNotTaken)) && "No point in having a non-constant max backedge taken count!"); - for (const auto *PredSet : PredSetList) - for (const auto *P : *PredSet) - addPredicate(P); + SmallPtrSet SeenPreds; + for (const auto PredList : PredLists) + for (const auto *P : PredList) { + if (SeenPreds.contains(P)) + continue; + assert(!isa(P) && "Only add leaf predicates here!"); + SeenPreds.insert(P); + Predicates.push_back(P); + } assert((isa(E) || !E->getType()->isPointerTy()) && "Backedge count should be int"); assert((isa(ConstantMaxNotTaken) || @@ -8731,12 +8737,13 @@ ScalarEvolution::ExitLimit::ExitLimit( "Max backedge count should be int"); } -ScalarEvolution::ExitLimit::ExitLimit( - const SCEV *E, const SCEV *ConstantMaxNotTaken, - const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, - const SmallPtrSetImpl &PredSet) +ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, + const SCEV *ConstantMaxNotTaken, + const SCEV *SymbolicMaxNotTaken, + bool MaxOrZero, + ArrayRef PredList) : ExitLimit(E, ConstantMaxNotTaken, SymbolicMaxNotTaken, MaxOrZero, - { &PredSet }) {} + ArrayRef({PredList})) {} /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. @@ -9098,7 +9105,7 @@ ScalarEvolution::computeExitLimitFromCondFromBinOp( SymbolicMaxBECount = isa(BECount) ? 
ConstantMaxBECount : BECount; return ExitLimit(BECount, ConstantMaxBECount, SymbolicMaxBECount, false, - { &EL0.Predicates, &EL1.Predicates }); + {ArrayRef(EL0.Predicates), ArrayRef(EL1.Predicates)}); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp( @@ -10129,8 +10136,11 @@ const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const { /// A and B isn't important. /// /// If the equation does not have a solution, SCEVCouldNotCompute is returned. -static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, - ScalarEvolution &SE) { +static const SCEV * +SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, + SmallVectorImpl *Predicates, + + ScalarEvolution &SE) { uint32_t BW = A.getBitWidth(); assert(BW == SE.getTypeSizeInBits(B->getType())); assert(A != 0 && "A must be non-zero."); @@ -10146,8 +10156,22 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, // // B is divisible by D if and only if the multiplicity of prime factor 2 for B // is not less than multiplicity of this prime factor for D. - if (SE.getMinTrailingZeros(B) < Mult2) - return SE.getCouldNotCompute(); + if (SE.getMinTrailingZeros(B) < Mult2) { + // Check if we can prove there's no remainder using URem. + const SCEV *URem = + SE.getURemExpr(B, SE.getConstant(APInt::getOneBitSet(BW, Mult2))); + const SCEV *Zero = SE.getZero(B->getType()); + if (!SE.isKnownPredicate(CmpInst::ICMP_EQ, URem, Zero)) { + // Try to add a predicate ensuring B is a multiple of 1 << Mult2. + if (!Predicates) + return SE.getCouldNotCompute(); + + // Avoid adding a predicate that is known to be false. + if (SE.isKnownPredicate(CmpInst::ICMP_NE, URem, Zero)) + return SE.getCouldNotCompute(); + Predicates->push_back(SE.getEqualPredicate(URem, Zero)); + } + } // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic // modulo (N / D). 
@@ -10449,7 +10473,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V, // effectively V != 0. We know and take advantage of the fact that this // expression only being used in a comparison by zero context. - SmallPtrSet Predicates; + SmallVector Predicates; // If the value is a constant if (const SCEVConstant *C = dyn_cast(V)) { // If the value is already zero, the branch will execute zero times. @@ -10577,8 +10601,9 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V, // Solve the general equation. if (!StepC || StepC->getValue()->isZero()) return getCouldNotCompute(); - const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(), - getNegativeSCEV(Start), *this); + const SCEV *E = SolveLinEquationWithOverflow( + StepC->getAPInt(), getNegativeSCEV(Start), + AllowPredicates ? &Predicates : nullptr, *this); const SCEV *M = E; if (E != getCouldNotCompute()) { @@ -12867,7 +12892,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsOnlyExit, bool AllowPredicates) { - SmallPtrSet Predicates; + SmallVector Predicates; const SCEVAddRecExpr *IV = dyn_cast(LHS); bool PredicatedIV = false; @@ -13307,7 +13332,7 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, ScalarEvolution::ExitLimit ScalarEvolution::howManyGreaterThans( const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsOnlyExit, bool AllowPredicates) { - SmallPtrSet Predicates; + SmallVector Predicates; // We handle only IV > Invariant if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); @@ -13677,7 +13702,7 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, PrintSCEVWithTypeHint(OS, EC); if (isa(EC)) { // Retry with predicates. 
- SmallVector Predicates; + SmallVector Predicates; EC = SE->getPredicatedExitCount(L, ExitingBlock, &Predicates); if (!isa(EC)) { OS << "\n predicated exit count for " << ExitingBlock->getName() @@ -13729,7 +13754,7 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, PrintSCEVWithTypeHint(OS, ExitBTC); if (isa(ExitBTC)) { // Retry with predicates. - SmallVector Predicates; + SmallVector Predicates; ExitBTC = SE->getPredicatedExitCount(L, ExitingBlock, &Predicates, ScalarEvolution::SymbolicMaximum); if (!isa(ExitBTC)) { @@ -14709,7 +14734,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor { /// If \p NewPreds is non-null, rewrite is free to add further predicates to /// \p NewPreds such that the result will be an AddRecExpr. static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE, - SmallPtrSetImpl *NewPreds, + SmallVectorImpl *NewPreds, const SCEVPredicate *Pred) { SCEVPredicateRewriter Rewriter(L, SE, NewPreds, Pred); return Rewriter.visit(S); @@ -14765,9 +14790,10 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor { } private: - explicit SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE, - SmallPtrSetImpl *NewPreds, - const SCEVPredicate *Pred) + explicit SCEVPredicateRewriter( + const Loop *L, ScalarEvolution &SE, + SmallVectorImpl *NewPreds, + const SCEVPredicate *Pred) : SCEVRewriteVisitor(SE), NewPreds(NewPreds), Pred(Pred), L(L) {} bool addOverflowAssumption(const SCEVPredicate *P) { @@ -14775,7 +14801,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor { // Check if we've already made this assumption. 
return Pred && Pred->implies(P); } - NewPreds->insert(P); + NewPreds->push_back(P); return true; } @@ -14811,7 +14837,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor { return PredicatedRewrite->first; } - SmallPtrSetImpl *NewPreds; + SmallVectorImpl *NewPreds; const SCEVPredicate *Pred; const Loop *L; }; @@ -14826,8 +14852,8 @@ ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L, const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates( const SCEV *S, const Loop *L, - SmallPtrSetImpl &Preds) { - SmallPtrSet TransformPreds; + SmallVectorImpl &Preds) { + SmallVector TransformPreds; S = SCEVPredicateRewriter::rewrite(S, L, *this, &TransformPreds, nullptr); auto *AddRec = dyn_cast(S); @@ -14836,7 +14862,7 @@ const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates( // Since the transformation was successful, we can now transfer the SCEV // predicates. - Preds.insert(TransformPreds.begin(), TransformPreds.end()); + Preds.append(TransformPreds.begin(), TransformPreds.end()); return AddRec; } @@ -15083,7 +15109,7 @@ bool PredicatedScalarEvolution::hasNoOverflow( const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) { const SCEV *Expr = this->getSCEV(V); - SmallPtrSet NewPreds; + SmallVector NewPreds; auto *New = SE.convertSCEVToAddRecWithPredicates(Expr, &L, NewPreds); if (!New) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index e088c312c7b44..d84521d2e6e10 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -338,7 +338,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { for (const auto &[Name, Info] : make_early_inc_range(ForwardRefVals)) { if (StringRef(Name).starts_with("llvm.")) { - Intrinsic::ID IID = Function::lookupIntrinsicID(Name); + Intrinsic::ID IID = Intrinsic::lookupIntrinsicID(Name); if (IID == Intrinsic::not_intrinsic) // Don't do anything for unknown intrinsics. 
continue; @@ -6301,7 +6301,7 @@ bool isOldDbgFormatIntrinsic(StringRef Name) { // intrinsics in the new debug info format. if (!Name.starts_with("llvm.dbg.")) return false; - Intrinsic::ID FnID = Function::lookupIntrinsicID(Name); + Intrinsic::ID FnID = Intrinsic::lookupIntrinsicID(Name); return FnID == Intrinsic::dbg_declare || FnID == Intrinsic::dbg_value || FnID == Intrinsic::dbg_assign; } diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 5cba2cbc241e4..a692e7aef6268 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -3406,10 +3406,8 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) { OS.emitInt32(getCompleteTypeIndex(DIGV->getType()).getIndex()); OS.AddComment("DataOffset"); - uint64_t Offset = 0; - if (CVGlobalVariableOffsets.contains(DIGV)) - // Use the offset seen while collecting info on globals. - Offset = CVGlobalVariableOffsets[DIGV]; + // Use the offset seen while collecting info on globals. + uint64_t Offset = CVGlobalVariableOffsets.lookup(DIGV); OS.emitCOFFSecRel32(GVSym, Offset); OS.AddComment("Segment"); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a73a3aa59403b..a9d28a39c4418 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3958,12 +3958,10 @@ class LDVSSAUpdater { /// For a given MBB, create a wrapper block for it. Stores it in the /// LDVSSAUpdater block map. 
LDVSSABlock *getSSALDVBlock(MachineBasicBlock *BB) { - auto it = BlockMap.find(BB); - if (it == BlockMap.end()) { - BlockMap[BB] = new LDVSSABlock(*BB, *this); - it = BlockMap.find(BB); - } - return it->second; + auto [It, Inserted] = BlockMap.try_emplace(BB); + if (Inserted) + It->second = new LDVSSABlock(*BB, *this); + return It->second; } /// Find the live-in value number for the given block. Looks up the value at diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 5c095e79599f6..f157ffc6bcc2d 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -1070,9 +1070,7 @@ class VLocTracker { : DbgValue(Properties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. - auto Result = Vars.insert(std::make_pair(VarID, Rec)); - if (!Result.second) - Result.first->second = Rec; + Vars.insert_or_assign(VarID, Rec); Scopes[VarID] = MI.getDebugLoc().get(); considerOverlaps(Var, MI.getDebugLoc().get()); @@ -1100,9 +1098,7 @@ class VLocTracker { DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. - auto Result = Vars.insert(std::make_pair(OverlappedID, Rec)); - if (!Result.second) - Result.first->second = Rec; + Vars.insert_or_assign(OverlappedID, Rec); Scopes[OverlappedID] = Loc; } } diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index a0f0e27478d02..74f38e886a6b9 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -2654,7 +2654,7 @@ bool MIParser::parseIntrinsicOperand(MachineOperand &Dest) { // Find out what intrinsic we're dealing with, first try the global namespace // and then the target's private intrinsics if that fails. 
const TargetIntrinsicInfo *TII = MF.getTarget().getIntrinsicInfo(); - Intrinsic::ID ID = Function::lookupIntrinsicID(Name); + Intrinsic::ID ID = Intrinsic::lookupIntrinsicID(Name); if (ID == Intrinsic::not_intrinsic && TII) ID = static_cast(TII->lookupName(Name)); diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 3289a692221ba..793ad75759ccb 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -148,13 +148,13 @@ namespace { DenseMap> ExitBlockMap; bool isExitBlock(MachineLoop *CurLoop, const MachineBasicBlock *MBB) { - if (ExitBlockMap.contains(CurLoop)) - return is_contained(ExitBlockMap[CurLoop], MBB); - - SmallVector ExitBlocks; - CurLoop->getExitBlocks(ExitBlocks); - ExitBlockMap[CurLoop] = ExitBlocks; - return is_contained(ExitBlocks, MBB); + auto [It, Inserted] = ExitBlockMap.try_emplace(CurLoop); + if (Inserted) { + SmallVector ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + It->second = ExitBlocks; + } + return is_contained(It->second, MBB); } // Track 'estimated' register pressure. 
@@ -1010,12 +1010,8 @@ MachineLICMImpl::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, if (RCCost == 0) continue; const int *PS = TRI->getRegClassPressureSets(RC); - for (; *PS != -1; ++PS) { - if (!Cost.contains(*PS)) - Cost[*PS] = RCCost; - else - Cost[*PS] += RCCost; - } + for (; *PS != -1; ++PS) + Cost[*PS] += RCCost; } return Cost; } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index cd8333931bb5f..5475743905032 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1415,14 +1415,12 @@ class HighRegisterPressureDetector { auto Reg = Use.RegUnit; if (!TargetRegs.contains(Reg)) continue; - auto Ite = LastUseMI.find(Reg); - if (Ite == LastUseMI.end()) { - LastUseMI[Reg] = MI; - } else { + auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI); + if (!Inserted) { MachineInstr *Orig = Ite->second; MachineInstr *New = MI; if (InstrScore(Orig) < InstrScore(New)) - LastUseMI[Reg] = New; + Ite->second = New; } } } diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 07fa92889d885..0e8220ec6251c 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -50,9 +50,9 @@ static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg, void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { unsigned MBBNumber = MBB->getNumber(); - assert(MBBNumber < MBBReachingDefs.size() && + assert(MBBNumber < MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); - MBBReachingDefs[MBBNumber].resize(NumRegUnits); + MBBReachingDefs.startBasicBlock(MBBNumber, NumRegUnits); // Reset instruction counter in each basic block. CurInstr = 0; @@ -71,7 +71,7 @@ void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { // before the call. 
if (LiveRegs[Unit] != -1) { LiveRegs[Unit] = -1; - MBBReachingDefs[MBBNumber][Unit].push_back(-1); + MBBReachingDefs.append(MBBNumber, Unit, -1); } } } @@ -97,7 +97,7 @@ void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { // Insert the most recent reaching definition we found. for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) if (LiveRegs[Unit] != ReachingDefDefaultVal) - MBBReachingDefs[MBBNumber][Unit].push_back(LiveRegs[Unit]); + MBBReachingDefs.append(MBBNumber, Unit, LiveRegs[Unit]); } void ReachingDefAnalysis::leaveBasicBlock(MachineBasicBlock *MBB) { @@ -122,7 +122,7 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) { assert(!MI->isDebugInstr() && "Won't process debug instructions"); unsigned MBBNumber = MI->getParent()->getNumber(); - assert(MBBNumber < MBBReachingDefs.size() && + assert(MBBNumber < MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); for (auto &MO : MI->operands()) { @@ -136,7 +136,7 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) { // How many instructions since this reg unit was last written? if (LiveRegs[Unit] != CurInstr) { LiveRegs[Unit] = CurInstr; - MBBReachingDefs[MBBNumber][Unit].push_back(CurInstr); + MBBReachingDefs.append(MBBNumber, Unit, CurInstr); } } } @@ -146,7 +146,7 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) { void ReachingDefAnalysis::reprocessBasicBlock(MachineBasicBlock *MBB) { unsigned MBBNumber = MBB->getNumber(); - assert(MBBNumber < MBBReachingDefs.size() && + assert(MBBNumber < MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); // Count number of non-debug instructions for end of block adjustment. 
@@ -169,16 +169,16 @@ void ReachingDefAnalysis::reprocessBasicBlock(MachineBasicBlock *MBB) { if (Def == ReachingDefDefaultVal) continue; - auto Start = MBBReachingDefs[MBBNumber][Unit].begin(); - if (Start != MBBReachingDefs[MBBNumber][Unit].end() && *Start < 0) { - if (*Start >= Def) + auto Defs = MBBReachingDefs.defs(MBBNumber, Unit); + if (!Defs.empty() && Defs.front() < 0) { + if (Defs.front() >= Def) continue; // Update existing reaching def from predecessor to a more recent one. - *Start = Def; + MBBReachingDefs.replaceFront(MBBNumber, Unit, Def); } else { // Insert new reaching def from predecessor. - MBBReachingDefs[MBBNumber][Unit].insert(Start, Def); + MBBReachingDefs.prepend(MBBNumber, Unit, Def); } // Update reaching def at end of BB. Keep in mind that these are @@ -234,7 +234,7 @@ void ReachingDefAnalysis::reset() { void ReachingDefAnalysis::init() { NumRegUnits = TRI->getNumRegUnits(); - MBBReachingDefs.resize(MF->getNumBlockIDs()); + MBBReachingDefs.init(MF->getNumBlockIDs()); // Initialize the MBBOutRegsInfos MBBOutRegsInfos.resize(MF->getNumBlockIDs()); LoopTraversal Traversal; @@ -247,10 +247,11 @@ void ReachingDefAnalysis::traverse() { processBasicBlock(TraversedMBB); #ifndef NDEBUG // Make sure reaching defs are sorted and unique. 
- for (MBBDefsInfo &MBBDefs : MBBReachingDefs) { - for (MBBRegUnitDefs &RegUnitDefs : MBBDefs) { + for (unsigned MBBNumber = 0, NumBlockIDs = MF->getNumBlockIDs(); + MBBNumber != NumBlockIDs; ++MBBNumber) { + for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) { int LastDef = ReachingDefDefaultVal; - for (int Def : RegUnitDefs) { + for (int Def : MBBReachingDefs.defs(MBBNumber, Unit)) { assert(Def > LastDef && "Defs must be sorted and unique"); LastDef = Def; } @@ -265,11 +266,11 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int InstId = InstIds.lookup(MI); int DefRes = ReachingDefDefaultVal; unsigned MBBNumber = MI->getParent()->getNumber(); - assert(MBBNumber < MBBReachingDefs.size() && + assert(MBBNumber < MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); int LatestDef = ReachingDefDefaultVal; for (MCRegUnit Unit : TRI->regunits(PhysReg)) { - for (int Def : MBBReachingDefs[MBBNumber][Unit]) { + for (int Def : MBBReachingDefs.defs(MBBNumber, Unit)) { if (Def >= InstId) break; DefRes = Def; @@ -299,7 +300,8 @@ bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, int InstId) const { - assert(static_cast(MBB->getNumber()) < MBBReachingDefs.size() && + assert(static_cast(MBB->getNumber()) < + MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); assert(InstId < static_cast(MBB->size()) && "Unexpected instruction id."); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c6f6fc2508054..65a620b70d8f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10287,8 +10287,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, SDValue LeftOp = ShiftOperand.getOperand(0); SDValue RightOp = ShiftOperand.getOperand(1); - bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND; - 
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND; + // Treat zext nneg as sext - we might need to support handling these as zext + // as well in the future, but for now just prefer sext. + bool IsSignExt = sd_match(LeftOp, m_SExtLike(m_Value())); + bool IsZeroExt = sd_match(LeftOp, m_ZExt(m_Value())); if (!IsSignExt && !IsZeroExt) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 162af2d9d708a..a4b0db90abb9f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -381,14 +381,13 @@ void FastISel::updateValueMap(const Value *I, Register Reg, unsigned NumRegs) { } } -Register FastISel::getRegForGEPIndex(const Value *Idx) { +Register FastISel::getRegForGEPIndex(MVT PtrVT, const Value *Idx) { Register IdxN = getRegForValue(Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return Register(); // If the index is smaller or larger than intptr_t, truncate or extend it. - MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN); @@ -544,7 +543,8 @@ bool FastISel::selectGetElementPtr(const User *I) { uint64_t TotalOffs = 0; // FIXME: What's a good SWAG number for MaxOffs? uint64_t MaxOffs = 2048; - MVT VT = TLI.getPointerTy(DL); + MVT VT = TLI.getValueType(DL, I->getType()).getSimpleVT(); + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); GTI != E; ++GTI) { const Value *Idx = GTI.getOperand(); @@ -585,7 +585,7 @@ bool FastISel::selectGetElementPtr(const User *I) { // N = N + Idx * ElementSize; uint64_t ElementSize = GTI.getSequentialElementStride(DL); - Register IdxN = getRegForGEPIndex(Idx); + Register IdxN = getRegForGEPIndex(VT, Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. 
return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1c466ed0b7799..0a22f06271984 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4537,6 +4537,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: case ISD::VP_FMAXIMUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: case ISD::SMIN: case ISD::VP_SMIN: case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a2a232ed93b72..f19975557a0a7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8606,6 +8606,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags); } + if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + return DAG.UnrollVectorOp(Node); + // If only one operand is NaN, override it with another operand. 
if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS)) { LHS = DAG.getSelectCC(DL, LHS, LHS, RHS, LHS, ISD::SETUO); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index eb2751ab30ac5..fa3e8ad21dbd4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -1014,10 +1014,8 @@ void DWARFVerifier::verifyDebugLineRows() { DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FullPath); assert(HasFullPath && "Invalid index?"); (void)HasFullPath; - auto It = FullPathMap.find(FullPath); - if (It == FullPathMap.end()) - FullPathMap[FullPath] = FileIndex; - else if (It->second != FileIndex && DumpOpts.Verbose) { + auto [It, Inserted] = FullPathMap.try_emplace(FullPath, FileIndex); + if (!Inserted && It->second != FileIndex && DumpOpts.Verbose) { warn() << ".debug_line[" << format("0x%08" PRIx64, *toSectionOffset(Die.find(DW_AT_stmt_list))) diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index eab0dfa47e1e7..cdfcae86f79c2 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -521,10 +521,8 @@ void COFFPlatform::pushInitializersLoop(PushInitializersSendResultFn SendResult, } for (auto *DepJD : JDDepMap[CurJD]) - if (!Visited.count(DepJD)) { + if (Visited.insert(DepJD).second) Worklist.push_back(DepJD); - Visited.insert(DepJD); - } } }); diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 67c920a40ea2e..d92077dbcbd03 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -1,4 +1,5 @@ -//===------ ELFNixPlatform.cpp - Utilities for executing MachO in Orc -----===// +//===------ ELFNixPlatform.cpp - Utilities for executing ELFNix in Orc +//-----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +16,7 @@ #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h" #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/Debug.h" @@ -28,6 +30,125 @@ using namespace llvm::orc::shared; namespace { +template +shared::WrapperFunctionCall::ArgDataBufferType +getArgDataBufferType(const ArgTs &...Args) { + shared::WrapperFunctionCall::ArgDataBufferType ArgData; + ArgData.resize(SPSSerializer::size(Args...)); + SPSOutputBuffer OB(ArgData.empty() ? nullptr : ArgData.data(), + ArgData.size()); + if (SPSSerializer::serialize(OB, Args...)) + return ArgData; + return {}; +} + +std::unique_ptr createPlatformGraph(ELFNixPlatform &MOP, + std::string Name) { + unsigned PointerSize; + llvm::endianness Endianness; + const auto &TT = MOP.getExecutionSession().getTargetTriple(); + + switch (TT.getArch()) { + case Triple::x86_64: + PointerSize = 8; + Endianness = llvm::endianness::little; + break; + case Triple::aarch64: + PointerSize = 8; + Endianness = llvm::endianness::little; + break; + case Triple::ppc64: + PointerSize = 8; + Endianness = llvm::endianness::big; + break; + case Triple::ppc64le: + PointerSize = 8; + Endianness = llvm::endianness::little; + break; + default: + llvm_unreachable("Unrecognized architecture"); + } + + return std::make_unique(std::move(Name), TT, PointerSize, + Endianness, + jitlink::getGenericEdgeKindName); +} + +// Creates a Bootstrap-Complete LinkGraph to run deferred actions. 
+class ELFNixPlatformCompleteBootstrapMaterializationUnit + : public MaterializationUnit { +public: + ELFNixPlatformCompleteBootstrapMaterializationUnit( + ELFNixPlatform &MOP, StringRef PlatformJDName, + SymbolStringPtr CompleteBootstrapSymbol, DeferredRuntimeFnMap DeferredAAs, + ExecutorAddr ELFNixHeaderAddr, ExecutorAddr PlatformBootstrap, + ExecutorAddr PlatformShutdown, ExecutorAddr RegisterJITDylib, + ExecutorAddr DeregisterJITDylib) + : MaterializationUnit( + {{{CompleteBootstrapSymbol, JITSymbolFlags::None}}, nullptr}), + MOP(MOP), PlatformJDName(PlatformJDName), + CompleteBootstrapSymbol(std::move(CompleteBootstrapSymbol)), + DeferredAAsMap(std::move(DeferredAAs)), + ELFNixHeaderAddr(ELFNixHeaderAddr), + PlatformBootstrap(PlatformBootstrap), + PlatformShutdown(PlatformShutdown), RegisterJITDylib(RegisterJITDylib), + DeregisterJITDylib(DeregisterJITDylib) {} + + StringRef getName() const override { + return "ELFNixPlatformCompleteBootstrap"; + } + + void materialize(std::unique_ptr R) override { + using namespace jitlink; + auto G = createPlatformGraph(MOP, ""); + auto &PlaceholderSection = + G->createSection("__orc_rt_cplt_bs", MemProt::Read); + auto &PlaceholderBlock = + G->createZeroFillBlock(PlaceholderSection, 1, ExecutorAddr(), 1, 0); + G->addDefinedSymbol(PlaceholderBlock, 0, *CompleteBootstrapSymbol, 1, + Linkage::Strong, Scope::Hidden, false, true); + + // 1. Bootstrap the platform support code. + G->allocActions().push_back( + {cantFail(WrapperFunctionCall::Create>( + PlatformBootstrap, ELFNixHeaderAddr)), + cantFail( + WrapperFunctionCall::Create>(PlatformShutdown))}); + + // 2. Register the platform JITDylib. + G->allocActions().push_back( + {cantFail(WrapperFunctionCall::Create< + SPSArgList>( + RegisterJITDylib, PlatformJDName, ELFNixHeaderAddr)), + cantFail(WrapperFunctionCall::Create>( + DeregisterJITDylib, ELFNixHeaderAddr))}); + + // 4. Add the deferred actions to the graph. 
+ for (auto &[Fn, CallDatas] : DeferredAAsMap) { + for (auto &CallData : CallDatas) { + G->allocActions().push_back( + {WrapperFunctionCall(Fn.first->Addr, std::move(CallData.first)), + WrapperFunctionCall(Fn.second->Addr, std::move(CallData.second))}); + } + } + + MOP.getObjectLinkingLayer().emit(std::move(R), std::move(G)); + } + + void discard(const JITDylib &JD, const SymbolStringPtr &Sym) override {} + +private: + ELFNixPlatform &MOP; + StringRef PlatformJDName; + SymbolStringPtr CompleteBootstrapSymbol; + DeferredRuntimeFnMap DeferredAAsMap; + ExecutorAddr ELFNixHeaderAddr; + ExecutorAddr PlatformBootstrap; + ExecutorAddr PlatformShutdown; + ExecutorAddr RegisterJITDylib; + ExecutorAddr DeregisterJITDylib; +}; + class DSOHandleMaterializationUnit : public MaterializationUnit { public: DSOHandleMaterializationUnit(ELFNixPlatform &ENP, @@ -174,16 +295,28 @@ ELFNixPlatform::Create(ExecutionSession &ES, } Error ELFNixPlatform::setupJITDylib(JITDylib &JD) { - return JD.define( - std::make_unique(*this, DSOHandleSymbol)); + if (auto Err = JD.define(std::make_unique( + *this, DSOHandleSymbol))) + return Err; + + return ES.lookup({&JD}, DSOHandleSymbol).takeError(); } Error ELFNixPlatform::teardownJITDylib(JITDylib &JD) { + std::lock_guard Lock(PlatformMutex); + auto I = JITDylibToHandleAddr.find(&JD); + if (I != JITDylibToHandleAddr.end()) { + assert(HandleAddrToJITDylib.count(I->second) && + "HandleAddrToJITDylib missing entry"); + HandleAddrToJITDylib.erase(I->second); + JITDylibToHandleAddr.erase(I); + } return Error::success(); } Error ELFNixPlatform::notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) { + auto &JD = RT.getJITDylib(); const auto &InitSym = MU.getInitializerSymbol(); if (!InitSym) @@ -262,14 +395,16 @@ ELFNixPlatform::ELFNixPlatform( ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, Error &Err) - : ES(ES), ObjLinkingLayer(ObjLinkingLayer), + : ES(ES), 
PlatformJD(PlatformJD), ObjLinkingLayer(ObjLinkingLayer), DSOHandleSymbol(ES.intern("__dso_handle")) { ErrorAsOutParameter _(&Err); - ObjLinkingLayer.addPlugin(std::make_unique(*this)); PlatformJD.addGenerator(std::move(OrcRuntimeGenerator)); + BootstrapInfo BI; + Bootstrap = &BI; + // PlatformJD hasn't been 'set-up' by the platform yet (since we're creating // the platform now), so set it up. if (auto E2 = setupJITDylib(PlatformJD)) { @@ -277,19 +412,44 @@ ELFNixPlatform::ELFNixPlatform( return; } - RegisteredInitSymbols[&PlatformJD].add( - DSOHandleSymbol, SymbolLookupFlags::WeaklyReferencedSymbol); - - // Associate wrapper function tags with JIT-side function implementations. - if (auto E2 = associateRuntimeSupportFunctions(PlatformJD)) { - Err = std::move(E2); + // Step (2) Request runtime registration functions to trigger + // materialization.. + if ((Err = ES.lookup( + makeJITDylibSearchOrder(&PlatformJD), + SymbolLookupSet( + {PlatformBootstrap.Name, PlatformShutdown.Name, + RegisterJITDylib.Name, DeregisterJITDylib.Name, + RegisterInitSections.Name, DeregisterInitSections.Name, + RegisterObjectSections.Name, + DeregisterObjectSections.Name, CreatePThreadKey.Name})) + .takeError())) return; + + // Step (3) Wait for any incidental linker work to complete. + { + std::unique_lock Lock(BI.Mutex); + BI.CV.wait(Lock, [&]() { return BI.ActiveGraphs == 0; }); + Bootstrap = nullptr; } - // Lookup addresses of runtime functions callable by the platform, - // call the platform bootstrap function to initialize the platform-state - // object in the executor. - if (auto E2 = bootstrapELFNixRuntime(PlatformJD)) { + // Step (4) Add complete-bootstrap materialization unit and request. 
+ auto BootstrapCompleteSymbol = + ES.intern("__orc_rt_elfnix_complete_bootstrap"); + if ((Err = PlatformJD.define( + std::make_unique( + *this, PlatformJD.getName(), BootstrapCompleteSymbol, + std::move(BI.DeferredRTFnMap), BI.ELFNixHeaderAddr, + PlatformBootstrap.Addr, PlatformShutdown.Addr, + RegisterJITDylib.Addr, DeregisterJITDylib.Addr)))) + return; + if ((Err = ES.lookup(makeJITDylibSearchOrder( + &PlatformJD, JITDylibLookupFlags::MatchAllSymbols), + std::move(BootstrapCompleteSymbol)) + .takeError())) + return; + + // Associate wrapper function tags with JIT-side function implementations. + if (auto E2 = associateRuntimeSupportFunctions(PlatformJD)) { Err = std::move(E2); return; } @@ -298,17 +458,11 @@ ELFNixPlatform::ELFNixPlatform( Error ELFNixPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) { ExecutionSession::JITDispatchHandlerAssociationMap WFs; - using GetInitializersSPSSig = - SPSExpected(SPSString); - WFs[ES.intern("__orc_rt_elfnix_get_initializers_tag")] = - ES.wrapAsyncWithSPS( - this, &ELFNixPlatform::rt_getInitializers); - - using GetDeinitializersSPSSig = - SPSExpected(SPSExecutorAddr); - WFs[ES.intern("__orc_rt_elfnix_get_deinitializers_tag")] = - ES.wrapAsyncWithSPS( - this, &ELFNixPlatform::rt_getDeinitializers); + using RecordInitializersSPSSig = + SPSExpected(SPSExecutorAddr); + WFs[ES.intern("__orc_rt_elfnix_push_initializers_tag")] = + ES.wrapAsyncWithSPS( + this, &ELFNixPlatform::rt_recordInitializers); using LookupSymbolSPSSig = SPSExpected(SPSExecutorAddr, SPSString); @@ -319,110 +473,120 @@ Error ELFNixPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) { return ES.registerJITDispatchHandlers(PlatformJD, std::move(WFs)); } -void ELFNixPlatform::getInitializersBuildSequencePhase( - SendInitializerSequenceFn SendResult, JITDylib &JD, - std::vector DFSLinkOrder) { - ELFNixJITDylibInitializerSequence FullInitSeq; - { - std::lock_guard Lock(PlatformMutex); - for (auto &InitJD : reverse(DFSLinkOrder)) { - 
LLVM_DEBUG({ - dbgs() << "ELFNixPlatform: Appending inits for \"" << InitJD->getName() - << "\" to sequence\n"; - }); - auto ISItr = InitSeqs.find(InitJD.get()); - if (ISItr != InitSeqs.end()) { - FullInitSeq.emplace_back(std::move(ISItr->second)); - InitSeqs.erase(ISItr); - } - } - } +void ELFNixPlatform::pushInitializersLoop( + PushInitializersSendResultFn SendResult, JITDylibSP JD) { + DenseMap NewInitSymbols; + DenseMap> JDDepMap; + SmallVector Worklist({JD.get()}); - SendResult(std::move(FullInitSeq)); -} + ES.runSessionLocked([&]() { + while (!Worklist.empty()) { + // FIXME: Check for defunct dylibs. -void ELFNixPlatform::getInitializersLookupPhase( - SendInitializerSequenceFn SendResult, JITDylib &JD) { + auto DepJD = Worklist.back(); + Worklist.pop_back(); - auto DFSLinkOrder = JD.getDFSLinkOrder(); - if (!DFSLinkOrder) { - SendResult(DFSLinkOrder.takeError()); - return; - } + // If we've already visited this JITDylib on this iteration then continue. + if (JDDepMap.count(DepJD)) + continue; - DenseMap NewInitSymbols; - ES.runSessionLocked([&]() { - for (auto &InitJD : *DFSLinkOrder) { - auto RISItr = RegisteredInitSymbols.find(InitJD.get()); + // Add dep info. + auto &DM = JDDepMap[DepJD]; + DepJD->withLinkOrderDo([&](const JITDylibSearchOrder &O) { + for (auto &KV : O) { + if (KV.first == DepJD) + continue; + DM.push_back(KV.first); + Worklist.push_back(KV.first); + } + }); + + // Add any registered init symbols. + auto RISItr = RegisteredInitSymbols.find(DepJD); if (RISItr != RegisteredInitSymbols.end()) { - NewInitSymbols[InitJD.get()] = std::move(RISItr->second); + NewInitSymbols[DepJD] = std::move(RISItr->second); RegisteredInitSymbols.erase(RISItr); } } }); - // If there are no further init symbols to look up then move on to the next - // phase. + // If there are no further init symbols to look up then send the link order + // (as a list of header addresses) to the caller. 
if (NewInitSymbols.empty()) { - getInitializersBuildSequencePhase(std::move(SendResult), JD, - std::move(*DFSLinkOrder)); + + // To make the list intelligible to the runtime we need to convert all + // JITDylib pointers to their header addresses. Only include JITDylibs + // that appear in the JITDylibToHandleAddr map (i.e. those that have been + // through setupJITDylib) -- bare JITDylibs aren't managed by the platform. + DenseMap HeaderAddrs; + HeaderAddrs.reserve(JDDepMap.size()); + { + std::lock_guard Lock(PlatformMutex); + for (auto &KV : JDDepMap) { + auto I = JITDylibToHandleAddr.find(KV.first); + if (I != JITDylibToHandleAddr.end()) + HeaderAddrs[KV.first] = I->second; + } + } + + // Build the dep info map to return. + ELFNixJITDylibDepInfoMap DIM; + DIM.reserve(JDDepMap.size()); + for (auto &KV : JDDepMap) { + auto HI = HeaderAddrs.find(KV.first); + // Skip unmanaged JITDylibs. + if (HI == HeaderAddrs.end()) + continue; + auto H = HI->second; + ELFNixJITDylibDepInfo DepInfo; + for (auto &Dep : KV.second) { + auto HJ = HeaderAddrs.find(Dep); + if (HJ != HeaderAddrs.end()) + DepInfo.push_back(HJ->second); + } + DIM.push_back(std::make_pair(H, std::move(DepInfo))); + } + SendResult(DIM); return; } // Otherwise issue a lookup and re-run this phase when it completes. lookupInitSymbolsAsync( - [this, SendResult = std::move(SendResult), &JD](Error Err) mutable { + [this, SendResult = std::move(SendResult), JD](Error Err) mutable { if (Err) SendResult(std::move(Err)); else - getInitializersLookupPhase(std::move(SendResult), JD); + pushInitializersLoop(std::move(SendResult), JD); }, ES, std::move(NewInitSymbols)); } -void ELFNixPlatform::rt_getInitializers(SendInitializerSequenceFn SendResult, - StringRef JDName) { - LLVM_DEBUG({ - dbgs() << "ELFNixPlatform::rt_getInitializers(\"" << JDName << "\")\n"; - }); - - JITDylib *JD = ES.getJITDylibByName(JDName); - if (!JD) { - LLVM_DEBUG({ - dbgs() << " No such JITDylib \"" << JDName << "\". 
Sending error.\n"; - }); - SendResult(make_error("No JITDylib named " + JDName, - inconvertibleErrorCode())); - return; - } - - getInitializersLookupPhase(std::move(SendResult), *JD); -} - -void ELFNixPlatform::rt_getDeinitializers( - SendDeinitializerSequenceFn SendResult, ExecutorAddr Handle) { - LLVM_DEBUG({ - dbgs() << "ELFNixPlatform::rt_getDeinitializers(\"" << Handle << "\")\n"; - }); - - JITDylib *JD = nullptr; - +void ELFNixPlatform::rt_recordInitializers( + PushInitializersSendResultFn SendResult, ExecutorAddr JDHeaderAddr) { + JITDylibSP JD; { std::lock_guard Lock(PlatformMutex); - auto I = HandleAddrToJITDylib.find(Handle); + auto I = HandleAddrToJITDylib.find(JDHeaderAddr); if (I != HandleAddrToJITDylib.end()) JD = I->second; } + LLVM_DEBUG({ + dbgs() << "ELFNixPlatform::rt_recordInitializers(" << JDHeaderAddr << ") "; + if (JD) + dbgs() << "pushing initializers for " << JD->getName() << "\n"; + else + dbgs() << "No JITDylib for header address.\n"; + }); + if (!JD) { - LLVM_DEBUG(dbgs() << " No JITDylib for handle " << Handle << "\n"); - SendResult(make_error("No JITDylib associated with handle " + - formatv("{0:x}", Handle), + SendResult(make_error("No JITDylib with header addr " + + formatv("{0:x}", JDHeaderAddr), inconvertibleErrorCode())); return; } - SendResult(ELFNixJITDylibDeinitializerSequence()); + pushInitializersLoop(std::move(SendResult), JD); } void ELFNixPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult, @@ -473,116 +637,98 @@ void ELFNixPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult, RtLookupNotifyComplete(std::move(SendResult)), NoDependenciesToRegister); } -Error ELFNixPlatform::bootstrapELFNixRuntime(JITDylib &PlatformJD) { - - std::pair Symbols[] = { - {"__orc_rt_elfnix_platform_bootstrap", &orc_rt_elfnix_platform_bootstrap}, - {"__orc_rt_elfnix_platform_shutdown", &orc_rt_elfnix_platform_shutdown}, - {"__orc_rt_elfnix_register_object_sections", - &orc_rt_elfnix_register_object_sections}, - 
{"__orc_rt_elfnix_create_pthread_key", - &orc_rt_elfnix_create_pthread_key}}; - - SymbolLookupSet RuntimeSymbols; - std::vector> AddrsToRecord; - for (const auto &KV : Symbols) { - auto Name = ES.intern(KV.first); - RuntimeSymbols.add(Name); - AddrsToRecord.push_back({std::move(Name), KV.second}); - } - - auto RuntimeSymbolAddrs = ES.lookup( - {{&PlatformJD, JITDylibLookupFlags::MatchAllSymbols}}, RuntimeSymbols); - if (!RuntimeSymbolAddrs) - return RuntimeSymbolAddrs.takeError(); - - for (const auto &KV : AddrsToRecord) { - auto &Name = KV.first; - assert(RuntimeSymbolAddrs->count(Name) && "Missing runtime symbol?"); - *KV.second = (*RuntimeSymbolAddrs)[Name].getAddress(); - } - - auto PJDDSOHandle = ES.lookup( - {{&PlatformJD, JITDylibLookupFlags::MatchAllSymbols}}, DSOHandleSymbol); - if (!PJDDSOHandle) - return PJDDSOHandle.takeError(); - - if (auto Err = ES.callSPSWrapper( - orc_rt_elfnix_platform_bootstrap, - PJDDSOHandle->getAddress().getValue())) - return Err; - - // FIXME: Ordering is fuzzy here. We're probably best off saying - // "behavior is undefined if code that uses the runtime is added before - // the platform constructor returns", then move all this to the constructor. - RuntimeBootstrapped = true; - std::vector DeferredPOSRs; - { - std::lock_guard Lock(PlatformMutex); - DeferredPOSRs = std::move(BootstrapPOSRs); - } - - for (auto &D : DeferredPOSRs) - if (auto Err = registerPerObjectSections(D)) - return Err; - +Error ELFNixPlatform::ELFNixPlatformPlugin::bootstrapPipelineStart( + jitlink::LinkGraph &G) { + // Increment the active graphs count in BootstrapInfo. 
+ std::lock_guard Lock(MP.Bootstrap.load()->Mutex); + ++MP.Bootstrap.load()->ActiveGraphs; return Error::success(); } -Error ELFNixPlatform::registerInitInfo( - JITDylib &JD, ArrayRef InitSections) { - - std::unique_lock Lock(PlatformMutex); - - ELFNixJITDylibInitializers *InitSeq = nullptr; - { - auto I = InitSeqs.find(&JD); - if (I == InitSeqs.end()) { - // If there's no init sequence entry yet then we need to look up the - // header symbol to force creation of one. - Lock.unlock(); - - auto SearchOrder = - JD.withLinkOrderDo([](const JITDylibSearchOrder &SO) { return SO; }); - if (auto Err = ES.lookup(SearchOrder, DSOHandleSymbol).takeError()) - return Err; - - Lock.lock(); - I = InitSeqs.find(&JD); - assert(I != InitSeqs.end() && - "Entry missing after header symbol lookup?"); +Error ELFNixPlatform::ELFNixPlatformPlugin:: + bootstrapPipelineRecordRuntimeFunctions(jitlink::LinkGraph &G) { + // Record bootstrap function names. + std::pair RuntimeSymbols[] = { + {*MP.DSOHandleSymbol, &MP.Bootstrap.load()->ELFNixHeaderAddr}, + {*MP.PlatformBootstrap.Name, &MP.PlatformBootstrap.Addr}, + {*MP.PlatformShutdown.Name, &MP.PlatformShutdown.Addr}, + {*MP.RegisterJITDylib.Name, &MP.RegisterJITDylib.Addr}, + {*MP.DeregisterJITDylib.Name, &MP.DeregisterJITDylib.Addr}, + {*MP.RegisterObjectSections.Name, &MP.RegisterObjectSections.Addr}, + {*MP.DeregisterObjectSections.Name, &MP.DeregisterObjectSections.Addr}, + {*MP.RegisterInitSections.Name, &MP.RegisterInitSections.Addr}, + {*MP.DeregisterInitSections.Name, &MP.DeregisterInitSections.Addr}, + {*MP.CreatePThreadKey.Name, &MP.CreatePThreadKey.Addr}}; + + bool RegisterELFNixHeader = false; + + for (auto *Sym : G.defined_symbols()) { + for (auto &RTSym : RuntimeSymbols) { + if (Sym->hasName() && Sym->getName() == RTSym.first) { + if (*RTSym.second) + return make_error( + "Duplicate " + RTSym.first + + " detected during ELFNixPlatform bootstrap", + inconvertibleErrorCode()); + + if (Sym->getName() == *MP.DSOHandleSymbol) + 
RegisterELFNixHeader = true; + + *RTSym.second = Sym->getAddress(); + } } - InitSeq = &I->second; } - for (auto *Sec : InitSections) { - // FIXME: Avoid copy here. - jitlink::SectionRange R(*Sec); - InitSeq->InitSections[Sec->getName()].push_back(R.getRange()); + if (RegisterELFNixHeader) { + // If this graph defines the elfnix header symbol then create the internal + // mapping between it and PlatformJD. + std::lock_guard Lock(MP.PlatformMutex); + MP.JITDylibToHandleAddr[&MP.PlatformJD] = + MP.Bootstrap.load()->ELFNixHeaderAddr; + MP.HandleAddrToJITDylib[MP.Bootstrap.load()->ELFNixHeaderAddr] = + &MP.PlatformJD; } return Error::success(); } +Error ELFNixPlatform::ELFNixPlatformPlugin::bootstrapPipelineEnd( + jitlink::LinkGraph &G) { + std::lock_guard Lock(MP.Bootstrap.load()->Mutex); + assert(MP.Bootstrap && "DeferredAAs reset before bootstrap completed"); + --MP.Bootstrap.load()->ActiveGraphs; + // Notify Bootstrap->CV while holding the mutex because the mutex is + // also keeping Bootstrap->CV alive. 
+ if (MP.Bootstrap.load()->ActiveGraphs == 0) + MP.Bootstrap.load()->CV.notify_all(); + return Error::success(); +} + Error ELFNixPlatform::registerPerObjectSections( - const ELFPerObjectSectionsToRegister &POSR) { + jitlink::LinkGraph &G, const ELFPerObjectSectionsToRegister &POSR, + bool IsBootstrapping) { + using SPSRegisterPerObjSectionsArgs = + SPSArgList; + + if (LLVM_UNLIKELY(IsBootstrapping)) { + Bootstrap.load()->addArgumentsToRTFnMap( + &RegisterObjectSections, &DeregisterObjectSections, + getArgDataBufferType(POSR), + getArgDataBufferType(POSR)); + return Error::success(); + } - if (!orc_rt_elfnix_register_object_sections) - return make_error("Attempting to register per-object " - "sections, but runtime support has not " - "been loaded yet", - inconvertibleErrorCode()); + G.allocActions().push_back( + {cantFail(WrapperFunctionCall::Create( + RegisterObjectSections.Addr, POSR)), + cantFail(WrapperFunctionCall::Create( + DeregisterObjectSections.Addr, POSR))}); - Error ErrResult = Error::success(); - if (auto Err = ES.callSPSWrapper( - orc_rt_elfnix_register_object_sections, ErrResult, POSR)) - return Err; - return ErrResult; + return Error::success(); } Expected ELFNixPlatform::createPThreadKey() { - if (!orc_rt_elfnix_create_pthread_key) + if (!CreatePThreadKey.Addr) return make_error( "Attempting to create pthread key in target, but runtime support has " "not been loaded yet", @@ -590,7 +736,7 @@ Expected ELFNixPlatform::createPThreadKey() { Expected Result(0); if (auto Err = ES.callSPSWrapper(void)>( - orc_rt_elfnix_create_pthread_key, Result)) + CreatePThreadKey.Addr, Result)) return std::move(Err); return Result; } @@ -598,38 +744,53 @@ Expected ELFNixPlatform::createPThreadKey() { void ELFNixPlatform::ELFNixPlatformPlugin::modifyPassConfig( MaterializationResponsibility &MR, jitlink::LinkGraph &LG, jitlink::PassConfiguration &Config) { + using namespace jitlink; + + bool InBootstrapPhase = + &MR.getTargetJITDylib() == &MP.PlatformJD && MP.Bootstrap; 
+ + // If we're in the bootstrap phase then increment the active graphs. + if (InBootstrapPhase) { + Config.PrePrunePasses.push_back( + [this](LinkGraph &G) { return bootstrapPipelineStart(G); }); + Config.PostAllocationPasses.push_back([this](LinkGraph &G) { + return bootstrapPipelineRecordRuntimeFunctions(G); + }); + } // If the initializer symbol is the __dso_handle symbol then just add // the DSO handle support passes. - if (MR.getInitializerSymbol() == MP.DSOHandleSymbol) { - addDSOHandleSupportPasses(MR, Config); - // The DSOHandle materialization unit doesn't require any other - // support, so we can bail out early. - return; - } + if (auto InitSymbol = MR.getInitializerSymbol()) { + if (InitSymbol == MP.DSOHandleSymbol && !InBootstrapPhase) { + addDSOHandleSupportPasses(MR, Config); + // The DSOHandle materialization unit doesn't require any other + // support, so we can bail out early. + return; + } - // If the object contains initializers then add passes to record them. - if (MR.getInitializerSymbol()) - addInitializerSupportPasses(MR, Config); + /// Preserve init sections. + Config.PrePrunePasses.push_back( + [this, &MR](jitlink::LinkGraph &G) -> Error { + if (auto Err = preserveInitSections(G, MR)) + return Err; + return Error::success(); + }); + } // Add passes for eh-frame and TLV support. - addEHAndTLVSupportPasses(MR, Config); -} + addEHAndTLVSupportPasses(MR, Config, InBootstrapPhase); -void ELFNixPlatform::ELFNixPlatformPlugin::addInitializerSupportPasses( - MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) { - - /// Preserve init sections. - Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) -> Error { - if (auto Err = preserveInitSections(G, MR)) - return Err; - return Error::success(); + // If the object contains initializers then add passes to record them. 
+ Config.PostFixupPasses.push_back([this, &JD = MR.getTargetJITDylib(), + InBootstrapPhase](jitlink::LinkGraph &G) { + return registerInitSections(G, JD, InBootstrapPhase); }); - Config.PostFixupPasses.push_back( - [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) { - return registerInitSections(G, JD); - }); + // If we're in the bootstrap phase then steal allocation actions and then + // decrement the active graphs. + if (InBootstrapPhase) + Config.PostFixupPasses.push_back( + [this](LinkGraph &G) { return bootstrapPipelineEnd(G); }); } void ELFNixPlatform::ELFNixPlatformPlugin::addDSOHandleSupportPasses( @@ -645,16 +806,22 @@ void ELFNixPlatform::ELFNixPlatformPlugin::addDSOHandleSupportPasses( std::lock_guard Lock(MP.PlatformMutex); auto HandleAddr = (*I)->getAddress(); MP.HandleAddrToJITDylib[HandleAddr] = &JD; - assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists"); - MP.InitSeqs.insert(std::make_pair( - &JD, ELFNixJITDylibInitializers(JD.getName(), HandleAddr))); + MP.JITDylibToHandleAddr[&JD] = HandleAddr; + + G.allocActions().push_back( + {cantFail(WrapperFunctionCall::Create< + SPSArgList>( + MP.RegisterJITDylib.Addr, JD.getName(), HandleAddr)), + cantFail(WrapperFunctionCall::Create>( + MP.DeregisterJITDylib.Addr, HandleAddr))}); } return Error::success(); }); } void ELFNixPlatform::ELFNixPlatformPlugin::addEHAndTLVSupportPasses( - MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) { + MaterializationResponsibility &MR, jitlink::PassConfiguration &Config, + bool IsBootstrapping) { // Insert TLV lowering at the start of the PostPrunePasses, since we want // it to run before GOT/PLT lowering. @@ -668,7 +835,8 @@ void ELFNixPlatform::ELFNixPlatformPlugin::addEHAndTLVSupportPasses( // Add a pass to register the final addresses of the eh-frame and TLV sections // with the runtime. 
- Config.PostFixupPasses.push_back([this](jitlink::LinkGraph &G) -> Error { + Config.PostFixupPasses.push_back([this, IsBootstrapping]( + jitlink::LinkGraph &G) -> Error { ELFPerObjectSectionsToRegister POSR; if (auto *EHFrameSection = G.findSectionByName(ELFEHFrameSectionName)) { @@ -702,17 +870,7 @@ void ELFNixPlatform::ELFNixPlatformPlugin::addEHAndTLVSupportPasses( } if (POSR.EHFrameSection.Start || POSR.ThreadDataSection.Start) { - - // If we're still bootstrapping the runtime then just record this - // frame for now. - if (!MP.RuntimeBootstrapped) { - std::lock_guard Lock(MP.PlatformMutex); - MP.BootstrapPOSRs.push_back(POSR); - return Error::success(); - } - - // Otherwise register it immediately. - if (auto Err = MP.registerPerObjectSections(POSR)) + if (auto Err = MP.registerPerObjectSections(G, POSR, IsBootstrapping)) return Err; } @@ -757,28 +915,55 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::preserveInitSections( } Error ELFNixPlatform::ELFNixPlatformPlugin::registerInitSections( - jitlink::LinkGraph &G, JITDylib &JD) { - - SmallVector InitSections; - + jitlink::LinkGraph &G, JITDylib &JD, bool IsBootstrapping) { + SmallVector ELFNixPlatformSecs; LLVM_DEBUG(dbgs() << "ELFNixPlatform::registerInitSections\n"); for (auto &Sec : G.sections()) { if (isELFInitializerSection(Sec.getName())) { - InitSections.push_back(&Sec); + jitlink::SectionRange R(Sec); + ELFNixPlatformSecs.push_back(R.getRange()); } } // Dump the scraped inits. 
LLVM_DEBUG({ dbgs() << "ELFNixPlatform: Scraped " << G.getName() << " init sections:\n"; - for (auto *Sec : InitSections) { - jitlink::SectionRange R(*Sec); - dbgs() << " " << Sec->getName() << ": " << R.getRange() << "\n"; + for (auto &Sec : G.sections()) { + jitlink::SectionRange R(Sec); + dbgs() << " " << Sec.getName() << ": " << R.getRange() << "\n"; } }); - return MP.registerInitInfo(JD, InitSections); + ExecutorAddr HeaderAddr; + { + std::lock_guard Lock(MP.PlatformMutex); + auto I = MP.JITDylibToHandleAddr.find(&JD); + assert(I != MP.JITDylibToHandleAddr.end() && "No header registered for JD"); + assert(I->second && "Null header registered for JD"); + HeaderAddr = I->second; + } + + using SPSRegisterInitSectionsArgs = + SPSArgList>; + + if (LLVM_UNLIKELY(IsBootstrapping)) { + MP.Bootstrap.load()->addArgumentsToRTFnMap( + &MP.RegisterInitSections, &MP.DeregisterInitSections, + getArgDataBufferType(HeaderAddr, + ELFNixPlatformSecs), + getArgDataBufferType(HeaderAddr, + ELFNixPlatformSecs)); + return Error::success(); + } + + G.allocActions().push_back( + {cantFail(WrapperFunctionCall::Create( + MP.RegisterInitSections.Addr, HeaderAddr, ELFNixPlatformSecs)), + cantFail(WrapperFunctionCall::Create( + MP.DeregisterInitSections.Addr, HeaderAddr, ELFNixPlatformSecs))}); + + return Error::success(); } Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges( diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 70e3af941bf77..280e347739cdb 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1338,12 +1338,8 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) { void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) { assert(AS.hasAttributes() && "Doesn't need a slot!"); - as_iterator I = asMap.find(AS); - if (I != asMap.end()) - return; - - unsigned DestSlot = asNext++; - asMap[AS] = DestSlot; + if (asMap.try_emplace(AS, asNext).second) + ++asNext; } /// Create a new slot for the specified Module diff --git 
a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 3390d651d6c69..6f833acd6dbc0 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1275,6 +1275,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("rotate.")) // nvvm.rotate.{b32,b64,right.b64} Expand = Name == "b32" || Name == "b64" || Name == "right.b64"; + else if (Name.consume_front("ptr.gen.to.")) + // nvvm.ptr.gen.to.{local,shared,global,constant} + Expand = Name.starts_with("local") || Name.starts_with("shared") || + Name.starts_with("global") || Name.starts_with("constant"); + else if (Name.consume_front("ptr.")) + // nvvm.ptr.{local,shared,global,constant}.to.gen + Expand = + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || Name.consume_front("constant")) && + Name.starts_with(".to.gen"); else Expand = false; @@ -2338,6 +2348,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr, {Arg, Arg, ZExtShiftAmt}); + } else if ((Name.consume_front("ptr.gen.to.") && + (Name.starts_with("local") || Name.starts_with("shared") || + Name.starts_with("global") || Name.starts_with("constant"))) || + (Name.consume_front("ptr.") && + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || + Name.consume_front("constant")) && + Name.starts_with(".to.gen"))) { + Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType()); } else { Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic && @@ -5498,6 +5517,18 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { return Res; } + if (T.isSPARC()) { + // Add "-i128:128" + std::string I64 = "-i64:64"; + std::string I128 = "-i128:128"; + if (!StringRef(Res).contains(I128)) { + size_t Pos = 
Res.find(I64); + assert(Pos != size_t(-1) && "no i64 data layout found!"); + Res.insert(Pos + I64.size(), I128); + } + return Res; + } + if (!T.isX86()) return Res; diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index c1ca2c255aa58..ee084e870263d 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2508,7 +2508,7 @@ const char *LLVMIntrinsicCopyOverloadedName2(LLVMModuleRef Mod, unsigned ID, } unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen) { - return Function::lookupIntrinsicID({Name, NameLen}); + return Intrinsic::lookupIntrinsicID({Name, NameLen}); } LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) { diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 863900c3f14b2..052ee1fdc9390 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -952,12 +952,12 @@ static constexpr const char *const IntrinsicNameTable[] = { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_TARGET_DATA -bool Function::isTargetIntrinsic(Intrinsic::ID IID) { +bool Intrinsic::isTargetIntrinsic(Intrinsic::ID IID) { return IID > TargetInfos[0].Count; } bool Function::isTargetIntrinsic() const { - return isTargetIntrinsic(IntID); + return Intrinsic::isTargetIntrinsic(IntID); } /// Find the segment of \c IntrinsicNameTable for intrinsics with the same @@ -982,7 +982,7 @@ findTargetSubtable(StringRef Name) { /// This does the actual lookup of an intrinsic ID which matches the given /// function name. 
-Intrinsic::ID Function::lookupIntrinsicID(StringRef Name) { +Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) { auto [NameTable, Target] = findTargetSubtable(Name); int Idx = Intrinsic::lookupLLVMIntrinsicByName(NameTable, Name, Target); if (Idx == -1) @@ -1011,7 +1011,7 @@ void Function::updateAfterNameChange() { return; } HasLLVMReservedName = true; - IntID = lookupIntrinsicID(Name); + IntID = Intrinsic::lookupIntrinsicID(Name); } /// Returns a stable mangling for the type specified for use in the name diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index c5ff1e7acbfd2..d88fd09a1aa07 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -479,9 +479,7 @@ class MasmParser : public MCAsmParser { void addDirectiveHandler(StringRef Directive, ExtensionDirectiveHandler Handler) override { ExtensionDirectiveMap[Directive] = Handler; - if (!DirectiveKindMap.contains(Directive)) { - DirectiveKindMap[Directive] = DK_HANDLER_DIRECTIVE; - } + DirectiveKindMap.try_emplace(Directive, DK_HANDLER_DIRECTIVE); } void addAliasForDirective(StringRef Directive, StringRef Alias) override { diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 114045561366d..c61ba868efe60 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -754,9 +754,8 @@ static Expected> getSymbols(SymbolicFile *Obj, raw_string_ostream NameStream(Name); if (Error E = S.printName(NameStream)) return std::move(E); - if (Map->find(Name) != Map->end()) + if (!Map->try_emplace(Name, Index).second) continue; // ignore duplicated symbol - (*Map)[Name] = Index; if (Map == &SymMap->Map) { Ret.push_back(SymNames.tell()); SymNames << Name << '\0'; diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index a545ae5862397..036484c9c1c0c 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ 
b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1357,7 +1357,7 @@ void PreservedCFGCheckerInstrumentation::registerCallbacks( bool Registered = false; PIC.registerBeforeNonSkippedPassCallback([this, &MAM, Registered]( StringRef P, Any IR) mutable { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS assert(&PassStack.emplace_back(P)); #endif (void)this; @@ -1386,7 +1386,7 @@ void PreservedCFGCheckerInstrumentation::registerCallbacks( PIC.registerAfterPassInvalidatedCallback( [this](StringRef P, const PreservedAnalyses &PassPA) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS assert(PassStack.pop_back_val() == P && "Before and After callbacks must correspond"); #endif @@ -1395,7 +1395,7 @@ void PreservedCFGCheckerInstrumentation::registerCallbacks( PIC.registerAfterPassCallback([this, &MAM](StringRef P, Any IR, const PreservedAnalyses &PassPA) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS assert(PassStack.pop_back_val() == P && "Before and After callbacks must correspond"); #endif diff --git a/llvm/lib/SandboxIR/BasicBlock.cpp b/llvm/lib/SandboxIR/BasicBlock.cpp index 7eba53ffb5ec4..ebca41aa39da8 100644 --- a/llvm/lib/SandboxIR/BasicBlock.cpp +++ b/llvm/lib/SandboxIR/BasicBlock.cpp @@ -8,7 +8,7 @@ #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Context.h" -#include "llvm/SandboxIR/SandboxIR.h" // TODO: remove this +#include "llvm/SandboxIR/Instruction.h" namespace llvm::sandboxir { diff --git a/llvm/lib/SandboxIR/CMakeLists.txt b/llvm/lib/SandboxIR/CMakeLists.txt index deea86d442d39..293be1849f29d 100644 --- a/llvm/lib/SandboxIR/CMakeLists.txt +++ b/llvm/lib/SandboxIR/CMakeLists.txt @@ -3,14 +3,15 @@ add_llvm_component_library(LLVMSandboxIR BasicBlock.cpp Constant.cpp Context.cpp + Instruction.cpp Module.cpp Pass.cpp PassManager.cpp Region.cpp - SandboxIR.cpp Tracker.cpp Type.cpp User.cpp + Use.cpp Value.cpp ADDITIONAL_HEADER_DIRS diff --git 
a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index d10cb18e6d368..0a61e329b78c5 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Context.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/Module.h" namespace llvm::sandboxir { @@ -670,6 +671,8 @@ Context::Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this), LLVMIRBuilder(LLVMCtx, ConstantFolder()) {} +Context::~Context() {} + Module *Context::getModule(llvm::Module *LLVMM) const { auto It = LLVMModuleToModuleMap.find(LLVMM); if (It != LLVMModuleToModuleMap.end()) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/Instruction.cpp similarity index 96% rename from llvm/lib/SandboxIR/SandboxIR.cpp rename to llvm/lib/SandboxIR/Instruction.cpp index 5baeffef32e5e..919a44fca8b04 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1,4 +1,4 @@ -//===- SandboxIR.cpp - A transactional overlay IR on top of LLVM IR -------===// +//===- Instruction.cpp - The Instructions of Sandbox IR -------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,106 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/SandboxIR/SandboxIR.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/SandboxIR/Argument.h" -#include "llvm/SandboxIR/BasicBlock.h" -#include "llvm/Support/Debug.h" -#include +#include "llvm/SandboxIR/Instruction.h" -using namespace llvm::sandboxir; - -Value *Use::get() const { return Ctx->getValue(LLVMUse->get()); } - -void Use::set(Value *V) { - Ctx->getTracker().emplaceIfTracking(*this); - LLVMUse->set(V->Val); -} - -unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); } - -void Use::swap(Use &OtherUse) { - Ctx->getTracker().emplaceIfTracking(*this, OtherUse); - LLVMUse->swap(*OtherUse.LLVMUse); -} - -#ifndef NDEBUG -void Use::dumpOS(raw_ostream &OS) const { - Value *Def = nullptr; - if (LLVMUse == nullptr) - OS << " LLVM Use! "; - else - Def = Ctx->getValue(LLVMUse->get()); - OS << "Def: "; - if (Def == nullptr) - OS << "NULL"; - else - OS << *Def; - OS << "\n"; - - OS << "User: "; - if (Usr == nullptr) - OS << "NULL"; - else - OS << *Usr; - OS << "\n"; - - OS << "OperandNo: "; - if (Usr == nullptr) - OS << "N/A"; - else - OS << getOperandNo(); - OS << "\n"; -} - -void Use::dump() const { dumpOS(dbgs()); } -#endif // NDEBUG - -Use OperandUseIterator::operator*() const { return Use; } - -OperandUseIterator &OperandUseIterator::operator++() { - assert(Use.LLVMUse != nullptr && "Already at end!"); - User *User = Use.getUser(); - Use = User->getOperandUseInternal(Use.getOperandNo() + 1, /*Verify=*/false); - return *this; -} - -UserUseIterator &UserUseIterator::operator++() { - // Get the corresponding llvm::Use, get the next in the list, and update the - // sandboxir::Use. 
- llvm::Use *&LLVMUse = Use.LLVMUse; - assert(LLVMUse != nullptr && "Already at end!"); - LLVMUse = LLVMUse->getNext(); - if (LLVMUse == nullptr) { - Use.Usr = nullptr; - return *this; - } - auto *Ctx = Use.Ctx; - auto *LLVMUser = LLVMUse->getUser(); - Use.Usr = cast_or_null(Ctx->getValue(LLVMUser)); - return *this; -} - -OperandUseIterator OperandUseIterator::operator+(unsigned Num) const { - sandboxir::Use U = Use.getUser()->getOperandUseInternal( - Use.getOperandNo() + Num, /*Verify=*/true); - return OperandUseIterator(U); -} - -OperandUseIterator OperandUseIterator::operator-(unsigned Num) const { - assert(Use.getOperandNo() >= Num && "Out of bounds!"); - sandboxir::Use U = Use.getUser()->getOperandUseInternal( - Use.getOperandNo() - Num, /*Verify=*/true); - return OperandUseIterator(U); -} - -int OperandUseIterator::operator-(const OperandUseIterator &Other) const { - int ThisOpNo = Use.getOperandNo(); - int OtherOpNo = Other.Use.getOperandNo(); - return ThisOpNo - OtherOpNo; -} +namespace llvm::sandboxir { const char *Instruction::getOpcodeName(Opcode Opc) { switch (Opc) { @@ -2058,3 +1961,5 @@ ConstantTokenNone *ConstantTokenNone::get(Context &Ctx) { auto *LLVMC = llvm::ConstantTokenNone::get(Ctx.LLVMCtx); return cast(Ctx.getOrCreateConstant(LLVMC)); } + +} // namespace llvm::sandboxir diff --git a/llvm/lib/SandboxIR/Module.cpp b/llvm/lib/SandboxIR/Module.cpp index 7510f621556d4..a6a5fb2aae8a1 100644 --- a/llvm/lib/SandboxIR/Module.cpp +++ b/llvm/lib/SandboxIR/Module.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Module.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Value.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index 4abd39b28e87a..4168420a01ce2 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ 
b/llvm/lib/SandboxIR/PassManager.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/PassManager.h" -#include "llvm/SandboxIR/SandboxIR.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index b1f472d7928f4..abcad39330094 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -10,7 +10,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" #include using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index 87dcb726dde35..7bb788ecf25a6 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Type.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Context.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/Use.cpp b/llvm/lib/SandboxIR/Use.cpp new file mode 100644 index 0000000000000..ffbd41da51849 --- /dev/null +++ b/llvm/lib/SandboxIR/Use.cpp @@ -0,0 +1,61 @@ +//===- Use.cpp ------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/Use.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/User.h" + +namespace llvm::sandboxir { + +Value *Use::get() const { return Ctx->getValue(LLVMUse->get()); } + +void Use::set(Value *V) { + Ctx->getTracker().emplaceIfTracking(*this); + LLVMUse->set(V->Val); +} + +unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); } + +void Use::swap(Use &OtherUse) { + Ctx->getTracker().emplaceIfTracking(*this, OtherUse); + LLVMUse->swap(*OtherUse.LLVMUse); +} + +#ifndef NDEBUG +void Use::dumpOS(raw_ostream &OS) const { + Value *Def = nullptr; + if (LLVMUse == nullptr) + OS << " LLVM Use! "; + else + Def = Ctx->getValue(LLVMUse->get()); + OS << "Def: "; + if (Def == nullptr) + OS << "NULL"; + else + OS << *Def; + OS << "\n"; + + OS << "User: "; + if (Usr == nullptr) + OS << "NULL"; + else + OS << *Usr; + OS << "\n"; + + OS << "OperandNo: "; + if (Usr == nullptr) + OS << "N/A"; + else + OS << getOperandNo(); + OS << "\n"; +} + +void Use::dump() const { dumpOS(dbgs()); } +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git a/llvm/lib/SandboxIR/User.cpp b/llvm/lib/SandboxIR/User.cpp index 8afa52e32b762..148d75199439a 100644 --- a/llvm/lib/SandboxIR/User.cpp +++ b/llvm/lib/SandboxIR/User.cpp @@ -11,6 +11,50 @@ namespace llvm::sandboxir { +Use OperandUseIterator::operator*() const { return Use; } + +OperandUseIterator &OperandUseIterator::operator++() { + assert(Use.LLVMUse != nullptr && "Already at end!"); + User *User = Use.getUser(); + Use = User->getOperandUseInternal(Use.getOperandNo() + 1, /*Verify=*/false); + return *this; +} + +UserUseIterator &UserUseIterator::operator++() { + // Get the corresponding llvm::Use, get the next in the list, and update the + // sandboxir::Use. 
+ llvm::Use *&LLVMUse = Use.LLVMUse; + assert(LLVMUse != nullptr && "Already at end!"); + LLVMUse = LLVMUse->getNext(); + if (LLVMUse == nullptr) { + Use.Usr = nullptr; + return *this; + } + auto *Ctx = Use.Ctx; + auto *LLVMUser = LLVMUse->getUser(); + Use.Usr = cast_or_null(Ctx->getValue(LLVMUser)); + return *this; +} + +OperandUseIterator OperandUseIterator::operator+(unsigned Num) const { + sandboxir::Use U = Use.getUser()->getOperandUseInternal( + Use.getOperandNo() + Num, /*Verify=*/true); + return OperandUseIterator(U); +} + +OperandUseIterator OperandUseIterator::operator-(unsigned Num) const { + assert(Use.getOperandNo() >= Num && "Out of bounds!"); + sandboxir::Use U = Use.getUser()->getOperandUseInternal( + Use.getOperandNo() - Num, /*Verify=*/true); + return OperandUseIterator(U); +} + +int OperandUseIterator::operator-(const OperandUseIterator &Other) const { + int ThisOpNo = Use.getOperandNo(); + int OtherOpNo = Other.Use.getOperandNo(); + return ThisOpNo - OtherOpNo; +} + Use User::getOperandUseDefault(unsigned OpIdx, bool Verify) const { assert((!Verify || OpIdx < getNumOperands()) && "Out of bounds!"); assert(isa(Val) && "Non-users have no operands!"); diff --git a/llvm/lib/SandboxIR/Value.cpp b/llvm/lib/SandboxIR/Value.cpp index 40cf14c7e9b6f..b9d91c7e11f74 100644 --- a/llvm/lib/SandboxIR/Value.cpp +++ b/llvm/lib/SandboxIR/Value.cpp @@ -8,7 +8,7 @@ #include "llvm/SandboxIR/Value.h" #include "llvm/SandboxIR/Context.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/User.h" #include namespace llvm::sandboxir { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index fde07d84e97f5..be33331be4e8f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -285,6 +285,11 @@ static cl::opt StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", cl::init(false), cl::Hidden); +static cl::opt 
DisableMultiVectorSpillFill( + "aarch64-disable-multivector-spill-fill", + cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false), + cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// Returns how much of the incoming argument stack area (in bytes) we should @@ -2954,6 +2959,24 @@ unsigned findFreePredicateReg(BitVector &SavedRegs) { return AArch64::NoRegister; } +// The multivector LD/ST are available only for SME or SVE2p1 targets +bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget, + MachineFunction &MF) { + if (DisableMultiVectorSpillFill) + return false; + + SMEAttrs FuncAttrs(MF.getFunction()); + bool IsLocallyStreaming = + FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface(); + + // Only when in streaming mode SME2 instructions can be safely used. + // It is not safe to use SME2 instructions when in streaming compatible or + // locally streaming mode. + return Subtarget.hasSVE2p1() || + (Subtarget.hasSME2() && + (!IsLocallyStreaming && Subtarget.isStreaming())); +} + static void computeCalleeSaveRegisterPairs( MachineFunction &MF, ArrayRef CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, @@ -3330,7 +3353,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MF.getSubtarget(); AArch64FunctionInfo *AFI = MF.getInfo(); unsigned PnReg = AFI->getPredicateRegForFillSpill(); - assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) && + assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) && "Expects SVE2.1 or SME2 target and a predicate register"); #ifdef EXPENSIVE_CHECKS auto IsPPR = [](const RegPairInfo &c) { @@ -3508,7 +3531,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( [[maybe_unused]] const AArch64Subtarget &Subtarget = MF.getSubtarget(); unsigned PnReg = AFI->getPredicateRegForFillSpill(); - assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) && + assert((PnReg != 0 && 
enableMultiVectorSpillFill(Subtarget, MF)) && "Expects SVE2.1 or SME2 target and a predicate register"); #ifdef EXPENSIVE_CHECKS assert(!(PPRBegin < ZPRBegin) && @@ -3722,7 +3745,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.test(CSRegs[i ^ 1])); } - if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) { + if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) { AArch64FunctionInfo *AFI = MF.getInfo(); // Find a suitable predicate register for the multi-vector spill/fill // instructions. diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index dfb6b08b1f73b..6133580a3cd77 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -400,8 +400,10 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { return SelectSVERegRegAddrMode(N, Scale, Base, Offset); } - void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc, - uint32_t MaxImm); + void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, + unsigned Opc, uint32_t MaxImm); + + void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc); template bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { @@ -1975,9 +1977,10 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } -void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, - unsigned NumOutVecs, - unsigned Opc, uint32_t MaxImm) { +void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc, + uint32_t MaxImm) { if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) if (Imm->getZExtValue() > MaxImm) return; @@ -1985,6 +1988,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, SDValue ZtValue; if (!ImmToReg(Node->getOperand(2), ZtValue)) return; + SDValue Ops[] = {ZtValue, 
Node->getOperand(3), Node->getOperand(4)}; SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -2003,6 +2007,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, CurDAG->RemoveDeadNode(Node); } +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc) { + + SDValue ZtValue; + SmallVector Ops; + if (!ImmToReg(Node->getOperand(2), ZtValue)) + return; + + Ops.push_back(ZtValue); + Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned I = 0; I < NumOutVecs; ++I) + ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + I, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumOutVecs; + ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, unsigned Op) { SDLoc DL(N); @@ -5478,7 +5510,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H, AArch64::LUTI2_4ZTZI_S})) // Second Immediate must be <= 3: - SelectMultiVectorLuti(Node, 4, Opc, 3); + SelectMultiVectorLutiLane(Node, 4, Opc, 3); return; } case Intrinsic::aarch64_sme_luti4_lane_zt_x4: { @@ -5486,7 +5518,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { Node->getValueType(0), {0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S})) // Second Immediate must be <= 1: - SelectMultiVectorLuti(Node, 4, Opc, 1); + SelectMultiVectorLutiLane(Node, 4, Opc, 1); return; } case Intrinsic::aarch64_sme_luti2_lane_zt_x2: { @@ -5495,7 +5527,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H, AArch64::LUTI2_2ZTZI_S})) // Second Immediate must be <= 7: - SelectMultiVectorLuti(Node, 2, Opc, 7); + 
SelectMultiVectorLutiLane(Node, 2, Opc, 7); return; } case Intrinsic::aarch64_sme_luti4_lane_zt_x2: { @@ -5504,7 +5536,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H, AArch64::LUTI4_2ZTZI_S})) // Second Immediate must be <= 3: - SelectMultiVectorLuti(Node, 2, Opc, 3); + SelectMultiVectorLutiLane(Node, 2, Opc, 3); + return; + } + case Intrinsic::aarch64_sme_luti4_zt_x4: { + SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index ebe4121c944b1..e2261694d658c 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -940,7 +940,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>; let Predicates = [HasSME2, HasSME_LUTv2] in { defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>; -def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; +def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; } //[HasSME2, HasSME_LUTv2] let Predicates = [HasSME2p1, HasSME_LUTv2] in { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4abb5a63ab6d2..342d55e828bca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -57,6 +57,7 @@ FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *); ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass(); +FunctionPass *createAMDGPUReserveWWMRegsPass(); FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr); @@ -154,6 +155,9 @@ struct AMDGPULowerBufferFatPointersPass const TargetMachine &TM; }; +void initializeAMDGPUReserveWWMRegsPass(PassRegistry &); +extern char 
&AMDGPUReserveWWMRegsID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3626fd8bc78c1..dc94edf85586f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1118,7 +1118,7 @@ class GCNSubtargetFeatureGeneration (F); + + auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { + int64_t Val; + if (Value->evaluateAsAbsolute(Val)) { + Res = Val; + return true; + } + return false; + }; + + const uint64_t MaxScratchPerWorkitem = + STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); + MCSymbol *ScratchSizeSymbol = + RI.getSymbol(F.getName(), RIK::RIK_PrivateSegSize, OutContext); + uint64_t ScratchSize; + if (ScratchSizeSymbol->isVariable() && + TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) && + ScratchSize > MaxScratchPerWorkitem) { + DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem, + DS_Error); + F.getContext().diagnose(DiagStackSize); + } + + // Validate addressable scalar registers (i.e., prior to added implicit + // SGPRs). 
+ MCSymbol *NumSGPRSymbol = + RI.getSymbol(F.getName(), RIK::RIK_NumSGPR, OutContext); + if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + !STM.hasSGPRInitBug()) { + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + uint64_t NumSgpr; + if (NumSGPRSymbol->isVariable() && + TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) && + NumSgpr > MaxAddressableNumSGPRs) { + DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers", + NumSgpr, MaxAddressableNumSGPRs, + DS_Error, DK_ResourceLimit); + F.getContext().diagnose(Diag); + return; + } + } + + MCSymbol *VCCUsedSymbol = + RI.getSymbol(F.getName(), RIK::RIK_UsesVCC, OutContext); + MCSymbol *FlatUsedSymbol = + RI.getSymbol(F.getName(), RIK::RIK_UsesFlatScratch, OutContext); + uint64_t VCCUsed, FlatUsed, NumSgpr; + + if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() && + FlatUsedSymbol->isVariable() && + TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) && + TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) && + TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) { + + // Recomputes NumSgprs + implicit SGPRs but all symbols should now be + // resolvable. 
+ NumSgpr += IsaInfo::getNumExtraSGPRs( + &STM, VCCUsed, FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()); + if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || + STM.hasSGPRInitBug()) { + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + if (NumSgpr > MaxAddressableNumSGPRs) { + DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr, + MaxAddressableNumSGPRs, DS_Error, + DK_ResourceLimit); + F.getContext().diagnose(Diag); + return; + } + } + + MCSymbol *NumVgprSymbol = + RI.getSymbol(F.getName(), RIK::RIK_NumVGPR, OutContext); + MCSymbol *NumAgprSymbol = + RI.getSymbol(F.getName(), RIK::RIK_NumAGPR, OutContext); + uint64_t NumVgpr, NumAgpr; + + MachineModuleInfo &MMI = + getAnalysis().getMMI(); + MachineFunction *MF = MMI.getMachineFunction(F); + if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() && + TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) && + TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) { + const SIMachineFunctionInfo &MFI = *MF->getInfo(); + unsigned MaxWaves = MFI.getMaxWavesPerEU(); + uint64_t TotalNumVgpr = + getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr); + uint64_t NumVGPRsForWavesPerEU = std::max( + {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)}); + uint64_t NumSGPRsForWavesPerEU = std::max( + {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); + const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + STM.computeOccupancy(F, MFI.getLDSSize()), + MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), + MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM, + OutContext); + uint64_t Occupancy; + + const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute( + F, "amdgpu-waves-per-eu", {0, 0}, true); + + if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) { + DiagnosticInfoOptimizationFailure Diag( + F, F.getSubprogram(), + "failed to meet occupancy target given by 
'amdgpu-waves-per-eu' in " + "'" + + F.getName() + "': desired occupancy was " + Twine(MinWEU) + + ", final occupancy is " + Twine(Occupancy)); + F.getContext().diagnose(Diag); + return; + } + } + } +} + bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Pad with s_code_end to help tools and guard against instruction prefetch // causing stale data in caches. Arguably this should be done by the linker, @@ -371,25 +494,24 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { getTargetStreamer()->EmitCodeEnd(STI); } - return AsmPrinter::doFinalization(M); -} + // Assign expressions which can only be resolved when all other functions are + // known. + RI.finalize(OutContext); -// Print comments that apply to both callable functions and entry points. -void AMDGPUAsmPrinter::emitCommonFunctionComments( - uint32_t NumVGPR, std::optional NumAGPR, uint32_t TotalNumVGPR, - uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, - const AMDGPUMachineFunction *MFI) { - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); - if (NumAGPR) { - OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); - OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), - false); - } - OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); - OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), - false); + // Switch section and emit all GPR maximums within the processed module. 
+ OutStreamer->pushSection(); + MCSectionELF *MaxGPRSection = + OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); + OutStreamer->switchSection(MaxGPRSection); + getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), + RI.getMaxAGPRSymbol(OutContext), + RI.getMaxSGPRSymbol(OutContext)); + OutStreamer->popSection(); + + for (Function &F : M.functions()) + validateMCResourceInfo(F); + + return AsmPrinter::doFinalization(M); } SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { @@ -402,12 +524,14 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { return Str; } +// Print comments that apply to both callable functions and entry points. void AMDGPUAsmPrinter::emitCommonFunctionComments( const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, const AMDGPUMachineFunction *MFI) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); - OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false); + OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR), + false); OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false); if (NumAGPR && TotalNumVGPR) { OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false); @@ -540,6 +664,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->switchSection(ConfigSection); } + const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = + ResourceUsage->getResourceInfo(); + RI.gatherResourceInfo(MF, Info, OutContext); + if (MFI->isModuleEntryFunction()) { getSIProgramInfo(CurrentProgramInfo, MF); } @@ -571,21 +699,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(), STM.hasMAIInsts()); + { + using RIK = MCResourceInfo::ResourceInfoKind; + 
getTargetStreamer()->EmitMCResourceInfo( + RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_NumSGPR, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_UsesVCC, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_HasRecursion, OutContext), + RI.getSymbol(MF.getName(), RIK::RIK_HasIndirectCall, OutContext)); + } + if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); OutStreamer->switchSection(CommentSection); if (!MFI->isEntryFunction()) { + using RIK = MCResourceInfo::ResourceInfoKind; OutStreamer->emitRawComment(" Function info:", false); - const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = - ResourceUsage->getResourceInfo(&MF.getFunction()); + emitCommonFunctionComments( - Info.NumVGPR, - STM.hasMAIInsts() ? Info.NumAGPR : std::optional(), - Info.getTotalNumVGPRs(STM), - Info.getTotalNumSGPRs(MF.getSubtarget()), - Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); + RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext) + ->getVariableValue(), + STM.hasMAIInsts() + ? 
RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext) + ->getVariableValue() + : nullptr, + RI.createTotalNumVGPRs(MF, Ctx), + RI.createTotalNumSGPRs( + MF, + MF.getSubtarget().getTargetID().isXnackOnOrAny(), + Ctx), + RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext) + ->getVariableValue(), + getFunctionCodeSize(MF), MFI); return false; } @@ -751,10 +902,26 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const return CodeSize; } +// AccumOffset computed for the MCExpr equivalent of: +// alignTo(std::max(1, NumVGPR), 4) / 4 - 1; +static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) { + const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx); + const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx); + + // Can't be lower than 1 for subsequent alignTo. + const MCExpr *MaximumTaken = + AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx); + + // Practically, it's computing divideCeil(MaximumTaken, 4). + const MCExpr *DivCeil = MCBinaryExpr::createDiv( + AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour, + Ctx); + + return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx); +} + void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { - const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = - ResourceUsage->getResourceInfo(&MF.getFunction()); const GCNSubtarget &STM = MF.getSubtarget(); MCContext &Ctx = MF.getContext(); @@ -771,28 +938,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, return false; }; - ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR); - ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR); - ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM)); - ProgInfo.AccumOffset = - CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1); + auto GetSymRefExpr = + [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * { + MCSymbol *Sym = RI.getSymbol(MF.getName(), RIK, OutContext); + return 
MCSymbolRefExpr::create(Sym, Ctx); + }; + + using RIK = MCResourceInfo::ResourceInfoKind; + ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR); + ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR); + ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( + ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); + + ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx); ProgInfo.TgSplit = STM.isTgSplitEnabled(); - ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR); - ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize); - ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC); - ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch); + ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR); + ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize); + ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC); + ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch); ProgInfo.DynamicCallStack = - CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion); - - const uint64_t MaxScratchPerWorkitem = - STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); - uint64_t ScratchSize; - if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) && - ScratchSize > MaxScratchPerWorkitem) { - DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize, - MaxScratchPerWorkitem, DS_Error); - MF.getFunction().getContext().diagnose(DiagStackSize); - } + MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack), + GetSymRefExpr(RIK::RIK_HasRecursion), Ctx); const SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -1477,6 +1643,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AsmPrinter::getAnalysisUsage(AU); } @@ -1522,7 +1690,7 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks( // printing multiple diagnostic location and diag opts. 
EmitResourceUsageRemark("FunctionName", "Function Name", MF.getFunction().getName()); - EmitResourceUsageRemark("NumSGPR", "SGPRs", + EmitResourceUsageRemark("NumSGPR", "TotalSGPRs", getMCExprStr(CurrentProgramInfo.NumSGPR)); EmitResourceUsageRemark("NumVGPR", "VGPRs", getMCExprStr(CurrentProgramInfo.NumArchVGPR)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index f66bbde42ce27..cc8c4411805e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#include "AMDGPUMCResourceInfo.h" #include "SIProgramInfo.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -24,6 +25,7 @@ struct AMDGPUResourceUsageAnalysis; class AMDGPUTargetStreamer; class MCCodeEmitter; class MCOperand; +class MCResourceInfo; namespace AMDGPU { struct MCKernelDescriptor; @@ -40,6 +42,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter { AMDGPUResourceUsageAnalysis *ResourceUsage; + MCResourceInfo RI; + SIProgramInfo CurrentProgramInfo; std::unique_ptr HSAMetadataStream; @@ -60,11 +64,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter { void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void emitPALFunctionMetadata(const MachineFunction &MF); - void emitCommonFunctionComments(uint32_t NumVGPR, - std::optional NumAGPR, - uint32_t TotalNumVGPR, uint32_t NumSGPR, - uint64_t ScratchSize, uint64_t CodeSize, - const AMDGPUMachineFunction *MFI); void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, const MCExpr *NumSGPR, @@ -84,6 +83,11 @@ class AMDGPUAsmPrinter final : public AsmPrinter { SmallString<128> getMCExprStr(const MCExpr *Value); + /// Attempts to replace the validation that is missed in getSIProgramInfo due + /// to MCExpr being unknown. 
Invoked during doFinalization such that the + /// MCResourceInfo symbols are known. + void validateMCResourceInfo(Function &F); + public: explicit AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index 3533087bbfd1b..f832a2a55d622 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -18,15 +18,17 @@ def FeatureFMA : SubtargetFeature<"fmaf", "Enable single precision FMA (not as fast as mul+add, but fused)" >; -class SubtargetFeatureLocalMemorySize : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", +// Addressable local memory size is the maximum number of bytes of LDS that can +// be allocated to a single workgroup. +class SubtargetFeatureAddressableLocalMemorySize : SubtargetFeature< + "addressablelocalmemorysize"#Value, + "AddressableLocalMemorySize", !cast(Value), "The size of local memory in bytes" >; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; +def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; +def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; class SubtargetFeatureWavefrontSize : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d3d5bc924525f..ff8798edb3cc0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -1966,7 +1966,7 @@ bool 
AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; - Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1999,7 +1999,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp new file mode 100644 index 0000000000000..f608a9a4f470f --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp @@ -0,0 +1,224 @@ +//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief MC infrastructure to propagate the function level resource usage +/// info. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCResourceInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSymbol.h" + +using namespace llvm; + +MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK, + MCContext &OutContext) { + auto GOCS = [FuncName, &OutContext](StringRef Suffix) { + return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix)); + }; + switch (RIK) { + case RIK_NumVGPR: + return GOCS(".num_vgpr"); + case RIK_NumAGPR: + return GOCS(".num_agpr"); + case RIK_NumSGPR: + return GOCS(".numbered_sgpr"); + case RIK_PrivateSegSize: + return GOCS(".private_seg_size"); + case RIK_UsesVCC: + return GOCS(".uses_vcc"); + case RIK_UsesFlatScratch: + return GOCS(".uses_flat_scratch"); + case RIK_HasDynSizedStack: + return GOCS(".has_dyn_sized_stack"); + case RIK_HasRecursion: + return GOCS(".has_recursion"); + case RIK_HasIndirectCall: + return GOCS(".has_indirect_call"); + } + llvm_unreachable("Unexpected ResourceInfoKind."); +} + +const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName, + ResourceInfoKind RIK, + MCContext &Ctx) { + return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx), Ctx); +} + +void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { + // Assign expression to get the max register use to the max_num_Xgpr symbol. 
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext); + MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); + MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); + + auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) { + const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext); + Sym->setVariableValue(MaxExpr); + }; + + assignMaxRegSym(MaxVGPRSym, MaxVGPR); + assignMaxRegSym(MaxAGPRSym, MaxAGPR); + assignMaxRegSym(MaxSGPRSym, MaxSGPR); +} + +void MCResourceInfo::finalize(MCContext &OutContext) { + assert(!Finalized && "Cannot finalize ResourceInfo again."); + Finalized = true; + assignMaxRegs(OutContext); +} + +MCSymbol *MCResourceInfo::getMaxVGPRSymbol(MCContext &OutContext) { + return OutContext.getOrCreateSymbol("amdgpu.max_num_vgpr"); +} + +MCSymbol *MCResourceInfo::getMaxAGPRSymbol(MCContext &OutContext) { + return OutContext.getOrCreateSymbol("amdgpu.max_num_agpr"); +} + +MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) { + return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr"); +} + +void MCResourceInfo::assignResourceInfoExpr( + int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind, + const MachineFunction &MF, const SmallVectorImpl &Callees, + MCContext &OutContext) { + const MCConstantExpr *LocalConstExpr = + MCConstantExpr::create(LocalValue, OutContext); + const MCExpr *SymVal = LocalConstExpr; + if (!Callees.empty()) { + SmallVector ArgExprs; + // Avoid recursive symbol assignment. 
+ SmallPtrSet Seen; + ArgExprs.push_back(LocalConstExpr); + const Function &F = MF.getFunction(); + Seen.insert(&F); + + for (const Function *Callee : Callees) { + if (!Seen.insert(Callee).second) + continue; + MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext); + ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext)); + } + SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext); + } + MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext); + Sym->setVariableValue(SymVal); +} + +void MCResourceInfo::gatherResourceInfo( + const MachineFunction &MF, + const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI, + MCContext &OutContext) { + // Worst case VGPR use for non-hardware-entrypoints. + MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext); + MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); + MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); + + if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) { + addMaxVGPRCandidate(FRI.NumVGPR); + addMaxAGPRCandidate(FRI.NumAGPR); + addMaxSGPRCandidate(FRI.NumExplicitSGPR); + } + + auto SetMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs, + ResourceInfoKind RIK) { + if (!FRI.HasIndirectCall) { + assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF, + FRI.Callees, OutContext); + } else { + const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext); + MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK, OutContext); + const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax( + {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext); + LocalNumSym->setVariableValue(MaxWithLocal); + } + }; + + SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR); + SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR); + SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR); + + { + // The expression for private segment size should be: FRI.PrivateSegmentSize + // + max(FRI.Callees, FRI.CalleeSegmentSize) + SmallVector ArgExprs; + if (FRI.CalleeSegmentSize) + 
ArgExprs.push_back( + MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext)); + + if (!FRI.HasIndirectCall) { + for (const Function *Callee : FRI.Callees) { + MCSymbol *calleeValSym = + getSymbol(Callee->getName(), RIK_PrivateSegSize, OutContext); + ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext)); + } + } + const MCExpr *localConstExpr = + MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext); + if (!ArgExprs.empty()) { + const AMDGPUMCExpr *transitiveExpr = + AMDGPUMCExpr::createMax(ArgExprs, OutContext); + localConstExpr = + MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext); + } + getSymbol(MF.getName(), RIK_PrivateSegSize, OutContext) + ->setVariableValue(localConstExpr); + } + + auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) { + MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext); + Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext)); + }; + + if (!FRI.HasIndirectCall) { + assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC, + AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); + assignResourceInfoExpr(FRI.UsesFlatScratch, + ResourceInfoKind::RIK_UsesFlatScratch, + AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); + assignResourceInfoExpr(FRI.HasDynamicallySizedStack, + ResourceInfoKind::RIK_HasDynSizedStack, + AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); + assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion, + AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); + assignResourceInfoExpr(FRI.HasIndirectCall, + ResourceInfoKind::RIK_HasIndirectCall, + AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); + } else { + SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC); + SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch); + SetToLocal(FRI.HasDynamicallySizedStack, + ResourceInfoKind::RIK_HasDynSizedStack); + SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion); + 
SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall); + } +} + +const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF, + MCContext &Ctx) { + return AMDGPUMCExpr::createTotalNumVGPR( + getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx), + getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx); +} + +const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF, + bool hasXnack, + MCContext &Ctx) { + return MCBinaryExpr::createAdd( + getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx), + AMDGPUMCExpr::createExtraSGPRs( + getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx), + getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx), + Ctx); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h new file mode 100644 index 0000000000000..08c0c106d5aa9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h @@ -0,0 +1,102 @@ +//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief MC infrastructure to propagate the function level resource usage +/// info. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H + +#include "AMDGPUResourceUsageAnalysis.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" + +namespace llvm { + +class MCContext; +class MCSymbol; +class StringRef; +class MachineFunction; + +class MCResourceInfo { +public: + enum ResourceInfoKind { + RIK_NumVGPR, + RIK_NumAGPR, + RIK_NumSGPR, + RIK_PrivateSegSize, + RIK_UsesVCC, + RIK_UsesFlatScratch, + RIK_HasDynSizedStack, + RIK_HasRecursion, + RIK_HasIndirectCall + }; + +private: + int32_t MaxVGPR = 0; + int32_t MaxAGPR = 0; + int32_t MaxSGPR = 0; + + // Whether the MCResourceInfo has been finalized through finalize(MCContext + // &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR + // symbols to their final value. + bool Finalized = false; + + void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK, + AMDGPUMCExpr::VariantKind Kind, + const MachineFunction &MF, + const SmallVectorImpl &Callees, + MCContext &OutContext); + + // Assigns expression for Max S/V/A-GPRs to the referenced symbols. + void assignMaxRegs(MCContext &OutContext); + +public: + MCResourceInfo() = default; + void addMaxVGPRCandidate(int32_t candidate) { + MaxVGPR = std::max(MaxVGPR, candidate); + } + void addMaxAGPRCandidate(int32_t candidate) { + MaxAGPR = std::max(MaxAGPR, candidate); + } + void addMaxSGPRCandidate(int32_t candidate) { + MaxSGPR = std::max(MaxSGPR, candidate); + } + + MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK, + MCContext &OutContext); + const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK, + MCContext &Ctx); + + // Resolves the final symbols that requires the inter-function resource info + // to be resolved. 
+ void finalize(MCContext &OutContext); + + MCSymbol *getMaxVGPRSymbol(MCContext &OutContext); + MCSymbol *getMaxAGPRSymbol(MCContext &OutContext); + MCSymbol *getMaxSGPRSymbol(MCContext &OutContext); + + /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function + /// granularity. However, some resource info has to be assigned the call + /// transitive maximum or accumulative. For example, if A calls B and B's VGPR + /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore, + /// functions with indirect calls should be assigned the module level maximum. + void gatherResourceInfo( + const MachineFunction &MF, + const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI, + MCContext &OutContext); + + const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx); + const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack, + MCContext &Ctx); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp new file mode 100644 index 0000000000000..7dc492a8f7adf --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp @@ -0,0 +1,96 @@ +//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass should be invoked at the end of wwm-regalloc pipeline. +/// It identifies the WWM regs allocated during this pipeline and add +/// them to the list of reserved registers so that they won't be available for +/// per-thread VGPR allocation in the subsequent regalloc pipeline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-reserve-wwm-regs" + +namespace { + +class AMDGPUReserveWWMRegs : public MachineFunctionPass { +public: + static char ID; + + AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) { + initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Reserve WWM Registers"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE, + "AMDGPU Reserve WWM Registers", false, false) + +char AMDGPUReserveWWMRegs::ID = 0; + +char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID; + +bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + unsigned Opc = MI.getOpcode(); + if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR && + Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR) + continue; + + Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR + ? MI.getOperand(0).getReg() + : MI.getOperand(1).getReg(); + + assert(Reg.isPhysical() && + "All WWM registers should have been allocated by now."); + + MFI->reserveWWMRegister(Reg); + Changed |= true; + } + } + + // The renamable flag can't be set for reserved registers. Reset the flag for + // MOs involving wwm-regs as they will be reserved during vgpr-regalloc + // pipeline. 
+ const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (Register Reg : MFI->getWWMReservedRegs()) { + for (MachineOperand &MO : MRI.reg_operands(Reg)) + MO.setIsRenamable(false); + } + + // Now clear the NonWWMRegMask earlier set during wwm-regalloc. + MFI->clearNonWWMRegAllocMask(); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0aca99a82d197..1ee3c40d69a3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -13,14 +13,6 @@ /// The results of this analysis are used to fill the register usage, flat /// usage, etc. into hardware registers. /// -/// The analysis takes callees into account. E.g. if a function A that needs 10 -/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A -/// will return 20. -/// It is assumed that an indirect call can go into any function except -/// hardware-entrypoints. Therefore the register usage of functions with -/// indirect calls is estimated as the maximum of all non-entrypoint functions -/// in the module. 
-/// //===----------------------------------------------------------------------===// #include "AMDGPUResourceUsageAnalysis.h" @@ -28,8 +20,8 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, return false; } -int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs( - const GCNSubtarget &ST) const { - return NumExplicitSGPR + - IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch, - ST.getTargetID().isXnackOnOrAny()); -} - -int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( - const GCNSubtarget &ST) const { - return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR); -} - -bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { +bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; - MachineModuleInfo &MMI = getAnalysis().getMMI(); const TargetMachine &TM = TPC->getTM(); const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo(); - bool HasIndirectCall = false; - - CallGraph CG = CallGraph(M); - auto End = po_end(&CG); // By default, for code object v5 and later, track only the minimum scratch // size uint32_t AssumedStackSizeForDynamicSizeObjects = clAssumedStackSizeForDynamicSizeObjects; uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall; - if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || + if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= + AMDGPU::AMDHSA_COV5 || STI.getTargetTriple().getOS() == Triple::AMDPAL) { - if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 
0) + if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) AssumedStackSizeForDynamicSizeObjects = 0; - if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0) + if (!clAssumedStackSizeForExternalCall.getNumOccurrences()) AssumedStackSizeForExternalCall = 0; } - for (auto IT = po_begin(&CG); IT != End; ++IT) { - Function *F = IT->getFunction(); - if (!F || F->isDeclaration()) - continue; - - MachineFunction *MF = MMI.getMachineFunction(*F); - assert(MF && "function must have been generated already"); - - auto CI = - CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo())); - SIFunctionResourceInfo &Info = CI.first->second; - assert(CI.second && "should only be called once per function"); - Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects, - AssumedStackSizeForExternalCall); - HasIndirectCall |= Info.HasIndirectCall; - } - - // It's possible we have unreachable functions in the module which weren't - // visited by the PO traversal. Make sure we have some resource counts to - // report. 
- for (const auto &IT : CG) { - const Function *F = IT.first; - if (!F || F->isDeclaration()) - continue; - - auto CI = - CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo())); - if (!CI.second) // Skip already visited functions - continue; - - SIFunctionResourceInfo &Info = CI.first->second; - MachineFunction *MF = MMI.getMachineFunction(*F); - assert(MF && "function must have been generated already"); - Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects, - AssumedStackSizeForExternalCall); - HasIndirectCall |= Info.HasIndirectCall; - } - - if (HasIndirectCall) - propagateIndirectCallRegisterUsage(); + ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects, + AssumedStackSizeForExternalCall); return false; } AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo AMDGPUResourceUsageAnalysis::analyzeResourceUsage( - const MachineFunction &MF, const TargetMachine &TM, - uint32_t AssumedStackSizeForDynamicSizeObjects, + const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects, uint32_t AssumedStackSizeForExternalCall) const { SIFunctionResourceInfo Info; @@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( int32_t MaxVGPR = -1; int32_t MaxAGPR = -1; int32_t MaxSGPR = -1; - uint64_t CalleeFrameSize = 0; + Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( TII->getNamedOperand(MI, AMDGPU::OpName::callee); const Function *Callee = getCalleeFunction(*CalleeOp); - DenseMap::const_iterator I = - CallGraphResourceInfo.end(); // Avoid crashing on undefined behavior with an illegal call to a // kernel. 
If a callsite's calling convention doesn't match the @@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) report_fatal_error("invalid call to entry function"); + auto isSameFunction = [](const MachineFunction &MF, const Function *F) { + return F == &MF.getFunction(); + }; + + if (Callee && !isSameFunction(MF, Callee)) + Info.Callees.push_back(Callee); + bool IsIndirect = !Callee || Callee->isDeclaration(); - if (!IsIndirect) - I = CallGraphResourceInfo.find(Callee); // FIXME: Call site could have norecurse on it if (!Callee || !Callee->doesNotRecurse()) { @@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( // directly call the tail called function. If a kernel directly // calls a tail recursive function, we'll assume maximum stack size // based on the regular call instruction. - CalleeFrameSize = std::max( - CalleeFrameSize, + Info.CalleeSegmentSize = std::max( + Info.CalleeSegmentSize, static_cast(AssumedStackSizeForExternalCall)); } } - if (IsIndirect || I == CallGraphResourceInfo.end()) { - CalleeFrameSize = - std::max(CalleeFrameSize, + if (IsIndirect) { + Info.CalleeSegmentSize = + std::max(Info.CalleeSegmentSize, static_cast(AssumedStackSizeForExternalCall)); // Register usage of indirect calls gets handled later @@ -555,19 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( Info.UsesFlatScratch = ST.hasFlatAddressSpace(); Info.HasDynamicallySizedStack = true; Info.HasIndirectCall = true; - } else { - // We force CodeGen to run in SCC order, so the callee's register - // usage etc. should be the cumulative usage of all callees. 
- MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); - MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); - MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); - CalleeFrameSize = - std::max(I->second.PrivateSegmentSize, CalleeFrameSize); - Info.UsesVCC |= I->second.UsesVCC; - Info.UsesFlatScratch |= I->second.UsesFlatScratch; - Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; - Info.HasRecursion |= I->second.HasRecursion; - Info.HasIndirectCall |= I->second.HasIndirectCall; } } } @@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; Info.NumAGPR = MaxAGPR + 1; - Info.PrivateSegmentSize += CalleeFrameSize; return Info; } - -void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() { - // Collect the maximum number of registers from non-hardware-entrypoints. - // All these functions are potential targets for indirect calls. - int32_t NonKernelMaxSGPRs = 0; - int32_t NonKernelMaxVGPRs = 0; - int32_t NonKernelMaxAGPRs = 0; - - for (const auto &I : CallGraphResourceInfo) { - if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) { - auto &Info = I.getSecond(); - NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR); - NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR); - NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR); - } - } - - // Add register usage for functions with indirect calls. - // For calls to unknown functions, we assume the maximum register usage of - // all non-hardware-entrypoints in the current module. 
- for (auto &I : CallGraphResourceInfo) { - auto &Info = I.getSecond(); - if (Info.HasIndirectCall) { - Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs); - Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs); - Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs); - } - } -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h index 7f71de6749dce..92ef41f49b3ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -15,8 +15,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H -#include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { @@ -24,10 +24,9 @@ class GCNSubtarget; class MachineFunction; class TargetMachine; -struct AMDGPUResourceUsageAnalysis : public ModulePass { - static char ID; - +struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass { public: + static char ID; // Track resource usage for callee functions. struct SIFunctionResourceInfo { // Track the number of explicitly used VGPRs. 
Special registers reserved at @@ -35,48 +34,33 @@ struct AMDGPUResourceUsageAnalysis : public ModulePass { int32_t NumVGPR = 0; int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; + uint64_t CalleeSegmentSize = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; bool UsesFlatScratch = false; bool HasDynamicallySizedStack = false; bool HasRecursion = false; bool HasIndirectCall = false; - - int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; - // Total number of VGPRs is actually a combination of AGPR and VGPR - // depending on architecture - and some alignment constraints - int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; + SmallVector Callees; }; - AMDGPUResourceUsageAnalysis() : ModulePass(ID) {} + AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {} - bool doInitialization(Module &M) override { - CallGraphResourceInfo.clear(); - return ModulePass::doInitialization(M); - } + bool runOnMachineFunction(MachineFunction &MF) override; - bool runOnModule(Module &M) override; + const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.setPreservesAll(); - } - - const SIFunctionResourceInfo &getResourceInfo(const Function *F) const { - auto Info = CallGraphResourceInfo.find(F); - assert(Info != CallGraphResourceInfo.end() && - "Failed to find resource info for function"); - return Info->getSecond(); + MachineFunctionPass::getAnalysisUsage(AU); } private: SIFunctionResourceInfo - analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM, + analyzeResourceUsage(const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects, uint32_t AssumedStackSizeForExternalCall) const; - void propagateIndirectCallRegisterUsage(); - - DenseMap CallGraphResourceInfo; + SIFunctionResourceInfo ResourceInfo; }; } // namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h 
b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 49ccd2c9ae511..334322f533e54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -226,10 +226,18 @@ class AMDGPUSubtarget { return WavefrontSizeLog2; } + /// Return the maximum number of bytes of LDS available for all workgroups + /// running on the same WGP or CU. + /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is + /// limited to 64k. unsigned getLocalMemorySize() const { return LocalMemorySize; } + /// Return the maximum number of bytes of LDS that can be allocated to a + /// single workgroup. + /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has + /// 128k in total. unsigned getAddressableLocalMemorySize() const { return AddressableLocalMemorySize; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index abd50748f2cc0..1f2148c2922de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -108,6 +108,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase { : RegisterRegAllocBase(N, D, C) {} }; +class WWMRegisterRegAlloc : public RegisterRegAllocBase { +public: + WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register Reg) { @@ -122,13 +128,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, return !static_cast(TRI).isSGPRClass(RC); } -/// -{sgpr|vgpr}-regalloc=... command line option. 
+static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + const Register Reg) { + const SIMachineFunctionInfo *MFI = + MRI.getMF().getInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + return !static_cast(TRI).isSGPRClass(RC) && + MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); +} + +/// -{sgpr|wwm|vgpr}-regalloc=... command line option. static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; +static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag; static SGPRRegisterRegAlloc defaultSGPRRegAlloc("default", @@ -145,6 +162,11 @@ static cl::opt> + WWMRegAlloc("wwm-regalloc", cl::Hidden, + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for WWM registers")); static void initializeDefaultSGPRRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); @@ -164,6 +186,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() { } } +static void initializeDefaultWWMRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = WWMRegAlloc; + WWMRegisterRegAlloc::setDefault(WWMRegAlloc); + } +} + static FunctionPass *createBasicSGPRRegisterAllocator() { return createBasicRegisterAllocator(onlyAllocateSGPRs); } @@ -188,6 +219,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() { return createFastRegisterAllocator(onlyAllocateVGPRs, true); } +static FunctionPass *createBasicWWMRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateWWMRegs); +} + +static FunctionPass *createGreedyWWMRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateWWMRegs); +} + +static 
FunctionPass *createFastWWMRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateWWMRegs, false); +} + static SGPRRegisterRegAlloc basicRegAllocSGPR( "basic", "basic register allocator", createBasicSGPRRegisterAllocator); static SGPRRegisterRegAlloc greedyRegAllocSGPR( @@ -204,6 +247,14 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR( static VGPRRegisterRegAlloc fastRegAllocVGPR( "fast", "fast register allocator", createFastVGPRRegisterAllocator); +static WWMRegisterRegAlloc basicRegAllocWWMReg("basic", + "basic register allocator", + createBasicWWMRegisterAllocator); +static WWMRegisterRegAlloc + greedyRegAllocWWMReg("greedy", "greedy register allocator", + createGreedyWWMRegisterAllocator); +static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator", + createFastWWMRegisterAllocator); } // anonymous namespace static cl::opt @@ -440,6 +491,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); + initializeAMDGPUReserveWWMRegsPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); @@ -799,12 +851,11 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { const auto *LD = dyn_cast(V); - if (!LD) + if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; // It must be a generic pointer loaded. 
- assert(V->getType()->isPointerTy() && - V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); + assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); const auto *Ptr = LD->getPointerOperand(); if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) @@ -990,6 +1041,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { FunctionPass *createSGPRAllocPass(bool Optimized); FunctionPass *createVGPRAllocPass(bool Optimized); + FunctionPass *createWWMRegAllocPass(bool Optimized); FunctionPass *createRegAllocPass(bool Optimized) override; bool addRegAssignAndRewriteFast() override; @@ -1383,7 +1435,6 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { - addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1419,12 +1470,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { return createFastVGPRRegisterAllocator(); } +FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) { + // Initialize the global default. + llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag, + initializeDefaultWWMRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyWWMRegisterAllocator(); + + return createFastWWMRegisterAllocator(); +} + FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { llvm_unreachable("should not be used"); } static const char RegAllocOptNotSupportedMessage[] = - "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; + "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, " + "and -vgpr-regalloc"; bool GCNPassConfig::addRegAssignAndRewriteFast() { if (!usingDefaultRegAlloc()) @@ -1436,11 +1503,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() { // Equivalent of PEI for SGPRs. 
addPass(&SILowerSGPRSpillsLegacyID); + + // To Allocate wwm registers used in whole quad mode operations (for shaders). addPass(&SIPreAllocateWWMRegsID); - addPass(createVGPRAllocPass(false)); + // For allocating other wwm register operands. + addPass(createWWMRegAllocPass(false)); addPass(&SILowerWWMCopiesID); + addPass(&AMDGPUReserveWWMRegsID); + + // For allocating per-thread VGPRs. + addPass(createVGPRAllocPass(false)); + return true; } @@ -1460,8 +1535,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsLegacyID); + + // To Allocate wwm registers used in whole quad mode operations (for shaders). addPass(&SIPreAllocateWWMRegsID); + // For allocating other whole wave mode registers. + addPass(createWWMRegAllocPass(true)); + addPass(&SILowerWWMCopiesID); + addPass(createVirtRegRewriter(false)); + addPass(&AMDGPUReserveWWMRegsID); + + // For allocating per-thread VGPRs. addPass(createVGPRAllocPass(true)); addPreRewrite(); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 555b8cb5c6e53..e12db4ab058ed 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -203,7 +203,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNo; + MCRegister RegNo; Modifiers Mods; }; @@ -1192,10 +1192,9 @@ class AMDGPUOperand : public MCParsedAsmOperand { } static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, - unsigned RegNo, SMLoc S, - SMLoc E) { + MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique(Register, AsmParser); - Op->Reg.RegNo = RegNo; + Op->Reg.RegNo = Reg; Op->Reg.Mods = Modifiers(); Op->StartLoc = S; Op->EndLoc = E; @@ -1357,7 +1356,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ParseAMDKernelCodeTValue(StringRef ID, AMDGPUMCKernelCodeT &Header); bool ParseDirectiveAMDKernelCodeT(); // 
TODO: Possibly make subtargetHasRegister const. - bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo); + bool subtargetHasRegister(const MCRegisterInfo &MRI, MCRegister Reg); bool ParseDirectiveAMDGPUHsaKernel(); bool ParseDirectiveISAVersion(); @@ -1372,25 +1371,26 @@ class AMDGPUAsmParser : public MCTargetAsmParser { const char *AssemblerDirectiveEnd, std::string &CollectString); - bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, - RegisterKind RegKind, unsigned Reg1, SMLoc Loc); - bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + bool AddNextRegisterToList(MCRegister &Reg, unsigned &RegWidth, + RegisterKind RegKind, MCRegister Reg1, SMLoc Loc); + bool ParseAMDGPURegister(RegisterKind &RegKind, MCRegister &Reg, unsigned &RegNum, unsigned &RegWidth, bool RestoreOnFailure = false); - bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + bool ParseAMDGPURegister(RegisterKind &RegKind, MCRegister &Reg, unsigned &RegNum, unsigned &RegWidth, SmallVectorImpl &Tokens); - unsigned ParseRegularReg(RegisterKind &RegKind, unsigned &RegNum, - unsigned &RegWidth, - SmallVectorImpl &Tokens); - unsigned ParseSpecialReg(RegisterKind &RegKind, unsigned &RegNum, - unsigned &RegWidth, - SmallVectorImpl &Tokens); - unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum, - unsigned &RegWidth, SmallVectorImpl &Tokens); + MCRegister ParseRegularReg(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl &Tokens); + MCRegister ParseSpecialReg(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl &Tokens); + MCRegister ParseRegList(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl &Tokens); bool ParseRegRange(unsigned& Num, unsigned& Width); - unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum, unsigned SubReg, - unsigned RegWidth, SMLoc Loc); + MCRegister getRegularReg(RegisterKind RegKind, unsigned RegNum, + unsigned SubReg, unsigned 
RegWidth, SMLoc Loc); bool isRegister(); bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; @@ -1746,7 +1746,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { SMLoc getOperandLoc(std::function Test, const OperandVector &Operands) const; SMLoc getImmLoc(AMDGPUOperand::ImmTy Type, const OperandVector &Operands) const; - SMLoc getRegLoc(unsigned Reg, const OperandVector &Operands) const; + SMLoc getRegLoc(MCRegister Reg, const OperandVector &Operands) const; SMLoc getLitLoc(const OperandVector &Operands, bool SearchMandatoryLiterals = false) const; SMLoc getMandatoryLitLoc(const OperandVector &Operands) const; @@ -1773,7 +1773,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool validateOpSel(const MCInst &Inst); bool validateNeg(const MCInst &Inst, int OpName); bool validateDPP(const MCInst &Inst, const OperandVector &Operands); - bool validateVccOperand(unsigned Reg) const; + bool validateVccOperand(MCRegister Reg) const; bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands); bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands); bool validateMAISrc2(const MCInst &Inst, const OperandVector &Operands); @@ -1838,10 +1838,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseSOPPBrTarget(OperandVector &Operands); ParseStatus parseBoolReg(OperandVector &Operands); - bool parseSwizzleOperand(int64_t &Op, - const unsigned MinVal, - const unsigned MaxVal, - const StringRef ErrMsg, + bool parseSwizzleOperand(int64_t &Op, const unsigned MinVal, + const unsigned MaxVal, const Twine &ErrMsg, SMLoc &Loc); bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, const unsigned MinVal, @@ -1855,6 +1853,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool parseSwizzleBroadcast(int64_t &Imm); bool parseSwizzleSwap(int64_t &Imm); bool parseSwizzleReverse(int64_t &Imm); + bool parseSwizzleFFT(int64_t &Imm); + bool parseSwizzleRotate(int64_t &Imm); ParseStatus 
parseGPRIdxMode(OperandVector &Operands); int64_t parseGPRIdxMacro(); @@ -2637,7 +2637,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { return -1; } -static unsigned getSpecialRegForName(StringRef RegName) { +static MCRegister getSpecialRegForName(StringRef RegName) { return StringSwitch(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) @@ -2709,9 +2709,9 @@ ParseStatus AMDGPUAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, return ParseStatus::Success; } -bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, - RegisterKind RegKind, unsigned Reg1, - SMLoc Loc) { +bool AMDGPUAsmParser::AddNextRegisterToList(MCRegister &Reg, unsigned &RegWidth, + RegisterKind RegKind, + MCRegister Reg1, SMLoc Loc) { switch (RegKind) { case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { @@ -2824,7 +2824,7 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token, } } - return getSpecialRegForName(Str) != AMDGPU::NoRegister; + return getSpecialRegForName(Str).isValid(); } bool @@ -2833,9 +2833,9 @@ AMDGPUAsmParser::isRegister() return isRegister(getToken(), peekToken()); } -unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, - unsigned SubReg, unsigned RegWidth, - SMLoc Loc) { +MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, + unsigned SubReg, unsigned RegWidth, + SMLoc Loc) { assert(isRegularReg(RegKind)); unsigned AlignSize = 1; @@ -2847,24 +2847,24 @@ unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, if (RegNum % AlignSize != 0) { Error(Loc, "invalid register alignment"); - return AMDGPU::NoRegister; + return MCRegister(); } unsigned RegIdx = RegNum / AlignSize; int RCID = getRegClass(RegKind, RegWidth); if (RCID == -1) { Error(Loc, "invalid or unsupported register size"); - return AMDGPU::NoRegister; + return MCRegister(); } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); const MCRegisterClass RC = 
TRI->getRegClass(RCID); if (RegIdx >= RC.getNumRegs()) { Error(Loc, "register index is out of range"); - return AMDGPU::NoRegister; + return MCRegister(); } - unsigned Reg = RC.getRegister(RegIdx); + MCRegister Reg = RC.getRegister(RegIdx); if (SubReg) { Reg = TRI->getSubReg(Reg, SubReg); @@ -2919,11 +2919,12 @@ bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) { return true; } -unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, - unsigned &RegNum, unsigned &RegWidth, - SmallVectorImpl &Tokens) { +MCRegister AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl &Tokens) { assert(isToken(AsmToken::Identifier)); - unsigned Reg = getSpecialRegForName(getTokenStr()); + MCRegister Reg = getSpecialRegForName(getTokenStr()); if (Reg) { RegNum = 0; RegWidth = 32; @@ -2934,9 +2935,10 @@ unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, return Reg; } -unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, - unsigned &RegNum, unsigned &RegWidth, - SmallVectorImpl &Tokens) { +MCRegister AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl &Tokens) { assert(isToken(AsmToken::Identifier)); StringRef RegName = getTokenStr(); auto Loc = getLoc(); @@ -2944,7 +2946,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, const RegInfo *RI = getRegularRegInfo(RegName); if (!RI) { Error(Loc, "invalid register name"); - return AMDGPU::NoRegister; + return MCRegister(); } Tokens.push_back(getToken()); @@ -2962,64 +2964,65 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, // Single 32-bit register: vXX. if (!getRegNum(RegSuffix, RegNum)) { Error(Loc, "invalid register index"); - return AMDGPU::NoRegister; + return MCRegister(); } RegWidth = 32; } else { // Range of registers: v[XX:YY]. ":YY" is optional. 
if (!ParseRegRange(RegNum, RegWidth)) - return AMDGPU::NoRegister; + return MCRegister(); } return getRegularReg(RegKind, RegNum, SubReg, RegWidth, Loc); } -unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, - unsigned &RegWidth, - SmallVectorImpl &Tokens) { - unsigned Reg = AMDGPU::NoRegister; +MCRegister AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, unsigned &RegWidth, + SmallVectorImpl &Tokens) { + MCRegister Reg; auto ListLoc = getLoc(); if (!skipToken(AsmToken::LBrac, "expected a register or a list of registers")) { - return AMDGPU::NoRegister; + return MCRegister(); } // List of consecutive registers, e.g.: [s0,s1,s2,s3] auto Loc = getLoc(); if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) - return AMDGPU::NoRegister; + return MCRegister(); if (RegWidth != 32) { Error(Loc, "expected a single 32-bit register"); - return AMDGPU::NoRegister; + return MCRegister(); } for (; trySkipToken(AsmToken::Comma); ) { RegisterKind NextRegKind; - unsigned NextReg, NextRegNum, NextRegWidth; + MCRegister NextReg; + unsigned NextRegNum, NextRegWidth; Loc = getLoc(); if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth, Tokens)) { - return AMDGPU::NoRegister; + return MCRegister(); } if (NextRegWidth != 32) { Error(Loc, "expected a single 32-bit register"); - return AMDGPU::NoRegister; + return MCRegister(); } if (NextRegKind != RegKind) { Error(Loc, "registers in a list must be of the same kind"); - return AMDGPU::NoRegister; + return MCRegister(); } if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg, Loc)) - return AMDGPU::NoRegister; + return MCRegister(); } if (!skipToken(AsmToken::RBrac, "expected a comma or a closing square bracket")) { - return AMDGPU::NoRegister; + return MCRegister(); } if (isRegularReg(RegKind)) @@ -3028,22 +3031,23 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, return Reg; } -bool 
AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, - unsigned &RegNum, unsigned &RegWidth, +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, + MCRegister &Reg, unsigned &RegNum, + unsigned &RegWidth, SmallVectorImpl &Tokens) { auto Loc = getLoc(); - Reg = AMDGPU::NoRegister; + Reg = MCRegister(); if (isToken(AsmToken::Identifier)) { Reg = ParseSpecialReg(RegKind, RegNum, RegWidth, Tokens); - if (Reg == AMDGPU::NoRegister) + if (!Reg) Reg = ParseRegularReg(RegKind, RegNum, RegWidth, Tokens); } else { Reg = ParseRegList(RegKind, RegNum, RegWidth, Tokens); } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (Reg == AMDGPU::NoRegister) { + if (!Reg) { assert(Parser.hasPendingError()); return false; } @@ -3061,10 +3065,11 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, return true; } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, - unsigned &RegNum, unsigned &RegWidth, +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, + MCRegister &Reg, unsigned &RegNum, + unsigned &RegWidth, bool RestoreOnFailure /*=false*/) { - Reg = AMDGPU::NoRegister; + Reg = MCRegister(); SmallVector Tokens; if (ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, Tokens)) { @@ -3132,7 +3137,8 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); RegisterKind RegKind; - unsigned Reg, RegNum, RegWidth; + MCRegister Reg; + unsigned RegNum, RegWidth; if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { return nullptr; @@ -3735,7 +3741,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( const MCInst &Inst, const OperandVector &Operands) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); - unsigned LastSGPR = AMDGPU::NoRegister; + MCRegister LastSGPR; unsigned ConstantBusUseCount = 0; unsigned NumLiterals = 0; unsigned LiteralSize; @@ -4688,7 +4694,7 
@@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, } // Check if VCC register matches wavefront size -bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const { +bool AMDGPUAsmParser::validateVccOperand(MCRegister Reg) const { auto FB = getFeatureBits(); return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) || (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO); @@ -4820,7 +4826,7 @@ bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { if (!Op.isReg()) continue; - unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); if (!Sub) continue; @@ -6248,15 +6254,15 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { } bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, - unsigned RegNo) { - if (MRI.regsOverlap(TTMP12_TTMP13_TTMP14_TTMP15, RegNo)) + MCRegister Reg) { + if (MRI.regsOverlap(TTMP12_TTMP13_TTMP14_TTMP15, Reg)) return isGFX9Plus(); // GFX10+ has 2 more SGPRs 104 and 105. - if (MRI.regsOverlap(SGPR104_SGPR105, RegNo)) + if (MRI.regsOverlap(SGPR104_SGPR105, Reg)) return hasSGPR104_SGPR105(); - switch (RegNo) { + switch (Reg.id()) { case SRC_SHARED_BASE_LO: case SRC_SHARED_BASE: case SRC_SHARED_LIMIT_LO: @@ -6295,7 +6301,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, // No flat_scr on SI. // On GFX10Plus flat scratch is not a valid register operand and can only be // accessed with s_setreg/s_getreg. - switch (RegNo) { + switch (Reg.id()) { case FLAT_SCR: case FLAT_SCR_LO: case FLAT_SCR_HI: @@ -6307,7 +6313,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that // SI/CI have. 
- if (MRI.regsOverlap(SGPR102_SGPR103, RegNo)) + if (MRI.regsOverlap(SGPR102_SGPR103, Reg)) return hasSGPR102_SGPR103(); return true; @@ -7105,7 +7111,7 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { if (Op.isOff()) { assert(SrcIdx < 4); OperandIdx[SrcIdx] = Inst.size(); - Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister)); + Inst.addOperand(MCOperand::createReg(MCRegister())); ++SrcIdx; continue; } @@ -7128,12 +7134,12 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) { Compr = true; Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]); - Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister); - Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister); + Inst.getOperand(OperandIdx[2]).setReg(MCRegister()); + Inst.getOperand(OperandIdx[3]).setReg(MCRegister()); } for (auto i = 0; i < SrcIdx; ++i) { - if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) { + if (Inst.getOperand(OperandIdx[i]).getReg()) { EnMask |= Compr? 
(0x3 << i * 2) : (0x1 << i); } } @@ -7902,9 +7908,8 @@ AMDGPUAsmParser::getImmLoc(AMDGPUOperand::ImmTy Type, return getOperandLoc(Test, Operands); } -SMLoc -AMDGPUAsmParser::getRegLoc(unsigned Reg, - const OperandVector &Operands) const { +SMLoc AMDGPUAsmParser::getRegLoc(MCRegister Reg, + const OperandVector &Operands) const { auto Test = [=](const AMDGPUOperand& Op) { return Op.isRegKind() && Op.getReg() == Reg; }; @@ -7996,12 +8001,9 @@ encodeBitmaskPerm(const unsigned AndMask, (XorMask << BITMASK_XOR_SHIFT); } -bool -AMDGPUAsmParser::parseSwizzleOperand(int64_t &Op, - const unsigned MinVal, - const unsigned MaxVal, - const StringRef ErrMsg, - SMLoc &Loc) { +bool AMDGPUAsmParser::parseSwizzleOperand(int64_t &Op, const unsigned MinVal, + const unsigned MaxVal, + const Twine &ErrMsg, SMLoc &Loc) { if (!skipToken(AsmToken::Comma, "expected a comma")) { return false; } @@ -8166,6 +8168,54 @@ AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) { return true; } +bool AMDGPUAsmParser::parseSwizzleFFT(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!AMDGPU::isGFX9Plus(getSTI())) { + Error(getLoc(), "FFT mode swizzle not supported on this GPU"); + return false; + } + + int64_t Swizzle; + SMLoc Loc; + if (!parseSwizzleOperand(Swizzle, 0, FFT_SWIZZLE_MAX, + "FFT swizzle must be in the interval [0," + + Twine(FFT_SWIZZLE_MAX) + Twine(']'), + Loc)) + return false; + + Imm = FFT_MODE_ENC | Swizzle; + return true; +} + +bool AMDGPUAsmParser::parseSwizzleRotate(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!AMDGPU::isGFX9Plus(getSTI())) { + Error(getLoc(), "Rotate mode swizzle not supported on this GPU"); + return false; + } + + SMLoc Loc; + int64_t Direction; + + if (!parseSwizzleOperand(Direction, 0, 1, + "direction must be 0 (left) or 1 (right)", Loc)) + return false; + + int64_t RotateSize; + if (!parseSwizzleOperand( + RotateSize, 0, ROTATE_MAX_SIZE, + "number of threads to rotate must be in the interval [0," + + 
Twine(ROTATE_MAX_SIZE) + Twine(']'), + Loc)) + return false; + + Imm = ROTATE_MODE_ENC | (Direction << ROTATE_DIR_SHIFT) | + (RotateSize << ROTATE_SIZE_SHIFT); + return true; +} + bool AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) { @@ -8200,6 +8250,10 @@ AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { Ok = parseSwizzleSwap(Imm); } else if (trySkipId(IdSymbolic[ID_REVERSE])) { Ok = parseSwizzleReverse(Imm); + } else if (trySkipId(IdSymbolic[ID_FFT])) { + Ok = parseSwizzleFFT(Imm); + } else if (trySkipId(IdSymbolic[ID_ROTATE])) { + Ok = parseSwizzleRotate(Imm); } else { Error(ModeLoc, "expected a swizzle mode"); } @@ -9298,7 +9352,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, if (IsVOP3CvtSrDpp) { if (Src2ModIdx == static_cast(Inst.getNumOperands())) { Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createReg(MCRegister())); } } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 7c883cc2017dd..fed29c3e14aae 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp + AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -94,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPURegBankSelect.cpp AMDGPURegisterBankInfo.cpp AMDGPURemoveIncompatibleFunctions.cpp + AMDGPUReserveWWMRegs.cpp AMDGPUResourceUsageAnalysis.cpp AMDGPURewriteOutArguments.cpp AMDGPURewriteUndefForPHI.cpp diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ca4be01736c1f..9eedcc636fd94 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1038,18 +1038,18 @@ void 
AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { return; // Widen the register to the correct number of enabled channels. - unsigned NewVdata = AMDGPU::NoRegister; + MCRegister NewVdata; if (DstSize != Info->VDataDwords) { auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass; // Get first subregister of VData - unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); - unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); + MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg(); + MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &MRI.getRegClass(DataRCID)); - if (NewVdata == AMDGPU::NoRegister) { + if (!NewVdata) { // It's possible to encode this such that the low register + enabled // components exceeds the register count. return; @@ -1059,11 +1059,11 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // If not using NSA on GFX10+, widen vaddr0 address register to correct size. // If using partial NSA on GFX11+ widen last address register. int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx; - unsigned NewVAddrSA = AMDGPU::NoRegister; + MCRegister NewVAddrSA; if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) && AddrSize != Info->VAddrDwords) { - unsigned VAddrSA = MI.getOperand(VAddrSAIdx).getReg(); - unsigned VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); + MCRegister VAddrSA = MI.getOperand(VAddrSAIdx).getReg(); + MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); VAddrSA = VAddrSubSA ? 
VAddrSubSA : VAddrSA; auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 52c24a5c25ec2..187d337a98a0b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -143,11 +143,10 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, if (LDSBankCount == 0) LDSBankCount = 32; - if (TT.getArch() == Triple::amdgcn && LocalMemorySize == 0) - LocalMemorySize = 32768; - - AddressableLocalMemorySize = LocalMemorySize; + if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0) + AddressableLocalMemorySize = 32768; + LocalMemorySize = AddressableLocalMemorySize; if (AMDGPU::isGFX10Plus(*this) && !getFeatureBits().test(AMDGPU::FeatureCuMode)) LocalMemorySize *= 2; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 94bf5e4b95270..dd8d93c3f0b72 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -315,10 +315,10 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, } } -void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, +void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI) { #if !defined(NDEBUG) - switch (RegNo) { + switch (Reg.id()) { case AMDGPU::FP_REG: case AMDGPU::SP_REG: case AMDGPU::PRIVATE_RSRC_REG: @@ -328,7 +328,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } #endif - O << getRegisterName(RegNo); + O << getRegisterName(Reg); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -1503,8 +1503,21 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, O << " offset:"; - if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { + // Rotate and FFT modes + if (Imm >= 
ROTATE_MODE_LO && AMDGPU::isGFX9Plus(STI)) { + if (Imm >= FFT_MODE_LO) { + O << "swizzle(" << IdSymbolic[ID_FFT] << ',' << (Imm & FFT_SWIZZLE_MASK) + << ')'; + } else if (Imm >= ROTATE_MODE_LO) { + O << "swizzle(" << IdSymbolic[ID_ROTATE] << ',' + << ((Imm >> ROTATE_DIR_SHIFT) & ROTATE_DIR_MASK) << ',' + << ((Imm >> ROTATE_SIZE_SHIFT) & ROTATE_SIZE_MASK) << ')'; + } + return; + } + // Basic mode + if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; for (unsigned I = 0; I < LANE_NUM; ++I) { O << ","; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 4d44db5d9d818..a72e0fe6ea769 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -32,7 +32,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printRegName(raw_ostream &OS, MCRegister Reg) const override; void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &O) override; - static void printRegOperand(unsigned RegNo, raw_ostream &O, + static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI); private: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 2af1f91973025..f5e05f6bd658a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -489,7 +489,7 @@ void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); RegEnc |= MRI.getEncodingValue(Reg); RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { @@ -518,7 +518,7 @@ void AMDGPUMCCodeEmitter::getSDWAVopcDstEncoding( const 
MCOperand &MO = MI.getOperand(OpNo); - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { RegEnc |= MRI.getEncodingValue(Reg); RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; @@ -530,7 +530,7 @@ void AMDGPUMCCodeEmitter::getSDWAVopcDstEncoding( void AMDGPUMCCodeEmitter::getAVOperandEncoding( const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - unsigned Reg = MI.getOperand(OpNo).getReg(); + MCRegister Reg = MI.getOperand(OpNo).getReg(); unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; bool IsVGPROrAGPR = diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 73d466abc66f7..a1a41d6cc8c6a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, << Alignment.value() << '\n'; } +void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, + const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, + const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, + const MCSymbol *HasIndirectCall) { +#define PRINT_RES_INFO(ARG) \ + OS << "\t.set "; \ + ARG->print(OS, getContext().getAsmInfo()); \ + OS << ", "; \ + ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \ + Streamer.addBlankLine(); + + PRINT_RES_INFO(NumVGPR); + PRINT_RES_INFO(NumAGPR); + PRINT_RES_INFO(NumExplicitSGPR); + PRINT_RES_INFO(PrivateSegmentSize); + PRINT_RES_INFO(UsesVCC); + PRINT_RES_INFO(UsesFlatScratch); + PRINT_RES_INFO(HasDynamicallySizedStack); + PRINT_RES_INFO(HasRecursion); + PRINT_RES_INFO(HasIndirectCall); +#undef PRINT_RES_INFO +} + 
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, + const MCSymbol *MaxAGPR, + const MCSymbol *MaxSGPR) { +#define PRINT_RES_INFO(ARG) \ + OS << "\t.set "; \ + ARG->print(OS, getContext().getAsmInfo()); \ + OS << ", "; \ + ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \ + Streamer.addBlankLine(); + + PRINT_RES_INFO(MaxVGPR); + PRINT_RES_INFO(MaxAGPR); + PRINT_RES_INFO(MaxSGPR); +#undef PRINT_RES_INFO +} + bool AMDGPUTargetAsmStreamer::EmitISAVersion() { OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n"; return true; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index bf1538c71d154..6a91ad06de5d1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -60,6 +60,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) { } + virtual void EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, + const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, + const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, + const MCSymbol *HasIndirectCall) {}; + + virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, + const MCSymbol *MaxAGPR, + const MCSymbol *MaxSGPR) {}; + /// \returns True on success, false on failure. 
virtual bool EmitISAVersion() { return true; } @@ -136,6 +147,18 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; + void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, + const MCSymbol *PrivateSegmentSize, + const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, + const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, + const MCSymbol *HasIndirectCall) override; + + void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, + const MCSymbol *MaxSGPR) override; + /// \returns True on success, false on failure. bool EmitISAVersion() override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp index 56a23e26b8d9f..7a9ed80bd1a6c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp @@ -141,7 +141,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - switch (Op.getReg()) { + switch (Op.getReg().id()) { // This is the default predicate state, so we don't need to print it. 
case R600::PRED_SEL_OFF: break; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index fa040d548f64c..134f30518d501 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -52,7 +52,7 @@ class R600MCCodeEmitter : public MCCodeEmitter { void emit(uint32_t value, SmallVectorImpl &CB) const; void emit(uint64_t value, SmallVectorImpl &CB) const; - unsigned getHWReg(unsigned regNo) const; + unsigned getHWReg(MCRegister Reg) const; uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl &Fixups, @@ -145,8 +145,8 @@ void R600MCCodeEmitter::emit(uint64_t Value, SmallVectorImpl &CB) const { support::endian::write(CB, Value, llvm::endianness::little); } -unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { - return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +unsigned R600MCCodeEmitter::getHWReg(MCRegister Reg) const { + return MRI.getEncodingValue(Reg) & HW_REG_MASK; } uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, diff --git a/llvm/lib/Target/AMDGPU/R600Processors.td b/llvm/lib/Target/AMDGPU/R600Processors.td index 8cf8edd1254fe..0265a976c9825 100644 --- a/llvm/lib/Target/AMDGPU/R600Processors.td +++ b/llvm/lib/Target/AMDGPU/R600Processors.td @@ -53,13 +53,13 @@ def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700", >; def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen", - [FeatureFetchLimit16, FeatureLocalMemorySize32768] + [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768] >; def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", "northern-islands", [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] + FeatureAddressableLocalMemorySize32768] >; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp index e5a8c5cf3baf6..fd5a87999cf81 100644 
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp @@ -29,7 +29,7 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), InstrItins(getInstrItineraryForCPU(GPU)) { - AddressableLocalMemorySize = LocalMemorySize; + LocalMemorySize = AddressableLocalMemorySize; } R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index fb3d83ca30d19..07c80bd2575f0 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -841,9 +841,12 @@ enum Id : unsigned { // id of symbolic names ID_BITMASK_PERM, ID_SWAP, ID_REVERSE, - ID_BROADCAST + ID_BROADCAST, + ID_FFT, + ID_ROTATE }; +// clang-format off enum EncBits : unsigned { // swizzle mode encodings @@ -854,6 +857,14 @@ enum EncBits : unsigned { BITMASK_PERM_ENC = 0x0000, BITMASK_PERM_ENC_MASK = 0x8000, + FFT_MODE_ENC = 0xE000, + + ROTATE_MODE_ENC = 0xC000, + FFT_ROTATE_MODE_MASK = 0xF000, + + ROTATE_MODE_LO = 0xC000, + FFT_MODE_LO = 0xE000, + // QUAD_PERM encodings LANE_MASK = 0x3, @@ -869,8 +880,21 @@ enum EncBits : unsigned { BITMASK_AND_SHIFT = 0, BITMASK_OR_SHIFT = 5, - BITMASK_XOR_SHIFT = 10 + BITMASK_XOR_SHIFT = 10, + + // FFT encodings + + FFT_SWIZZLE_MASK = 0x1F, + FFT_SWIZZLE_MAX = 0x1F, + + // ROTATE encodings + ROTATE_MAX_SIZE = 0x1F, + ROTATE_DIR_SHIFT = 10, // bit position of rotate direction + ROTATE_DIR_MASK = 0x1, + ROTATE_SIZE_SHIFT = 5, // bit position of rotate size + ROTATE_SIZE_MASK = ROTATE_MAX_SIZE, }; +// clang-format on } // namespace Swizzle diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 07505110476b5..3d1657392884f 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1341,13 +1341,6 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - // Allocate spill slots for WWM reserved VGPRs. - for (Register Reg : FuncInfo->getWWMReservedRegs()) { - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); - FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), - TRI->getSpillAlign(*RC)); - } - const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() && EnableSpillVGPRToAGPR; @@ -1573,11 +1566,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) return; - MFI->shiftSpillPhysVGPRsToLowestRange(MF); - TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); - if (MFI->isEntryFunction()) - return; const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1587,19 +1576,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, MachineInstr *ReturnMI = nullptr; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - // WRITELANE instructions used for SGPR spills can overwrite the inactive - // lanes of VGPRs and callee must spill and restore them even if they are - // marked Caller-saved. - - // TODO: Handle this elsewhere at an early point. Walking through all MBBs - // here would be a bad heuristic. A better way should be by calling - // allocateWWMSpill during the regalloc pipeline whenever a physical - // register is allocated for the intended virtual registers. - if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) - MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); - else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR) - MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); - else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) + // TODO: Walking through all MBBs here would be a bad heuristic. 
Better + // handle them elsewhere. + if (TII->isWWMRegSpillOpcode(MI.getOpcode())) NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || @@ -1614,6 +1593,23 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, } } + SmallVector SortedWWMVGPRs; + for (Register Reg : MFI->getWWMReservedRegs()) { + // The shift-back is needed only for the VGPRs used for SGPR spills and they + // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM + // reserved registers. + const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); + if (TRI->getRegSizeInBits(*RC) > 32) + continue; + SortedWWMVGPRs.push_back(Reg); + } + + sort(SortedWWMVGPRs, std::greater()); + MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs); + + if (MFI->isEntryFunction()) + return; + // Remove any VGPRs used in the return value because these do not need to be saved. // This prevents CSR restore from clobbering return VGPRs. if (ReturnMI) { @@ -1623,6 +1619,13 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, } } + // Create the stack objects for WWM registers now. + for (Register Reg : MFI->getWWMReservedRegs()) { + const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); + MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), + TRI->getSpillAlign(*RC)); + } + // Ignore the SGPRs the default implementation found. SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); @@ -1638,14 +1641,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, // allow the default insertion to handle them. for (auto &Reg : MFI->getWWMSpills()) SavedVGPRs.reset(Reg.first); - - // Mark all lane VGPRs as BB LiveIns. 
- for (MachineBasicBlock &MBB : MF) { - for (auto &Reg : MFI->getWWMSpills()) - MBB.addLiveIn(Reg.first); - - MBB.sortUniqueLiveIns(); - } } void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index eb2f20f89de08..d559d0446b9d8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -753,7 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); - setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, + setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM, + ISD::FMAXIMUMNUM}, {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, Custom); @@ -5842,6 +5843,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: case ISD::UADDSAT: case ISD::USUBSAT: case ISD::SADDSAT: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 44ee5c56a237b..5c39b2a4fc96a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8899,11 +8899,10 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, } uint16_t Opcode = MI.getOpcode(); - // FIXME: Copies inserted in the block prolog for live-range split should also - // be included. 
return IsNullOrVectorRegister && - (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY && - MI.modifiesRegister(AMDGPU::EXEC, &RI))); + (isSGPRSpill(Opcode) || + (!MI.isTerminator() && Opcode != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI))); } MachineInstrBuilder diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9afb29d95abd7..8073aca7f197f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -931,6 +931,7 @@ def SI_SPILL_S32_TO_VGPR : PseudoInstSI <(outs VGPR_32:$vdst), let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; + let hasExtraDefRegAllocReq = 1; let Constraints = "$vdst = $vdst_in"; } @@ -941,6 +942,7 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; + let hasExtraSrcRegAllocReq = 1; } } // End Spill = 1, VALU = 1, isConvergent = 1 diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 35e5bea9ae16e..822336ebaf5dc 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -21,6 +21,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -33,12 +34,18 @@ using MBBVector = SmallVector; namespace { +static cl::opt MaxNumVGPRsForWwmAllocation( + "amdgpu-num-vgprs-for-wwm-alloc", + cl::desc("Max num VGPRs for whole-wave register allocation."), + cl::ReallyHidden, cl::init(10)); + class SILowerSGPRSpills { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; SlotIndexes *Indexes = nullptr; + MachineDominatorTree *MDT = nullptr; // Save and Restore 
blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. @@ -46,13 +53,17 @@ class SILowerSGPRSpills { MBBVector RestoreBlocks; public: - SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes) - : LIS(LIS), Indexes(Indexes) {} + SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes, + MachineDominatorTree *MDT) + : LIS(LIS), Indexes(Indexes), MDT(MDT) {} bool run(MachineFunction &MF); void calculateSaveRestoreBlocks(MachineFunction &MF); bool spillCalleeSavedRegs(MachineFunction &MF, SmallVectorImpl &CalleeSavedFIs); - void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); + void updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr); + void determineRegsForWWMAllocation(MachineFunction &MF, BitVector &RegMask); }; class SILowerSGPRSpillsLegacy : public MachineFunctionPass { @@ -64,6 +75,7 @@ class SILowerSGPRSpillsLegacy : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -84,6 +96,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpillsLegacy, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -266,51 +279,90 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( return false; } -void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, - LiveIntervals *LIS) { - // TODO: This is a workaround to avoid the unmodelled liveness computed with - // whole-wave virtual registers when allocated together with the regular VGPR - // virtual registers. 
Presently, the liveness computed during the regalloc is - // only uniform (or single lane aware) and it doesn't take account of the - // divergent control flow that exists for our GPUs. Since the WWM registers - // can modify inactive lanes, the wave-aware liveness should be computed for - // the virtual registers to accurately plot their interferences. Without - // having the divergent CFG for the function, it is difficult to implement the - // wave-aware liveness info. Until then, we conservatively extend the liveness - // of the wwm registers into the entire function so that they won't be reused - // without first spilling/splitting their liveranges. - SIMachineFunctionInfo *MFI = MF.getInfo(); - - // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - for (MachineBasicBlock *SaveBlock : SaveBlocks) { - MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); - DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore); - auto MIB = BuildMI(*SaveBlock, InsertBefore, DL, - TII->get(AMDGPU::IMPLICIT_DEF), Reg); - MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); - // Set SGPR_SPILL asm printer flag - MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); - if (LIS) { - LIS->InsertMachineInstrInMaps(*MIB); +void SILowerSGPRSpills::updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr) { + // For the Def of a virtual LaneVPGR to dominate all its uses, we should + // insert an IMPLICIT_DEF before the dominating spill. Switching to a + // depth first order doesn't really help since the machine function can be in + // the unstructured control flow post-SSA. For each virtual register, hence + // finding the common dominator to get either the dominating spill or a block + // dominating all spills. 
+ SIMachineFunctionInfo *FuncInfo = + MBB->getParent()->getInfo(); + ArrayRef VGPRSpills = + FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI); + Register PrevLaneVGPR; + for (auto &Spill : VGPRSpills) { + if (PrevLaneVGPR == Spill.VGPR) + continue; + + PrevLaneVGPR = Spill.VGPR; + auto I = LaneVGPRDomInstr.find(Spill.VGPR); + if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { + // Initially add the spill instruction itself for Insertion point. + LaneVGPRDomInstr[Spill.VGPR] = InsertPt; + } else { + assert(I != LaneVGPRDomInstr.end()); + auto PrevInsertPt = I->second; + MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); + if (DomMBB == MBB) { + // The insertion point earlier selected in a predecessor block whose + // spills are currently being lowered. The earlier InsertPt would be + // the one just before the block terminator and it should be changed + // if we insert any new spill in it. + if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) + I->second = InsertPt; + + continue; } + + // Find the common dominator block between PrevInsertPt and the + // current spill. + DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); + if (DomMBB == MBB) + I->second = InsertPt; + else if (DomMBB != PrevInsertPt->getParent()) + I->second = &(*DomMBB->getFirstTerminator()); } } +} - // Insert the KILL in the return blocks to extend their liveness untill the - // end of function. Insert a separate KILL for each VGPR. - for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { - MachineBasicBlock::iterator InsertBefore = - RestoreBlock->getFirstTerminator(); - DebugLoc DL = RestoreBlock->findDebugLoc(InsertBefore); - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL, - TII->get(TargetOpcode::KILL)); - MIB.addReg(Reg); - if (LIS) - LIS->InsertMachineInstrInMaps(*MIB); +void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, + BitVector &RegMask) { + // Determine an optimal number of VGPRs for WWM allocation. 
The complement + // list will be available for allocating other VGPR virtual registers. + SIMachineFunctionInfo *MFI = MF.getInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + BitVector ReservedRegs = TRI->getReservedRegs(MF); + BitVector NonWwmAllocMask(TRI->getNumRegs()); + + // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future + // to have a balanced allocation between WWM values and per-thread vector + // register operands. + unsigned NumRegs = MaxNumVGPRsForWwmAllocation; + NumRegs = + std::min(static_cast(MFI->getSGPRSpillVGPRs().size()), NumRegs); + + auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + // Try to use the highest available registers for now. Later after + // vgpr-regalloc, they can be shifted to the lowest range. + unsigned I = 0; + for (unsigned Reg = AMDGPU::VGPR0 + MaxNumVGPRs - 1; + (I < NumRegs) && (Reg >= AMDGPU::VGPR0); --Reg) { + if (!ReservedRegs.test(Reg) && + !MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/true)) { + TRI->markSuperRegs(RegMask, Reg); + ++I; } } + + if (I != NumRegs) { + // Reserve an arbitrary register and report the error. + TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); + MF.getFunction().getContext().emitError( + "can't find enough VGPRs for wwm-regalloc"); + } } bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) { @@ -318,7 +370,9 @@ bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) { LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; auto *SIWrapper = getAnalysisIfAvailable(); SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr; - return SILowerSGPRSpills(LIS, Indexes).run(MF); + MachineDominatorTree *MDT = + &getAnalysis().getDomTree(); + return SILowerSGPRSpills(LIS, Indexes, MDT).run(MF); } bool SILowerSGPRSpills::run(MachineFunction &MF) { @@ -361,6 +415,9 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { // To track the spill frame indices handled in this pass. 
BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + // To track the IMPLICIT_DEF insertion point for the lane vgprs. + DenseMap LaneVGPRDomInstr; + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (!TII->isSGPRSpill(MI)) @@ -390,6 +447,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { "failed to spill SGPR to physical VGPR lane when allocated"); } } else { + MachineInstrSpan MIS(&MI, &MBB); if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS); @@ -397,21 +455,47 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { llvm_unreachable( "failed to spill SGPR to virtual VGPR lane when allocated"); SpillFIs.set(FI); + updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); SpilledToVirtVGPRLanes = true; } } } } - if (SpilledToVirtVGPRLanes) { - extendWWMVirtRegLiveness(MF, LIS); + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { + auto InsertPt = LaneVGPRDomInstr[Reg]; + // Insert the IMPLICIT_DEF at the identified points. + MachineBasicBlock &Block = *InsertPt->getParent(); + DebugLoc DL = Block.findDebugLoc(InsertPt); + auto MIB = + BuildMI(Block, *InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg); + + // Add WWM flag to the virtual register. + FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); + + // Set SGPR_SPILL asm printer flag + MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); if (LIS) { - // Compute the LiveInterval for the newly created virtual registers. - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - LIS->createAndComputeVirtRegInterval(Reg); + LIS->InsertMachineInstrInMaps(*MIB); + LIS->createAndComputeVirtRegInterval(Reg); } } + // Determine the registers for WWM allocation and also compute the register + // mask for non-wwm VGPR allocation. 
+ if (FuncInfo->getSGPRSpillVGPRs().size()) { + BitVector WwmRegMask(TRI->getNumRegs()); + + determineRegsForWWMAllocation(MF, WwmRegMask); + + BitVector NonWwmRegMask(WwmRegMask); + NonWwmRegMask.flip().clearBitsNotInMask(TRI->getAllVGPRRegMask()); + + // The complement set will be the registers for non-wwm (per-thread) vgpr + // allocation. + FuncInfo->updateNonWWMRegMask(NonWwmRegMask); + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the @@ -468,6 +552,7 @@ SILowerSGPRSpillsPass::run(MachineFunction &MF, MFPropsModifier _(*this, MF); auto *LIS = MFAM.getCachedResult(MF); auto *Indexes = MFAM.getCachedResult(MF); - SILowerSGPRSpills(LIS, Indexes).run(MF); + MachineDominatorTree *MDT = &MFAM.getResult(MF); + SILowerSGPRSpills(LIS, Indexes, MDT).run(MF); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index f59d29bd81403..8be9a082a7fd0 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -325,11 +325,13 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } -void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( - MachineFunction &MF) { +void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange( + MachineFunction &MF, SmallVectorImpl &WWMVGPRs, + BitVector &SavedVGPRs) { const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - for (Register &Reg : SpillPhysVGPRs) { + for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) { + Register Reg = WWMVGPRs[I]; Register NewReg = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (!NewReg || NewReg >= Reg) @@ -338,10 +340,22 @@ void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( MRI.replaceRegWith(Reg, NewReg); // 
Update various tables with the new VGPR. + WWMVGPRs[I] = NewReg; WWMReservedRegs.remove(Reg); WWMReservedRegs.insert(NewReg); - WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg])); - WWMSpills.erase(Reg); + MRI.reserveReg(NewReg, TRI); + + // Replace the register in SpillPhysVGPRs. This is needed to look for free + // lanes while spilling special SGPRs like FP, BP, etc. during PEI. + auto RegItr = std::find(SpillPhysVGPRs.begin(), SpillPhysVGPRs.end(), Reg); + if (RegItr != SpillPhysVGPRs.end()) { + unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr); + SpillPhysVGPRs[Idx] = NewReg; + } + + // The generic `determineCalleeSaves` might have set the old register if it + // is in the CSR range. + SavedVGPRs.reset(Reg); for (MachineBasicBlock &MBB : MF) { MBB.removeLiveIn(Reg); @@ -386,7 +400,9 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( return false; } - allocateWWMSpill(MF, LaneVGPR); + if (IsPrologEpilog) + allocateWWMSpill(MF, LaneVGPR); + reserveWWMRegister(LaneVGPR); for (MachineBasicBlock &MBB : MF) { MBB.addLiveIn(LaneVGPR); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index aff0b34947d68..669f98dd865d6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -524,6 +524,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // the VGPR and its stack slot index. WWMSpillsMap WWMSpills; + // Before allocation, the VGPR registers are partitioned into two distinct + // sets, the first one for WWM-values and the second set for non-WWM values. + // The latter set should be reserved during WWM-regalloc. + BitVector NonWWMRegMask; + using ReservedRegSet = SmallSetVector; // To track the VGPRs reserved for WWM instructions. 
They get stack slots // later during PrologEpilogInserter and get added into the superset WWMSpills @@ -590,6 +595,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, void reserveWWMRegister(Register Reg) { WWMReservedRegs.insert(Reg); } + void updateNonWWMRegMask(BitVector &RegMask) { NonWWMRegMask = RegMask; } + BitVector getNonWWMRegMask() const { return NonWWMRegMask; } + void clearNonWWMRegAllocMask() { NonWWMRegMask.clear(); } + SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef @@ -729,9 +738,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, I->second.IsDead = true; } - // To bring the Physical VGPRs in the highest range allocated for CSR SGPR - // spilling into the lowest available range. - void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF); + // To bring the allocated WWM registers in \p WWMVGPRs to the lowest available + // range. + void shiftWwmVGPRsToLowestRange(MachineFunction &MF, + SmallVectorImpl &WWMVGPRs, + BitVector &SavedVGPRs); bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool SpillToPhysVGPRLane = false, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 2d1cd1bda3afe..d7421a1ceff0f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -561,6 +561,37 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } +std::pair +SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxNumAGPRs = MaxNumVGPRs; + unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. 
Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (ST.hasGFX90AInsts()) { + if (MFI->usesAGPRs(MF)) { + MaxNumVGPRs /= 2; + MaxNumAGPRs = MaxNumVGPRs; + } else { + if (MaxNumVGPRs > TotalNumVGPRs) { + MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; + MaxNumVGPRs = TotalNumVGPRs; + } else + MaxNumAGPRs = 0; + } + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -668,30 +699,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxNumAGPRs = MaxNumVGPRs; - unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. 
- if (ST.hasGFX90AInsts()) { - if (MFI->usesAGPRs(MF)) { - MaxNumVGPRs /= 2; - MaxNumAGPRs = MaxNumVGPRs; - } else { - if (MaxNumVGPRs > TotalNumVGPRs) { - MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; - MaxNumVGPRs = TotalNumVGPRs; - } else - MaxNumAGPRs = 0; - } - } + auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { @@ -724,6 +732,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy()); } + // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The + // MFI->getNonWWMRegMask() field will have a valid bitmask only during + // wwm-regalloc and it would be empty otherwise. + BitVector NonWWMRegMask = MFI->getNonWWMRegMask(); + if (!NonWWMRegMask.empty()) { + for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs; + RegI < RegE; ++RegI) { + if (NonWWMRegMask.test(RegI)) + reserveRegisterTuples(Reserved, RegI); + } + } + for (Register Reg : MFI->getWWMReservedRegs()) reserveRegisterTuples(Reserved, Reg); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 88d5686720985..409e5418abc8e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -82,6 +82,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. 
+ std::pair + getMaxNumVectorRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 5f7549c2921ed..a8e4ce133ffbc 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -645,14 +645,18 @@ unsigned const DfmtNfmt2UFmtGFX11[] = { namespace Swizzle { +// clang-format off // This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. -const char* const IdSymbolic[] = { +const char *const IdSymbolic[] = { "QUAD_PERM", "BITMASK_PERM", "SWAP", "REVERSE", "BROADCAST", + "FFT", + "ROTATE", }; +// clang-format on } // namespace Swizzle diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f32c82f1e4ba4..feec7b47ae294 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -911,9 +911,9 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI) { } unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { - if (STI->getFeatureBits().test(FeatureLocalMemorySize32768)) + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize32768)) return 32768; - if (STI->getFeatureBits().test(FeatureLocalMemorySize65536)) + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) return 65536; return 0; } @@ -2219,9 +2219,9 @@ int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, return std::max(ArgNumVGPR, ArgNumAGPR); } -bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { +bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); + const 
MCRegister FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || Reg == AMDGPU::SCC; } @@ -2232,7 +2232,7 @@ bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI) { #define MAP_REG2REG \ using namespace AMDGPU; \ - switch(Reg) { \ + switch(Reg.id()) { \ default: return Reg; \ CASE_CI_VI(FLAT_SCR) \ CASE_CI_VI(FLAT_SCR_LO) \ @@ -2287,7 +2287,7 @@ bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI) { #define CASE_GFXPRE11_GFX11PLUS_TO(node, result) \ case node: return isGFX11Plus(STI) ? result##_gfx11plus : result##_gfxpre11; -unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { +MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) return Reg; MAP_REG2REG @@ -2303,9 +2303,7 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { #define CASE_GFXPRE11_GFX11PLUS(node) case node##_gfx11plus: case node##_gfxpre11: return node; #define CASE_GFXPRE11_GFX11PLUS_TO(node, result) -unsigned mc2PseudoReg(unsigned Reg) { - MAP_REG2REG -} +MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } bool isInlineValue(unsigned Reg) { switch (Reg) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index da37534f2fa4f..d1d84394cc070 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1316,18 +1316,18 @@ unsigned hasKernargPreload(const MCSubtargetInfo &STI); bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); /// Is Reg - scalar register -bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); +bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI); /// \returns if \p Reg occupies the high 16-bits of a 32-bit register. bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. 
-unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); +MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI); /// Convert hardware register \p Reg to a pseudo register LLVM_READNONE -unsigned mc2PseudoReg(unsigned Reg); +MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE bool isInlineValue(unsigned Reg); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index a53bf70d77717..92d09b3afa77d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val, const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); } - ExprIt->getSecond() = Val; } else if (N.getKind() == msgpack::Type::UInt) { const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); - int64_t Unused; - if (!Val->evaluateAsAbsolute(Unused)) - REM[Reg] = Val; - (void)Unused; + } else { + // Default to uint64_t 0 so additional calls to setRegister will allow + // propagate ORs. 
+ N = (uint64_t)0; } + REM[Reg] = Val; DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val); } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp index ffda703a24ade..178877aa00682 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp @@ -86,16 +86,16 @@ void AVRInstPrinter::printInst(const MCInst *MI, uint64_t Address, } } -const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum, +const char *AVRInstPrinter::getPrettyRegisterName(MCRegister Reg, MCRegisterInfo const &MRI) { // GCC prints register pairs by just printing the lower register // If the register contains a subregister, print it instead if (MRI.getNumSubRegIndices() > 0) { - unsigned RegLoNum = MRI.getSubReg(RegNum, AVR::sub_lo); - RegNum = (RegLoNum != AVR::NoRegister) ? RegLoNum : RegNum; + MCRegister RegLo = MRI.getSubReg(Reg, AVR::sub_lo); + Reg = (RegLo != AVR::NoRegister) ? 
RegLo : Reg; } - return getRegisterName(RegNum); + return getRegisterName(Reg); } void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h index 89d210bb22e8f..8ba24dc80d884 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h @@ -26,7 +26,7 @@ class AVRInstPrinter : public MCInstPrinter { const MCRegisterInfo &MRI) : MCInstPrinter(MAI, MII, MRI) {} - static const char *getPrettyRegisterName(unsigned RegNo, + static const char *getPrettyRegisterName(MCRegister Reg, MCRegisterInfo const &MRI); void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 7682394e83926..aa69d618d8a0a 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -118,7 +118,7 @@ unsigned AVRMCCodeEmitter::encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo, // The operand should be a pointer register. 
assert(MO.isReg()); - switch (MO.getReg()) { + switch (MO.getReg().id()) { case AVR::R27R26: return 0x03; // X: 0b11 case AVR::R29R28: @@ -144,7 +144,7 @@ unsigned AVRMCCodeEmitter::encodeMemri(const MCInst &MI, unsigned OpNo, uint8_t RegBit = 0; - switch (RegOp.getReg()) { + switch (RegOp.getReg().id()) { default: Ctx.reportError(MI.getLoc(), "Expected either Y or Z register"); return 0; diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 06b7743e0cd31..32ddf11ec3196 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -86,7 +86,7 @@ struct BPFOperand : public MCParsedAsmOperand { } Kind; struct RegOp { - unsigned RegNum; + MCRegister RegNum; }; struct ImmOp { @@ -206,10 +206,10 @@ struct BPFOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createReg(unsigned RegNo, SMLoc S, + static std::unique_ptr createReg(MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique(Register); - Op->Reg.RegNum = RegNo; + Op->Reg.RegNum = Reg; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -447,13 +447,13 @@ ParseStatus BPFAsmParser::parseRegister(OperandVector &Operands) { return ParseStatus::NoMatch; case AsmToken::Identifier: StringRef Name = getLexer().getTok().getIdentifier(); - unsigned RegNo = MatchRegisterName(Name); + MCRegister Reg = MatchRegisterName(Name); - if (RegNo == 0) + if (!Reg) return ParseStatus::NoMatch; getLexer().Lex(); - Operands.push_back(BPFOperand::createReg(RegNo, S, E)); + Operands.push_back(BPFOperand::createReg(Reg, S, E)); } return ParseStatus::Success; } @@ -487,12 +487,12 @@ ParseStatus BPFAsmParser::parseImmediate(OperandVector &Operands) { bool BPFAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // The first operand could be either register or actually an operator. 
- unsigned RegNo = MatchRegisterName(Name); + MCRegister Reg = MatchRegisterName(Name); - if (RegNo != 0) { + if (Reg) { SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() - 1); - Operands.push_back(BPFOperand::createReg(RegNo, NameLoc, E)); - } else if (BPFOperand::isValidIdAtStart (Name)) + Operands.push_back(BPFOperand::createReg(Reg, NameLoc, E)); + } else if (BPFOperand::isValidIdAtStart(Name)) Operands.push_back(BPFOperand::createToken(Name, NameLoc)); else return Error(NameLoc, "invalid register/token name"); diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index d923c96bc008e..d7dde11a1ecac 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -154,7 +154,7 @@ struct CSKYOperand : public MCParsedAsmOperand { } Kind; struct RegOp { - unsigned RegNum; + MCRegister RegNum; }; struct ImmOp { @@ -166,19 +166,19 @@ struct CSKYOperand : public MCParsedAsmOperand { }; struct RegSeqOp { - unsigned RegNumFrom; - unsigned RegNumTo; + MCRegister RegNumFrom; + MCRegister RegNumTo; }; struct RegListOp { - unsigned List1From = 0; - unsigned List1To = 0; - unsigned List2From = 0; - unsigned List2To = 0; - unsigned List3From = 0; - unsigned List3To = 0; - unsigned List4From = 0; - unsigned List4To = 0; + MCRegister List1From; + MCRegister List1To; + MCRegister List2From; + MCRegister List2To; + MCRegister List3From; + MCRegister List3To; + MCRegister List4From; + MCRegister List4To; }; SMLoc StartLoc, EndLoc; @@ -405,9 +405,9 @@ struct CSKYOperand : public MCParsedAsmOperand { return Reg.RegNum; } - std::pair getRegSeq() const { + std::pair getRegSeq() const { assert(Kind == RegisterSeq && "Invalid type access!"); - return std::pair(RegSeq.RegNumFrom, RegSeq.RegNumTo); + return {RegSeq.RegNumFrom, RegSeq.RegNumTo}; } RegListOp getRegList() const { @@ -478,7 +478,7 @@ struct CSKYOperand : public MCParsedAsmOperand { return Op; } - static 
std::unique_ptr createReg(unsigned RegNo, SMLoc S, + static std::unique_ptr createReg(MCRegister RegNo, SMLoc S, SMLoc E) { auto Op = std::make_unique(Register); Op->Reg.RegNum = RegNo; @@ -487,8 +487,8 @@ struct CSKYOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createRegSeq(unsigned RegNoFrom, - unsigned RegNoTo, SMLoc S) { + static std::unique_ptr + createRegSeq(MCRegister RegNoFrom, MCRegister RegNoTo, SMLoc S) { auto Op = std::make_unique(RegisterSeq); Op->RegSeq.RegNumFrom = RegNoFrom; Op->RegSeq.RegNumTo = RegNoTo; @@ -498,7 +498,7 @@ struct CSKYOperand : public MCParsedAsmOperand { } static std::unique_ptr - createRegList(SmallVector reglist, SMLoc S) { + createRegList(const SmallVector ®list, SMLoc S) { auto Op = std::make_unique(RegisterList); Op->RegList.List1From = 0; Op->RegList.List1To = 0; @@ -1445,9 +1445,7 @@ ParseStatus CSKYAsmParser::parseRegSeq(OperandVector &Operands) { ParseStatus CSKYAsmParser::parseRegList(OperandVector &Operands) { SMLoc S = getLoc(); - - SmallVector reglist; - + SmallVector reglist; while (true) { if (!parseRegister(Operands).isSuccess()) diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 62f188957cccf..e2157f1593e90 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -124,7 +124,7 @@ class HexagonAsmParser : public MCTargetAsmParser { bool parseDirectiveAttribute(SMLoc L); - bool RegisterMatchesArch(unsigned MatchNum) const; + bool RegisterMatchesArch(MCRegister MatchNum) const; bool matchBundleOptions(); bool handleNoncontigiousRegister(bool Contigious, SMLoc &Loc); @@ -145,10 +145,10 @@ class HexagonAsmParser : public MCTargetAsmParser { int processInstruction(MCInst &Inst, OperandVector const &Operands, SMLoc IDLoc); - unsigned matchRegister(StringRef Name); + MCRegister matchRegister(StringRef Name); -/// @name Auto-generated Match 
Functions -/// { + /// @name Auto-generated Match Functions + /// { #define GET_ASSEMBLER_HEADER #include "HexagonGenAsmMatcher.inc" @@ -205,7 +205,7 @@ struct HexagonOperand : public MCParsedAsmOperand { }; struct RegTy { - unsigned RegNum; + MCRegister RegNum; }; struct ImmTy { @@ -434,9 +434,9 @@ struct HexagonOperand : public MCParsedAsmOperand { } static std::unique_ptr - CreateReg(MCContext &Context, unsigned RegNum, SMLoc S, SMLoc E) { + CreateReg(MCContext &Context, MCRegister Reg, SMLoc S, SMLoc E) { HexagonOperand *Op = new HexagonOperand(Register, Context); - Op->Reg.RegNum = RegNum; + Op->Reg.RegNum = Reg; Op->StartLoc = S; Op->EndLoc = E; return std::unique_ptr(Op); @@ -867,7 +867,7 @@ bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) { } // validate register against architecture -bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const { +bool HexagonAsmParser::RegisterMatchesArch(MCRegister MatchNum) const { if (HexagonMCRegisterClasses[Hexagon::V62RegsRegClassID].contains(MatchNum)) if (!getSTI().hasFeature(Hexagon::ArchV62)) return false; @@ -929,7 +929,7 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) { MCAsmLexer &Lexer = getLexer(); if (!parseRegister(Register, Begin, End)) { if (!ErrorMissingParenthesis) - switch (Register) { + switch (Register.id()) { default: break; case Hexagon::P0: @@ -1054,8 +1054,8 @@ ParseStatus HexagonAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, llvm::erase_if(Collapsed, isSpace); StringRef FullString = Collapsed; std::pair DotSplit = FullString.split('.'); - unsigned DotReg = matchRegister(DotSplit.first.lower()); - if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + MCRegister DotReg = matchRegister(DotSplit.first.lower()); + if (DotReg && RegisterMatchesArch(DotReg)) { if (DotSplit.second.empty()) { Reg = DotReg; EndLoc = Lexer.getLoc(); @@ -1074,8 +1074,8 @@ ParseStatus HexagonAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, 
} } std::pair ColonSplit = StringRef(FullString).split(':'); - unsigned ColonReg = matchRegister(ColonSplit.first.lower()); - if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + MCRegister ColonReg = matchRegister(ColonSplit.first.lower()); + if (ColonReg && RegisterMatchesArch(DotReg)) { do { Lexer.UnLex(Lookahead.pop_back_val()); } while (!Lookahead.empty() && !Lexer.is(AsmToken::Colon)); @@ -1358,13 +1358,13 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, return std::make_pair(matchRegister(R1), matchRegister(R2)); }; - auto GetScalarRegs = [RI, GetRegPair](unsigned RegPair) { + auto GetScalarRegs = [RI, GetRegPair](MCRegister RegPair) { const unsigned Lower = RI->getEncodingValue(RegPair); const RegPairVals RegPair_ = std::make_pair(Lower + 1, Lower); return GetRegPair(RegPair_); }; - auto GetVecRegs = [GetRegPair](unsigned VecRegPair) { + auto GetVecRegs = [GetRegPair](MCRegister VecRegPair) { const RegPairVals RegPair = HexagonMCInstrInfo::GetVecRegPairIndices(VecRegPair); @@ -1461,7 +1461,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" case Hexagon::A2_tfrp: { MCOperand &MO = Inst.getOperand(1); - const std::pair RegPair = GetScalarRegs(MO.getReg()); + const std::pair RegPair = + GetScalarRegs(MO.getReg()); MO.setReg(RegPair.first); Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::A2_combinew); @@ -1471,7 +1472,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrpt: case Hexagon::A2_tfrpf: { MCOperand &MO = Inst.getOperand(2); - const std::pair RegPair = GetScalarRegs(MO.getReg()); + const std::pair RegPair = + GetScalarRegs(MO.getReg()); MO.setReg(RegPair.first); Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) @@ -1482,7 +1484,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrptnew: case Hexagon::A2_tfrpfnew: { 
MCOperand &MO = Inst.getOperand(2); - const std::pair RegPair = GetScalarRegs(MO.getReg()); + const std::pair RegPair = + GetScalarRegs(MO.getReg()); MO.setReg(RegPair.first); Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) @@ -1494,7 +1497,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)" case Hexagon::V6_vassignp: { MCOperand &MO = Inst.getOperand(1); - const std::pair RegPair = GetVecRegs(MO.getReg()); + const std::pair RegPair = GetVecRegs(MO.getReg()); MO.setReg(RegPair.first); Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::V6_vcombine); @@ -2051,8 +2054,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, return Match_Success; } -unsigned HexagonAsmParser::matchRegister(StringRef Name) { - if (unsigned Reg = MatchRegisterName(Name)) +MCRegister HexagonAsmParser::matchRegister(StringRef Name) { + if (MCRegister Reg = MatchRegisterName(Name)) return Reg; return MatchRegisterAltName(Name); } diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 44a5cd73c6e89..231004f6c1dc0 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -499,13 +499,14 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, bool SubregBit = (Register & 0x1) != 0; if (HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) { // If subreg bit is set we're selecting the second produced newvalue - unsigned Producer = SubregBit ? - HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg() : - HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg(); + MCRegister Producer = + SubregBit + ? 
HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg() + : HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg(); assert(Producer != Hexagon::NoRegister); MCO.setReg(Producer); } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { - unsigned Producer = + MCRegister Producer = HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); if (HexagonMCInstrInfo::IsVecRegPair(Producer)) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index ef4c23df54121..9b6bc5ade379d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -65,8 +65,8 @@ void HexagonMCChecker::init() { init(MCB); } -void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, - bool &isTrue) { +void HexagonMCChecker::initReg(MCInst const &MCI, MCRegister R, + MCRegister &PredReg, bool &isTrue) { if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && HexagonMCInstrInfo::isPredReg(RI, R)) { // Note an used predicate register. @@ -91,7 +91,7 @@ void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, void HexagonMCChecker::init(MCInst const &MCI) { const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MCI); - unsigned PredReg = Hexagon::NoRegister; + MCRegister PredReg; bool isTrue = false; // Get used registers. @@ -133,7 +133,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { // Figure out explicit register definitions. for (unsigned i = 0; i < MCID.getNumDefs(); ++i) { - unsigned R = MCI.getOperand(i).getReg(), S = Hexagon::NoRegister; + MCRegister R = MCI.getOperand(i).getReg(), S = MCRegister(); // USR has subregisters (while C8 does not for technical reasons), so // reset R to USR, since we know how to handle multiple defs of USR, // taking into account its subregisters. 
@@ -187,7 +187,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i) if (MCI.getOperand(i).isReg()) { - unsigned P = MCI.getOperand(i).getReg(); + MCRegister P = MCI.getOperand(i).getReg(); if (HexagonMCInstrInfo::isPredReg(RI, P)) NewPreds.insert(P); @@ -531,7 +531,7 @@ bool HexagonMCChecker::checkRegistersReadOnly() { for (unsigned j = 0; j < Defs; ++j) { MCOperand const &Operand = Inst.getOperand(j); assert(Operand.isReg() && "Def is not a register"); - unsigned Register = Operand.getReg(); + MCRegister Register = Operand.getReg(); if (ReadOnly.find(Register) != ReadOnly.end()) { reportError(Inst.getLoc(), "Cannot write to read-only register `" + Twine(RI.getName(Register)) + "'"); @@ -542,7 +542,7 @@ bool HexagonMCChecker::checkRegistersReadOnly() { return true; } -bool HexagonMCChecker::registerUsed(unsigned Register) { +bool HexagonMCChecker::registerUsed(MCRegister Register) { for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) for (unsigned j = HexagonMCInstrInfo::getDesc(MCII, I).getNumDefs(), n = I.getNumOperands(); @@ -556,7 +556,7 @@ bool HexagonMCChecker::registerUsed(unsigned Register) { std::tuple HexagonMCChecker::registerProducer( - unsigned Register, HexagonMCInstrInfo::PredicateInfo ConsumerPredicate) { + MCRegister Register, HexagonMCInstrInfo::PredicateInfo ConsumerPredicate) { std::tuple WrongSense; @@ -588,7 +588,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { if (HexagonMCInstrInfo::isCVINew(MCII, I) && HexagonMCInstrInfo::getDesc(MCII, I).mayLoad()) { - const unsigned RegDef = I.getOperand(0).getReg(); + const MCRegister RegDef = I.getOperand(0).getReg(); bool HasRegDefUse = false; for (MCRegAliasIterator Alias(RegDef, &RI, true); Alias.isValid(); @@ -819,7 +819,7 @@ bool HexagonMCChecker::checkHVXAccum() 
HexagonMCInstrInfo::isAccumulator(MCII, I) && I.getOperand(0).isReg(); if (!IsTarget) continue; - unsigned int R = I.getOperand(0).getReg(); + MCRegister R = I.getOperand(0).getReg(); TmpDefsIterator It = TmpDefs.find(R); if (It != TmpDefs.end()) { reportError("register `" + Twine(RI.getName(R)) + ".tmp" + diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index 42d91f559f51a..e9b87c5315fe4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -77,15 +77,15 @@ class HexagonMCChecker { void init(); void init(MCInst const &); - void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue); + void initReg(MCInst const &, MCRegister, MCRegister &PredReg, bool &isTrue); - bool registerUsed(unsigned Register); + bool registerUsed(MCRegister Register); /// \return a tuple of: pointer to the producer instruction or nullptr if /// none was found, the operand index, and the PredicateInfo for the /// producer. std::tuple - registerProducer(unsigned Register, + registerProducer(MCRegister Register, HexagonMCInstrInfo::PredicateInfo Predicated); // Checks performed. 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 96ec81cd86abe..b744519b9725c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -388,8 +388,8 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, } } -static bool RegisterMatches(unsigned Consumer, unsigned Producer, - unsigned Producer2) { +static bool RegisterMatches(MCRegister Consumer, MCRegister Producer, + MCRegister Producer2) { return (Consumer == Producer) || (Consumer == Producer2) || HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(Producer, Consumer); @@ -721,9 +721,9 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, // Calculate the new value distance to the associated producer unsigned SOffset = 0; unsigned VOffset = 0; - unsigned UseReg = MO.getReg(); - unsigned DefReg1 = Hexagon::NoRegister; - unsigned DefReg2 = Hexagon::NoRegister; + MCRegister UseReg = MO.getReg(); + MCRegister DefReg1; + MCRegister DefReg2; auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle); const MCOperand *I = Instrs.begin() + State.Index - 1; @@ -734,8 +734,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, if (HexagonMCInstrInfo::isImmext(Inst)) continue; - DefReg1 = Hexagon::NoRegister; - DefReg2 = Hexagon::NoRegister; + DefReg1 = MCRegister(); + DefReg2 = MCRegister(); ++SOffset; if (HexagonMCInstrInfo::isVector(MCII, Inst)) { // Vector instructions don't count scalars. 
@@ -770,7 +770,7 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, assert(!MO.isImm()); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); switch (HexagonMCInstrInfo::getDesc(MCII, MI) .operands()[OperandNumber] .RegClass) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 3deef95df3245..c1a9a01aaf7ab 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -78,7 +78,7 @@ static const unsigned cmpgtn1BitOpcode[8] = { // enum HexagonII::CompoundGroup static unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { - unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + MCRegister DstReg, SrcReg, Src1Reg, Src2Reg; switch (MI.getOpcode()) { default: @@ -174,7 +174,7 @@ static unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { /// getCompoundOp - Return the index from 0-7 into the above opcode lists. 
static unsigned getCompoundOp(MCInst const &HMCI) { const MCOperand &Predicate = HMCI.getOperand(0); - unsigned PredReg = Predicate.getReg(); + MCRegister PredReg = Predicate.getReg(); assert((PredReg == Hexagon::P0) || (PredReg == Hexagon::P1) || (PredReg == Hexagon::P2) || (PredReg == Hexagon::P3)); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 36d6c8c9f84b8..f3bdaf7921efe 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -187,7 +187,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) { } unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { - unsigned DstReg, PredReg, SrcReg, Src1Reg, Src2Reg; + MCRegister DstReg, PredReg, SrcReg, Src1Reg, Src2Reg; switch (MCI.getOpcode()) { default: @@ -533,7 +533,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { } bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { - unsigned DstReg, SrcReg; + MCRegister DstReg, SrcReg; switch (potentialDuplex.getOpcode()) { case Hexagon::A2_addi: // testing for case of: Rx = add(Rx,#s7) @@ -657,7 +657,7 @@ bool HexagonMCInstrInfo::isDuplexPair(MCInst const &MIa, MCInst const &MIb) { inline static void addOps(MCInst &subInstPtr, MCInst const &Inst, unsigned opNum) { if (Inst.getOperand(opNum).isReg()) { - switch (Inst.getOperand(opNum).getReg()) { + switch (Inst.getOperand(opNum).getReg().id()) { default: llvm_unreachable("Not Duplexable Register"); break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index a6de2ab9c75a2..a2ac8b70b9db4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -257,10 +257,10 @@ MCInstrDesc const 
&HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII, return MCII.get(MCI.getOpcode()); } -unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) { +unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(MCRegister Reg) { using namespace Hexagon; - switch (Reg) { + switch (Reg.id()) { default: llvm_unreachable("unknown duplex register"); // Rs Rss @@ -616,7 +616,7 @@ bool HexagonMCInstrInfo::isCVINew(MCInstrInfo const &MCII, MCInst const &MCI) { return ((F >> HexagonII::CVINewPos) & HexagonII::CVINewMask); } -bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) { +bool HexagonMCInstrInfo::isDblRegForSubInst(MCRegister Reg) { return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) || (Reg >= Hexagon::D8 && Reg <= Hexagon::D11)); } @@ -657,11 +657,11 @@ bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) { return (Flags & innerLoopMask) != 0; } -bool HexagonMCInstrInfo::isIntReg(unsigned Reg) { +bool HexagonMCInstrInfo::isIntReg(MCRegister Reg) { return (Reg >= Hexagon::R0 && Reg <= Hexagon::R31); } -bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) { +bool HexagonMCInstrInfo::isIntRegForSubInst(MCRegister Reg) { return ((Reg >= Hexagon::R0 && Reg <= Hexagon::R7) || (Reg >= Hexagon::R16 && Reg <= Hexagon::R23)); } @@ -691,21 +691,21 @@ bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) { return (Flags & outerLoopMask) != 0; } -bool HexagonMCInstrInfo::IsVecRegPair(unsigned VecReg) { +bool HexagonMCInstrInfo::IsVecRegPair(MCRegister VecReg) { return (VecReg >= Hexagon::W0 && VecReg <= Hexagon::W15) || (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); } -bool HexagonMCInstrInfo::IsReverseVecRegPair(unsigned VecReg) { +bool HexagonMCInstrInfo::IsReverseVecRegPair(MCRegister VecReg) { return (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); } -bool HexagonMCInstrInfo::IsVecRegSingle(unsigned VecReg) { +bool HexagonMCInstrInfo::IsVecRegSingle(MCRegister VecReg) { return (VecReg >= Hexagon::V0 && VecReg <= Hexagon::V31); } std::pair 
-HexagonMCInstrInfo::GetVecRegPairIndices(unsigned VecRegPair) { +HexagonMCInstrInfo::GetVecRegPairIndices(MCRegister VecRegPair) { assert(IsVecRegPair(VecRegPair) && "VecRegPair must be a vector register pair"); @@ -717,8 +717,8 @@ HexagonMCInstrInfo::GetVecRegPairIndices(unsigned VecRegPair) { : std::make_pair(PairIndex + 1, PairIndex); } -bool HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(unsigned Producer, - unsigned Consumer) { +bool HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(MCRegister Producer, + MCRegister Consumer) { if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) { const unsigned ProdPairIndex = IsReverseVecRegPair(Producer) ? Producer - Hexagon::WR0 @@ -760,7 +760,7 @@ bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII, !((F >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask)); } -bool HexagonMCInstrInfo::isPredReg(MCRegisterInfo const &MRI, unsigned Reg) { +bool HexagonMCInstrInfo::isPredReg(MCRegisterInfo const &MRI, MCRegister Reg) { auto &PredRegClass = MRI.getRegClass(Hexagon::PredRegsRegClassID); return PredRegClass.contains(Reg); } @@ -1031,9 +1031,9 @@ void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) { Operand.setImm(Operand.getImm() | outerLoopMask); } -unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer, - unsigned Producer, - unsigned Producer2) { +unsigned HexagonMCInstrInfo::SubregisterBit(MCRegister Consumer, + MCRegister Producer, + MCRegister Producer2) { // If we're a single vector consumer of a double producer, set subreg bit // based on if we're accessing the lower or upper register component if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index ccd2482c3fd76..df942b63ee282 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -132,7 +132,7 @@ 
unsigned getDuplexCandidateGroup(MCInst const &MI); SmallVector getDuplexPossibilties(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst const &MCB); -unsigned getDuplexRegisterNumbering(unsigned Reg); +unsigned getDuplexRegisterNumbering(MCRegister Reg); MCExpr const &getExpr(MCExpr const &Expr); @@ -231,7 +231,7 @@ bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI); bool isCVINew(MCInstrInfo const &MCII, MCInst const &MCI); // Is this double register suitable for use in a duplex subinst -bool isDblRegForSubInst(unsigned Reg); +bool isDblRegForSubInst(MCRegister Reg); // Is this a duplex instruction bool isDuplex(MCInstrInfo const &MCII, MCInst const &MCI); @@ -260,10 +260,10 @@ bool isImmext(MCInst const &MCI); bool isInnerLoop(MCInst const &MCI); // Is this an integer register -bool isIntReg(unsigned Reg); +bool isIntReg(MCRegister Reg); // Is this register suitable for use in a duplex subinst -bool isIntRegForSubInst(unsigned Reg); +bool isIntRegForSubInst(MCRegister Reg); bool isMemReorderDisabled(MCInst const &MCI); // Return whether the insn is a new-value consumer. @@ -289,7 +289,7 @@ bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI); bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI); // Return true if this is a scalar predicate register. -bool isPredReg(MCRegisterInfo const &MRI, unsigned Reg); +bool isPredReg(MCRegisterInfo const &MRI, MCRegister Reg); // Returns true if the Ith operand is a predicate register. 
bool isPredRegister(MCInstrInfo const &MCII, MCInst const &Inst, unsigned I); @@ -333,11 +333,11 @@ unsigned slotsConsumed(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, void padEndloop(MCInst &MCI, MCContext &Context); class PredicateInfo { public: - PredicateInfo() : Register(0), Operand(0), PredicatedTrue(false) {} - PredicateInfo(unsigned Register, unsigned Operand, bool PredicatedTrue) + PredicateInfo() : Operand(0), PredicatedTrue(false) {} + PredicateInfo(MCRegister Register, unsigned Operand, bool PredicatedTrue) : Register(Register), Operand(Operand), PredicatedTrue(PredicatedTrue) {} bool isPredicated() const; - unsigned Register; + MCRegister Register; unsigned Operand; bool PredicatedTrue; }; @@ -360,18 +360,18 @@ void setOuterLoop(MCInst &MCI); // Would duplexing this instruction create a requirement to extend bool subInstWouldBeExtended(MCInst const &potentialDuplex); -unsigned SubregisterBit(unsigned Consumer, unsigned Producer, - unsigned Producer2); +unsigned SubregisterBit(MCRegister Consumer, MCRegister Producer, + MCRegister Producer2); -bool IsVecRegSingle(unsigned VecReg); -bool IsVecRegPair(unsigned VecReg); -bool IsReverseVecRegPair(unsigned VecReg); -bool IsSingleConsumerRefPairProducer(unsigned Producer, unsigned Consumer); +bool IsVecRegSingle(MCRegister VecReg); +bool IsVecRegPair(MCRegister VecReg); +bool IsReverseVecRegPair(MCRegister VecReg); +bool IsSingleConsumerRefPairProducer(MCRegister Producer, MCRegister Consumer); /// Returns an ordered pair of the constituent register ordinals for /// each of the elements of \a VecRegPair. For example, Hexagon::W0 ("v0:1") /// returns { 0, 1 } and Hexagon::W1 ("v3:2") returns { 3, 2 }. 
-std::pair GetVecRegPairIndices(unsigned VecRegPair); +std::pair GetVecRegPairIndices(MCRegister VecRegPair); // Attempt to find and replace compound pairs void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 280f1f3ddbb69..7ede5e3ed4093 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -118,7 +118,7 @@ struct LanaiOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister RegNum; }; struct ImmOp { @@ -126,8 +126,8 @@ struct LanaiOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned BaseReg; - unsigned OffsetReg; + MCRegister BaseReg; + MCRegister OffsetReg; unsigned AluOp; const MCExpr *Offset; }; @@ -166,12 +166,12 @@ struct LanaiOperand : public MCParsedAsmOperand { return StringRef(Tok.Data, Tok.Length); } - unsigned getMemBaseReg() const { + MCRegister getMemBaseReg() const { assert(isMem() && "Invalid type access!"); return Mem.BaseReg; } - unsigned getMemOffsetReg() const { + MCRegister getMemOffsetReg() const { assert(isMem() && "Invalid type access!"); return Mem.OffsetReg; } @@ -439,7 +439,7 @@ struct LanaiOperand : public MCParsedAsmOperand { void addMemRegRegOperands(MCInst &Inst, unsigned N) const { assert(N == 3 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBaseReg())); - assert(getMemOffsetReg() != 0 && "Invalid offset"); + assert(getMemOffsetReg() && "Invalid offset"); Inst.addOperand(MCOperand::createReg(getMemOffsetReg())); Inst.addOperand(MCOperand::createImm(getMemOp())); } @@ -589,10 +589,10 @@ struct LanaiOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createReg(unsigned RegNum, SMLoc Start, + static std::unique_ptr createReg(MCRegister Reg, SMLoc Start, SMLoc End) { auto Op = std::make_unique(REGISTER); - Op->Reg.RegNum = RegNum; + 
Op->Reg.RegNum = Reg; Op->StartLoc = Start; Op->EndLoc = End; return Op; @@ -611,7 +611,7 @@ struct LanaiOperand : public MCParsedAsmOperand { MorphToMemImm(std::unique_ptr Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = MEMORY_IMM; - Op->Mem.BaseReg = 0; + Op->Mem.BaseReg = MCRegister(); Op->Mem.AluOp = LPAC::ADD; Op->Mem.OffsetReg = 0; Op->Mem.Offset = Imm; @@ -619,9 +619,9 @@ struct LanaiOperand : public MCParsedAsmOperand { } static std::unique_ptr - MorphToMemRegReg(unsigned BaseReg, std::unique_ptr Op, + MorphToMemRegReg(MCRegister BaseReg, std::unique_ptr Op, unsigned AluOp) { - unsigned OffsetReg = Op->getReg(); + MCRegister OffsetReg = Op->getReg(); Op->Kind = MEMORY_REG_REG; Op->Mem.BaseReg = BaseReg; Op->Mem.AluOp = AluOp; @@ -631,7 +631,7 @@ struct LanaiOperand : public MCParsedAsmOperand { } static std::unique_ptr - MorphToMemRegImm(unsigned BaseReg, std::unique_ptr Op, + MorphToMemRegImm(MCRegister BaseReg, std::unique_ptr Op, unsigned AluOp) { const MCExpr *Imm = Op->getImm(); Op->Kind = MEMORY_REG_IMM; @@ -691,21 +691,21 @@ LanaiAsmParser::parseRegister(bool RestoreOnFailure) { SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); std::optional PercentTok; - unsigned RegNum; + MCRegister Reg; // Eat the '%'. 
if (Lexer.getKind() == AsmToken::Percent) { PercentTok = Parser.getTok(); Parser.Lex(); } if (Lexer.getKind() == AsmToken::Identifier) { - RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); - if (RegNum == 0) { + Reg = MatchRegisterName(Lexer.getTok().getIdentifier()); + if (!Reg) { if (PercentTok && RestoreOnFailure) Lexer.UnLex(*PercentTok); return nullptr; } Parser.Lex(); // Eat identifier token - return LanaiOperand::createReg(RegNum, Start, End); + return LanaiOperand::createReg(Reg, Start, End); } if (PercentTok && RestoreOnFailure) Lexer.UnLex(*PercentTok); @@ -900,7 +900,7 @@ ParseStatus LanaiAsmParser::parseMemoryOperand(OperandVector &Operands) { // Use 0 if no offset given int OffsetValue = 0; - unsigned BaseReg = 0; + MCRegister BaseReg; unsigned AluOp = LPAC::ADD; bool PostOp = false, PreOp = false; diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h index 1bc84014e7367..3c0b34dc8ecf4 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h @@ -37,8 +37,8 @@ enum TOF { }; } // namespace LanaiII -static inline unsigned getLanaiRegisterNumbering(unsigned Reg) { - switch (Reg) { +static inline unsigned getLanaiRegisterNumbering(MCRegister Reg) { + switch (Reg.id()) { case Lanai::R0: return 0; case Lanai::R1: diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 57c42024b4d2b..e3abb7eecc32b 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -559,10 +559,10 @@ class LoongArchOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createReg(unsigned RegNo, SMLoc S, + static std::unique_ptr createReg(MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique(KindTy::Register); - Op->Reg.RegNum = RegNo; + Op->Reg.RegNum = Reg; 
Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -1424,9 +1424,9 @@ unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) { switch (Opc) { default: if (Opc >= LoongArch::AMADD_D && Opc <= LoongArch::AMXOR_W) { - unsigned Rd = Inst.getOperand(0).getReg(); - unsigned Rk = Inst.getOperand(1).getReg(); - unsigned Rj = Inst.getOperand(2).getReg(); + MCRegister Rd = Inst.getOperand(0).getReg(); + MCRegister Rk = Inst.getOperand(1).getReg(); + MCRegister Rj = Inst.getOperand(2).getReg(); if ((Rd == Rk || Rd == Rj) && Rd != LoongArch::R0) return Match_RequiresAMORdDifferRkRj; } @@ -1435,7 +1435,7 @@ unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) { case LoongArch::PseudoLA_TLS_DESC_ABS_LARGE: case LoongArch::PseudoLA_TLS_DESC_PC: case LoongArch::PseudoLA_TLS_DESC_PC_LARGE: { - unsigned Rd = Inst.getOperand(0).getReg(); + MCRegister Rd = Inst.getOperand(0).getReg(); if (Rd != LoongArch::R4) return Match_RequiresLAORdR4; break; @@ -1445,15 +1445,15 @@ unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) { case LoongArch::PseudoLA_TLS_IE_LARGE: case LoongArch::PseudoLA_TLS_LD_LARGE: case LoongArch::PseudoLA_TLS_GD_LARGE: { - unsigned Rd = Inst.getOperand(0).getReg(); - unsigned Rj = Inst.getOperand(1).getReg(); + MCRegister Rd = Inst.getOperand(0).getReg(); + MCRegister Rj = Inst.getOperand(1).getReg(); if (Rd == Rj) return Match_RequiresLAORdDifferRj; break; } case LoongArch::CSRXCHG: case LoongArch::GCSRXCHG: { - unsigned Rj = Inst.getOperand(2).getReg(); + MCRegister Rj = Inst.getOperand(2).getReg(); if (Rj == LoongArch::R0 || Rj == LoongArch::R1) return Match_RequiresOpnd2NotR0R1; return Match_Success; diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 34ae80669f2c3..7a8835c3af60f 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -101,12 +101,12 @@ class MSP430Operand : 
public MCParsedAsmOperand { } Kind; struct Memory { - unsigned Reg; + MCRegister Reg; const MCExpr *Offset; }; union { const MCExpr *Imm; - unsigned Reg; + MCRegister Reg; StringRef Tok; Memory Mem; }; @@ -116,11 +116,11 @@ class MSP430Operand : public MCParsedAsmOperand { public: MSP430Operand(StringRef Tok, SMLoc const &S) : Kind(k_Tok), Tok(Tok), Start(S), End(S) {} - MSP430Operand(KindTy Kind, unsigned Reg, SMLoc const &S, SMLoc const &E) + MSP430Operand(KindTy Kind, MCRegister Reg, SMLoc const &S, SMLoc const &E) : Kind(Kind), Reg(Reg), Start(S), End(E) {} MSP430Operand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E) : Kind(k_Imm), Imm(Imm), Start(S), End(E) {} - MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, + MSP430Operand(MCRegister Reg, MCExpr const *Expr, SMLoc const &S, SMLoc const &E) : Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {} @@ -188,7 +188,7 @@ class MSP430Operand : public MCParsedAsmOperand { return Reg; } - void setReg(unsigned RegNo) { + void setReg(MCRegister RegNo) { assert(Kind == k_Reg && "Invalid access!"); Reg = RegNo; } @@ -197,9 +197,9 @@ class MSP430Operand : public MCParsedAsmOperand { return std::make_unique(Str, S); } - static std::unique_ptr CreateReg(unsigned RegNum, SMLoc S, + static std::unique_ptr CreateReg(MCRegister Reg, SMLoc S, SMLoc E) { - return std::make_unique(k_Reg, RegNum, S, E); + return std::make_unique(k_Reg, Reg, S, E); } static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, @@ -207,20 +207,19 @@ class MSP430Operand : public MCParsedAsmOperand { return std::make_unique(Val, S, E); } - static std::unique_ptr CreateMem(unsigned RegNum, - const MCExpr *Val, - SMLoc S, SMLoc E) { - return std::make_unique(RegNum, Val, S, E); + static std::unique_ptr + CreateMem(MCRegister Reg, const MCExpr *Val, SMLoc S, SMLoc E) { + return std::make_unique(Reg, Val, S, E); } - static std::unique_ptr CreateIndReg(unsigned RegNum, SMLoc S, - SMLoc E) { - return std::make_unique(k_IndReg, RegNum, S, E); 
+ static std::unique_ptr CreateIndReg(MCRegister Reg, SMLoc S, + SMLoc E) { + return std::make_unique(k_IndReg, Reg, S, E); } - static std::unique_ptr CreatePostIndReg(unsigned RegNum, SMLoc S, - SMLoc E) { - return std::make_unique(k_PostIndReg, RegNum, S, E); + static std::unique_ptr CreatePostIndReg(MCRegister Reg, + SMLoc S, SMLoc E) { + return std::make_unique(k_PostIndReg, Reg, S, E); } SMLoc getStartLoc() const override { return Start; } @@ -545,8 +544,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmParser() { #define GET_MATCHER_IMPLEMENTATION #include "MSP430GenAsmMatcher.inc" -static unsigned convertGR16ToGR8(unsigned Reg) { - switch (Reg) { +static MCRegister convertGR16ToGR8(MCRegister Reg) { + switch (Reg.id()) { default: llvm_unreachable("Unknown GR16 register"); case MSP430::PC: return MSP430::PCB; @@ -575,7 +574,7 @@ unsigned MSP430AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, if (!Op.isReg()) return Match_InvalidOperand; - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); bool isGR16 = MSP430MCRegisterClasses[MSP430::GR16RegClassID].contains(Reg); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 56c96ea943b89..7f942de74bdcc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1109,11 +1109,21 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *CastN = cast(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); unsigned DstAddrSpace = CastN->getDestAddressSpace(); + SDLoc DL(N); assert(SrcAddrSpace != DstAddrSpace && "addrspacecast must be between different address spaces"); if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { // Specific to generic + + if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + SDNode *Cvt = 
CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64, + Src, CvtNone); + Src = SDValue(Cvt, 0); + } + unsigned Opc; switch (SrcAddrSpace) { default: report_fatal_error("Bad address space in addrspacecast"); @@ -1121,26 +1131,16 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_shared_6432 - : NVPTX::cvta_shared_64) - : NVPTX::cvta_shared; + Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_const_6432 - : NVPTX::cvta_const_64) - : NVPTX::cvta_const; + Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_local_6432 - : NVPTX::cvta_local_64) - : NVPTX::cvta_local; + Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src)); return; } else { // Generic to specific @@ -1153,30 +1153,28 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_shared_3264 - : NVPTX::cvta_to_shared_64) - : NVPTX::cvta_to_shared; + Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_const_3264 - : NVPTX::cvta_to_const_64) - : NVPTX::cvta_to_const; + Opc = TM.is64Bit() ? 
NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_local_3264 - : NVPTX::cvta_to_local_64) - : NVPTX::cvta_to_local; + Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local; break; case ADDRESS_SPACE_PARAM: - Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 - : NVPTX::nvvm_ptr_gen_to_param; + Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + + SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src); + if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32, + SDValue(CVTA, 0), CvtNone); + } + + ReplaceNode(N, CVTA); return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 0d9dd1b8ee70a..b82826089d3fe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -174,10 +174,6 @@ def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" "&& Subtarget->getPTXVersion() >= 64)">; -def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">; -def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">; -def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">; - def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 176d28c991207..f5ac3c4e96436 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2543,59 +2543,45 @@ defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; -multiclass NG_TO_G { +multiclass NG_TO_G { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvt.u64.u32 \t%tmp, $src;\n\t" - #" cvta." # Str # ".u64 \t$result, %tmp; }}", - [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>, - Requires<[ShortPtr]>; + "cvta." # Str # ".u64 \t$result, $src;", []>; } -multiclass G_TO_NG { +multiclass G_TO_NG { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta.to." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvta.to." 
# Str # ".u64 \t%tmp, $src;\n\t" - #" cvt.u32.u64 \t$result, %tmp; }}", - [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>, - Requires<[ShortPtr]>; -} - -defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>; -defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>; -defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>; -defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>; -defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>; - -defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>; -defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>; -defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global, False>; -defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant, useShortPtrConst>; + "cvta.to." # Str # ".u64 \t$result, $src;", []>; +} + +defm cvta_local : NG_TO_G<"local">; +defm cvta_shared : NG_TO_G<"shared">; +defm cvta_global : NG_TO_G<"global">; +defm cvta_const : NG_TO_G<"const">; + +defm cvta_to_local : G_TO_NG<"local">; +defm cvta_to_shared : G_TO_NG<"shared">; +defm cvta_to_global : G_TO_NG<"global">; +defm cvta_to_const : G_TO_NG<"const">; + +// nvvm.ptr.param.to.gen +defm cvta_param : NG_TO_G<"param">; + +def : Pat<(int_nvvm_ptr_param_to_gen Int32Regs:$src), + (cvta_param Int32Regs:$src)>; + +def : Pat<(int_nvvm_ptr_param_to_gen Int64Regs:$src), + (cvta_param_64 Int64Regs:$src)>; // nvvm.ptr.gen.to.param -def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result), - (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, - (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>; -def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), - (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, - (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>; +def : Pat<(int_nvvm_ptr_gen_to_param Int32Regs:$src), + (IMOV32rr 
Int32Regs:$src)>; +def : Pat<(int_nvvm_ptr_gen_to_param Int64Regs:$src), + (IMOV64rr Int64Regs:$src)>; // nvvm.move intrinsicc def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), @@ -2638,24 +2624,6 @@ def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s), [(set Int64Regs:$r, (int_nvvm_move_ptr texternalsym:$s))]>;*/ - -// MoveParam %r1, param -// ptr_local_to_gen %r2, %r1 -// ptr_gen_to_local %r3, %r2 -// -> -// mov %r1, param - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym -// instructions are not currently defined. However, we can use the ptr -// variants and the asm printer will do the right thing. -def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr64 texternalsym:$src)>; -def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr32 texternalsym:$src)>; - def texsurf_handles : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), "mov.u64 \t$result, $src;", []>; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bd796efd836c7..e77f8783f1727 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19640,8 +19640,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, DAG.getIntPtrConstant(HiVA.getLocMemOffset(), DL)); // Emit the store. - MemOpChains.push_back( - DAG.getStore(Chain, DL, Hi, Address, MachinePointerInfo())); + MemOpChains.push_back(DAG.getStore( + Chain, DL, Hi, Address, + MachinePointerInfo::getStack(MF, HiVA.getLocMemOffset()))); } else { // Second half of f64 is passed in another GPR. Register RegHigh = HiVA.getLocReg(); @@ -19723,7 +19724,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Emit the store. 
MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo())); + DAG.getStore(Chain, DL, ArgValue, Address, + MachinePointerInfo::getStack(MF, VA.getLocMemOffset()))); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f0295d289ed86..529944044f02d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -779,7 +779,7 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( if (RISCV::getRVVMCOpcode(MI.getOpcode()) == RISCV::VMV_X_S) { unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); - if (STI.getXLen() < (1 << Log2SEW)) + if (STI.getXLen() < (1U << Log2SEW)) return nullptr; switch (Log2SEW) { case 3: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 121e68bbd975a..c042782389f18 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1182,10 +1182,10 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // We do not use vsext/vzext to extend from mask vector. 
// Instead we use the following instructions to extend from mask vector: // vmv.v.i v8, 0 - // vmerge.vim v8, v8, -1, v0 - return DstLT.first * - getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM}, - DstLT.second, CostKind) + + // vmerge.vim v8, v8, -1, v0 (repeated per split) + return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) + + DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM, + DstLT.second, CostKind) + DstLT.first - 1; } break; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h index 2fbb4381da263..24047f31fab29 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h @@ -24,6 +24,7 @@ namespace llvm { class SPIRVTTIImpl : public BasicTTIImplBase { using BaseT = BasicTTIImplBase; + using TTI = TargetTransformInfo; friend BaseT; @@ -37,6 +38,16 @@ class SPIRVTTIImpl : public BasicTTIImplBase { explicit SPIRVTTIImpl(const SPIRVTargetMachine *TM, const Function &F) : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + // SPIR-V natively supports OpBitcount, per 3.53.14 in the spec, as such it + // is reasonable to assume the Op is fast / preferable to the expanded loop. + // Furthermore, this prevents information being lost if transforms are + // applied to SPIR-V before lowering to a concrete target. + if (!isPowerOf2_32(TyWidth) || TyWidth > 64) + return TTI::PSK_Software; // Arbitrary bit-width INT is not core SPIR-V. 
+ return TTI::PSK_FastHardware; + } }; } // namespace llvm diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index fec2d3a35ae6d..50a96368bbdca 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -48,6 +48,10 @@ static std::string computeDataLayout(const Triple &T, bool is64Bit) { // Alignments for 64 bit integers. Ret += "-i64:64"; + // Alignments for 128 bit integers. + // This is not specified in the ABI document but is the de facto standard. + Ret += "-i128:128"; + // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. // On SparcV9 registers can hold 64 or 32 bits, on others only 32. if (is64Bit) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index ba105c12bc4e9..43483ccada366 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1925,11 +1925,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, IsTailCall = false; // Integer args <=32 bits should have an extension attribute. - bool IsInternal = false; - if (auto *G = dyn_cast(Callee)) - if (const Function *Fn = dyn_cast(G->getGlobal())) - IsInternal = isFullyInternal(Fn); - verifyNarrowIntegerArgs(Outs, IsInternal); + verifyNarrowIntegerArgs_Call(Outs, &MF.getFunction(), Callee); // Analyze the operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -2192,7 +2188,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); // Integer args <=32 bits should have an extension attribute. - verifyNarrowIntegerArgs(Outs, isFullyInternal(&MF.getFunction())); + verifyNarrowIntegerArgs_Ret(Outs, &MF.getFunction()); // Assign locations to each returned value. 
SmallVector RetLocs; @@ -9835,23 +9831,74 @@ bool SystemZTargetLowering::isFullyInternal(const Function *Fn) const { return true; } -// Verify that narrow integer arguments are extended as required by the ABI. +static void printFunctionArgExts(const Function *F, raw_fd_ostream &OS) { + FunctionType *FT = F->getFunctionType(); + const AttributeList &Attrs = F->getAttributes(); + if (Attrs.hasRetAttrs()) + OS << Attrs.getAsString(AttributeList::ReturnIndex) << " "; + OS << *F->getReturnType() << " @" << F->getName() << "("; + for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) { + if (I) + OS << ", "; + OS << *FT->getParamType(I); + AttributeSet ArgAttrs = Attrs.getParamAttrs(I); + for (auto A : {Attribute::SExt, Attribute::ZExt, Attribute::NoExt}) + if (ArgAttrs.hasAttribute(A)) + OS << " " << Attribute::getNameFromAttrKind(A); + } + OS << ")\n"; +} + void SystemZTargetLowering:: +verifyNarrowIntegerArgs_Call(const SmallVectorImpl &Outs, + const Function *F, SDValue Callee) const { + bool IsInternal = false; + const Function *CalleeFn = nullptr; + if (auto *G = dyn_cast(Callee)) + if (CalleeFn = dyn_cast(G->getGlobal())) + IsInternal = isFullyInternal(CalleeFn); + if (!verifyNarrowIntegerArgs(Outs, IsInternal)) { + errs() << "ERROR: Missing extension attribute of passed " + << "value in call to function:\n" << "Callee: "; + if (CalleeFn != nullptr) + printFunctionArgExts(CalleeFn, errs()); + else + errs() << "-"; + errs() << "Caller: "; + printFunctionArgExts(F, errs()); + llvm_unreachable(""); + } +} + +void SystemZTargetLowering:: +verifyNarrowIntegerArgs_Ret(const SmallVectorImpl &Outs, + const Function *F) const { + if (!verifyNarrowIntegerArgs(Outs, isFullyInternal(F))) { + errs() << "ERROR: Missing extension attribute of returned " + << "value from function:\n"; + printFunctionArgExts(F, errs()); + llvm_unreachable(""); + } +} + +// Verify that narrow integer arguments are extended as required by the ABI. +// Return false if an error is found. 
+bool SystemZTargetLowering:: verifyNarrowIntegerArgs(const SmallVectorImpl &Outs, bool IsInternal) const { if (IsInternal || !Subtarget.isTargetELF()) - return; + return true; // Temporarily only do the check when explicitly requested, until it can be // enabled by default. if (!EnableIntArgExtCheck) - return; + return true; if (EnableIntArgExtCheck.getNumOccurrences()) { if (!EnableIntArgExtCheck) - return; + return true; } else if (!getTargetMachine().Options.VerifyArgABICompliance) - return; + return true; for (unsigned i = 0; i < Outs.size(); ++i) { MVT VT = Outs[i].VT; @@ -9859,10 +9906,11 @@ verifyNarrowIntegerArgs(const SmallVectorImpl &Outs, if (VT.isInteger()) { assert((VT == MVT::i32 || VT.getSizeInBits() >= 64) && "Unexpected integer argument VT."); - assert((VT != MVT::i32 || - (Flags.isSExt() || Flags.isZExt() || Flags.isNoExt())) && - "Narrow integer argument must have a valid extension type."); - (void)Flags; + if (VT == MVT::i32 && + !Flags.isSExt() && !Flags.isZExt() && !Flags.isNoExt()) + return false; } } + + return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 8c528897182d1..2b065245c16f2 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -806,7 +806,11 @@ class SystemZTargetLowering : public TargetLowering { const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; bool isFullyInternal(const Function *Fn) const; - void verifyNarrowIntegerArgs(const SmallVectorImpl &Outs, + void verifyNarrowIntegerArgs_Call(const SmallVectorImpl &Outs, + const Function *F, SDValue Callee) const; + void verifyNarrowIntegerArgs_Ret(const SmallVectorImpl &Outs, + const Function *F) const; + bool verifyNarrowIntegerArgs(const SmallVectorImpl &Outs, bool IsInternal) const; }; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index c6518a8e4363e..4bf660b5e234a 100644 --- 
a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -902,6 +902,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { uint64_t Disp = (int32_t)AM.Disp; unsigned IndexReg = AM.IndexReg; unsigned Scale = AM.Scale; + MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT(); + gep_type_iterator GTI = gep_type_begin(U); // Iterate through the indices, folding what we can. Constants can be // folded, and one dynamic index can be handled, if the scale is supported. @@ -937,7 +939,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { (S == 1 || S == 2 || S == 4 || S == 8)) { // Scaled-index addressing. Scale = S; - IndexReg = getRegForGEPIndex(Op); + IndexReg = getRegForGEPIndex(PtrVT, Op); if (IndexReg == 0) return false; break; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b9124658028da..8561658379f7e 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -124,12 +124,14 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, !Subtarget.hasX87()) return MVT::i32; - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) - return getRegisterTypeForCallingConv(Context, CC, - VT.changeVectorElementType(MVT::f16)); + if (isTypeLegal(MVT::f16)) { + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + return getRegisterTypeForCallingConv( + Context, CC, VT.changeVectorElementType(MVT::f16)); - if (VT == MVT::bf16) - return MVT::f16; + if (VT == MVT::bf16) + return MVT::f16; + } return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -162,7 +164,8 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, return 3; } - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 && + isTypeLegal(MVT::f16)) return 
getNumRegistersForCallingConv(Context, CC, VT.changeVectorElementType(MVT::f16)); @@ -194,7 +197,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( } // Split vNbf16 vectors according to vNf16. - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 && + isTypeLegal(MVT::f16)) VT = VT.changeVectorElementType(MVT::f16); return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 56d1133b25549..99b46591da420 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -3294,6 +3294,12 @@ InformationCache::getIndirectlyCallableFunctions(Attributor &A) const { return IndirectlyCallableFunctions; } +std::optional InformationCache::getFlatAddressSpace() const { + if (TargetTriple.isAMDGPU() || TargetTriple.isNVPTX()) + return 0; + return std::nullopt; +} + void Attributor::recordDependence(const AbstractAttribute &FromAA, const AbstractAttribute &ToAA, DepClassTy DepClass) { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 038a374e19f79..416dd09ca874b 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12580,8 +12580,19 @@ struct AAAddressSpaceImpl : public AAAddressSpace { void initialize(Attributor &A) override { assert(getAssociatedType()->isPtrOrPtrVectorTy() && "Associated value is not a pointer"); - if (getAssociatedType()->getPointerAddressSpace()) + + if (!A.getInfoCache().getFlatAddressSpace().has_value()) { + indicatePessimisticFixpoint(); + return; + } + + unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value(); + unsigned AS = getAssociatedType()->getPointerAddressSpace(); + if (AS != FlatAS) { + [[maybe_unused]] bool R = takeAddressSpace(AS); + assert(R && 
"The take should happen"); indicateOptimisticFixpoint(); + } } ChangeStatus updateImpl(Attributor &A) override { @@ -12603,12 +12614,13 @@ struct AAAddressSpaceImpl : public AAAddressSpace { /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { - Value *AssociatedValue = &getAssociatedValue(); - Value *OriginalValue = peelAddrspacecast(AssociatedValue); - if (getAddressSpace() == NoAddressSpace || + if (getAddressSpace() == InvalidAddressSpace || getAddressSpace() == getAssociatedType()->getPointerAddressSpace()) return ChangeStatus::UNCHANGED; + Value *AssociatedValue = &getAssociatedValue(); + Value *OriginalValue = peelAddrspacecast(AssociatedValue); + PointerType *NewPtrTy = PointerType::get(getAssociatedType()->getContext(), getAddressSpace()); bool UseOriginalValue = @@ -12655,17 +12667,17 @@ struct AAAddressSpaceImpl : public AAAddressSpace { if (!isValidState()) return "addrspace()"; return "addrspace(" + - (AssumedAddressSpace == NoAddressSpace + (AssumedAddressSpace == InvalidAddressSpace ? "none" : std::to_string(AssumedAddressSpace)) + ")"; } private: - uint32_t AssumedAddressSpace = NoAddressSpace; + uint32_t AssumedAddressSpace = InvalidAddressSpace; bool takeAddressSpace(uint32_t AS) { - if (AssumedAddressSpace == NoAddressSpace) { + if (AssumedAddressSpace == InvalidAddressSpace) { AssumedAddressSpace = AS; return true; } diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 27049d547f6e3..5e7d2c3c713d3 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -619,6 +619,16 @@ class CallsiteContextGraph { return static_cast(this)->getLabel(Func, Call, CloneNo); } + // Create and return a new ContextNode. 
+ ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr, + CallInfo C = CallInfo()) { + NodeOwner.push_back(std::make_unique(IsAllocation, C)); + auto *NewNode = NodeOwner.back().get(); + if (F) + NodeToCallingFunc[NewNode] = F; + return NewNode; + } + /// Helpers to find the node corresponding to the given call or stackid. ContextNode *getNodeForInst(const CallInfo &C); ContextNode *getNodeForAlloc(const CallInfo &C); @@ -1082,11 +1092,8 @@ typename CallsiteContextGraph::ContextNode * CallsiteContextGraph::addAllocNode( CallInfo Call, const FuncTy *F) { assert(!getNodeForAlloc(Call)); - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/true, Call)); - ContextNode *AllocNode = NodeOwner.back().get(); + ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call); AllocationCallToContextNodeMap[Call] = AllocNode; - NodeToCallingFunc[AllocNode] = F; // Use LastContextId as a uniq id for MIB allocation nodes. AllocNode->OrigStackOrAllocId = LastContextId; // Alloc type should be updated as we add in the MIBs. We should assert @@ -1143,9 +1150,7 @@ void CallsiteContextGraph::addStackNodesForMIB( auto StackId = getStackId(*ContextIter); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/false)); - StackNode = NodeOwner.back().get(); + StackNode = createNewNode(/*IsAllocation=*/false); StackEntryIdToContextNodeMap[StackId] = StackNode; StackNode->OrigStackOrAllocId = StackId; } @@ -1448,10 +1453,7 @@ void CallsiteContextGraph:: continue; // Create new context node. 
- NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/false, Call)); - ContextNode *NewNode = NodeOwner.back().get(); - NodeToCallingFunc[NewNode] = Func; + ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call); NonAllocationCallToContextNodeMap[Call] = NewNode; CreatedNode = true; NewNode->AllocTypes = computeAllocType(SavedContextIds); @@ -2164,10 +2166,7 @@ bool CallsiteContextGraph::calleesMatch( } else { FuncToCallsWithMetadata[Func].push_back({NewCall}); // Create Node and record node info. - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/false, NewCall)); - NewNode = NodeOwner.back().get(); - NodeToCallingFunc[NewNode] = Func; + NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall); TailCallToContextNodeMap[NewCall] = NewNode; NewNode->AllocTypes = Edge->AllocTypes; } @@ -2740,13 +2739,11 @@ CallsiteContextGraph::moveEdgeToNewCalleeClone( const std::shared_ptr &Edge, EdgeIter *CallerEdgeI, DenseSet ContextIdsToMove) { ContextNode *Node = Edge->Callee; - NodeOwner.push_back( - std::make_unique(Node->IsAllocation, Node->Call)); - ContextNode *Clone = NodeOwner.back().get(); + assert(NodeToCallingFunc.count(Node)); + ContextNode *Clone = + createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call); Node->addClone(Clone); Clone->MatchingCalls = Node->MatchingCalls; - assert(NodeToCallingFunc.count(Node)); - NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true, ContextIdsToMove); return Clone; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ea51d77904571..9934c065ebf85 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -436,6 +436,71 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt)); } +/// Whenever an 
element is extracted from a vector, optionally shifted down, and +/// then truncated, canonicalize by converting it to a bitcast followed by an +/// extractelement. +/// +/// Examples (little endian): +/// trunc (extractelement <4 x i64> %X, 0) to i32 +/// ---> +/// extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0 +/// +/// trunc (lshr (extractelement <4 x i32> %X, 0), 8) to i8 +/// ---> +/// extractelement <16 x i8> (bitcast <4 x i32> %X to <16 x i8>), i32 1 +static Instruction *foldVecExtTruncToExtElt(TruncInst &Trunc, + InstCombinerImpl &IC) { + Value *Src = Trunc.getOperand(0); + Type *SrcType = Src->getType(); + Type *DstType = Trunc.getType(); + + // Only attempt this if we have simple aliasing of the vector elements. + // A badly fit destination size would result in an invalid cast. + unsigned SrcBits = SrcType->getScalarSizeInBits(); + unsigned DstBits = DstType->getScalarSizeInBits(); + unsigned TruncRatio = SrcBits / DstBits; + if ((SrcBits % DstBits) != 0) + return nullptr; + + Value *VecOp; + ConstantInt *Cst; + const APInt *ShiftAmount = nullptr; + if (!match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst)))) && + !match(Src, + m_OneUse(m_LShr(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst)), + m_APInt(ShiftAmount))))) + return nullptr; + + auto *VecOpTy = cast(VecOp->getType()); + auto VecElts = VecOpTy->getElementCount(); + + uint64_t BitCastNumElts = VecElts.getKnownMinValue() * TruncRatio; + uint64_t VecOpIdx = Cst->getZExtValue(); + uint64_t NewIdx = IC.getDataLayout().isBigEndian() + ? (VecOpIdx + 1) * TruncRatio - 1 + : VecOpIdx * TruncRatio; + + // Adjust index by the whole number of truncated elements. + if (ShiftAmount) { + // Check shift amount is in range and shifts a whole number of truncated + // elements. + if (ShiftAmount->uge(SrcBits) || ShiftAmount->urem(DstBits) != 0) + return nullptr; + + uint64_t IdxOfs = ShiftAmount->udiv(DstBits).getZExtValue(); + NewIdx = IC.getDataLayout().isBigEndian() ? 
(NewIdx - IdxOfs) + : (NewIdx + IdxOfs); + } + + assert(BitCastNumElts <= std::numeric_limits::max() && + NewIdx <= std::numeric_limits::max() && "overflow 32-bits"); + + auto *BitCastTo = + VectorType::get(DstType, BitCastNumElts, VecElts.isScalable()); + Value *BitCast = IC.Builder.CreateBitCast(VecOp, BitCastTo); + return ExtractElementInst::Create(BitCast, IC.Builder.getInt32(NewIdx)); +} + /// Funnel/Rotate left/right may occur in a wider type than necessary because of /// type promotion rules. Try to narrow the inputs and convert to funnel shift. Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) { @@ -848,36 +913,8 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (Instruction *I = foldVecTruncToExtElt(Trunc, *this)) return I; - // Whenever an element is extracted from a vector, and then truncated, - // canonicalize by converting it to a bitcast followed by an - // extractelement. - // - // Example (little endian): - // trunc (extractelement <4 x i64> %X, 0) to i32 - // ---> - // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0 - Value *VecOp; - ConstantInt *Cst; - if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) { - auto *VecOpTy = cast(VecOp->getType()); - auto VecElts = VecOpTy->getElementCount(); - - // A badly fit destination size would result in an invalid cast. - if (SrcWidth % DestWidth == 0) { - uint64_t TruncRatio = SrcWidth / DestWidth; - uint64_t BitCastNumElts = VecElts.getKnownMinValue() * TruncRatio; - uint64_t VecOpIdx = Cst->getZExtValue(); - uint64_t NewIdx = DL.isBigEndian() ? 
(VecOpIdx + 1) * TruncRatio - 1 - : VecOpIdx * TruncRatio; - assert(BitCastNumElts <= std::numeric_limits::max() && - "overflow 32-bits"); - - auto *BitCastTo = - VectorType::get(DestTy, BitCastNumElts, VecElts.isScalable()); - Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo); - return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx)); - } - } + if (Instruction *I = foldVecExtTruncToExtElt(Trunc, *this)) + return I; // trunc (ctlz_i32(zext(A), B) --> add(ctlz_i16(A, B), C) if (match(Src, m_OneUse(m_Intrinsic(m_ZExt(m_Value(A)), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index b1215bb4d83b0..e3f4925024e65 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5282,6 +5282,11 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, match(Op1, m_c_Mul(m_Specific(Z), m_Value(Y)))))) { bool NonZero; if (ICmpInst::isEquality(Pred)) { + // If X != Y, fold (X *nw Z) eq/ne (Y *nw Z) -> Z eq/ne 0 + if (((Op0HasNSW && Op1HasNSW) || (Op0HasNUW && Op1HasNUW)) && + isKnownNonEqual(X, Y, DL, &AC, &I, &DT)) + return new ICmpInst(Pred, Z, Constant::getNullValue(Z->getType())); + KnownBits ZKnown = computeKnownBits(Z, 0, &I); // if Z % 2 != 0 // X * Z eq/ne Y * Z -> X eq/ne Y diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index c66db9285c799..dd31bfa7e65f5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -858,35 +858,16 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, } case Instruction::SRem: { const APInt *Rem; - // X % -1 demands all the bits because we don't want to introduce - // INT_MIN % -1 (== undef) by accident. 
- if (match(I->getOperand(1), m_APInt(Rem)) && !Rem->isAllOnes()) { - APInt RA = Rem->abs(); - if (RA.isPowerOf2()) { - if (DemandedMask.ult(RA)) // srem won't affect demanded bits - return I->getOperand(0); - - APInt LowBits = RA - 1; - APInt Mask2 = LowBits | APInt::getSignMask(BitWidth); - if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1, Q)) - return I; - - // The low bits of LHS are unchanged by the srem. - Known.Zero = LHSKnown.Zero & LowBits; - Known.One = LHSKnown.One & LowBits; - - // If LHS is non-negative or has all low bits zero, then the upper bits - // are all zero. - if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero)) - Known.Zero |= ~LowBits; + if (match(I->getOperand(1), m_APInt(Rem)) && Rem->isPowerOf2()) { + if (DemandedMask.ult(*Rem)) // srem won't affect demanded bits + return I->getOperand(0); - // If LHS is negative and not all low bits are zero, then the upper bits - // are all one. - if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One)) - Known.One |= ~LowBits; - - break; - } + APInt LowBits = *Rem - 1; + APInt Mask2 = LowBits | APInt::getSignMask(BitWidth); + if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1, Q)) + return I; + Known = KnownBits::srem(LHSKnown, KnownBits::makeConstant(*Rem)); + break; } llvm::computeKnownBits(I, Known, Depth, Q); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 4b1650b93cc1d..d73b550741fb3 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -44,6 +44,7 @@ #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/Loads.h" @@ -65,9 +66,9 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include 
"llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -567,12 +568,11 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // We want to visit children before parents. We will enqueue all the parents // before their children in the worklist and process the worklist in reverse // order. - SmallVector Worklist = collectChildrenInLoop(N, CurLoop); + SmallVector Worklist = + collectChildrenInLoop(DT, N, CurLoop); bool Changed = false; - for (DomTreeNode *DTN : reverse(Worklist)) { - BasicBlock *BB = DTN->getBlock(); - // Only need to process the contents of this block if it is not part of a + for (BasicBlock *BB : reverse(Worklist)) { // subloop (which would already have been processed). if (inSubLoop(BB, CurLoop, LI)) continue; @@ -1603,13 +1603,14 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // const auto &BlockColors = SafetyInfo->getBlockColors(); SmallSetVector PredBBs(pred_begin(ExitBB), pred_end(ExitBB)); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); while (!PredBBs.empty()) { BasicBlock *PredBB = *PredBBs.begin(); assert(CurLoop->contains(PredBB) && "Expect all predecessors are in the loop"); if (PN->getBasicBlockIndex(PredBB) >= 0) { BasicBlock *NewPred = SplitBlockPredecessors( - ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true); + ExitBB, PredBB, ".split.loop.exit", &DTU, LI, MSSAU, true); // Since we do not allow splitting EH-block with BlockColors in // canSplitPredecessors(), we can simply assign predecessor's color to // the new block. 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e9f212fb482c3..b568811dcdbca 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -284,6 +284,7 @@ static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start, void MemCpyOptPass::eraseInstruction(Instruction *I) { MSSAU->removeMemoryAccess(I); + EEI->removeInstruction(I); I->eraseFromParent(); } @@ -638,7 +639,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; - BatchAAResults BAA(*AA); + BatchAAResults BAA(*AA, EEI); auto *T = LI->getType(); // Don't introduce calls to memcpy/memmove intrinsics out of thin air if // the corresponding libcalls are not available. @@ -1147,14 +1148,14 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, IRBuilder<> Builder(M); auto *CopySource = MDep->getSource(); Instruction *NewCopySource = nullptr; - auto CleanupOnRet = llvm::make_scope_exit([&NewCopySource] { + auto CleanupOnRet = llvm::make_scope_exit([&] { if (NewCopySource && NewCopySource->use_empty()) // Safety: It's safe here because we will only allocate more instructions // after finishing all BatchAA queries, but we have to be careful if we // want to do something like this in another place. Then we'd probably // have to delay instruction removal until all transforms on an // instruction finished. - NewCopySource->eraseFromParent(); + eraseInstruction(NewCopySource); }); MaybeAlign CopySourceAlign = MDep->getSourceAlign(); // We just need to calculate the actual size of the copy. @@ -1751,7 +1752,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { return true; } - BatchAAResults BAA(*AA); + BatchAAResults BAA(*AA, EEI); // FIXME: Not using getClobberingMemoryAccess() here due to PR54682. 
MemoryAccess *AnyClobber = MA->getDefiningAccess(); MemoryLocation DestLoc = MemoryLocation::getForDest(M); @@ -1876,7 +1877,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { if (!CallAccess) return false; MemCpyInst *MDep = nullptr; - BatchAAResults BAA(*AA); + BatchAAResults BAA(*AA, EEI); MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( CallAccess->getDefiningAccess(), Loc, BAA); if (auto *MD = dyn_cast(Clobber)) @@ -1949,7 +1950,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// 4. The memcpy src is not modified during the call. (ModRef check shows no /// Mod.) bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { - BatchAAResults BAA(*AA); + BatchAAResults BAA(*AA, EEI); Value *ImmutArg = CB.getArgOperand(ArgNo); // 1. Ensure passed argument is immutable during call. @@ -2117,6 +2118,8 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_, MSSA = MSSA_; MemorySSAUpdater MSSAU_(MSSA_); MSSAU = &MSSAU_; + EarliestEscapeInfo EEI_(*DT); + EEI = &EEI_; while (true) { if (!iterateOnFunction(F)) diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index d464e49990b3b..da9b804e94a74 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -700,7 +700,7 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) { bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) { if (isTriviallyVectorizable(ID)) return true; - return Function::isTargetIntrinsic(ID) && + return Intrinsic::isTargetIntrinsic(ID) && TTI->isTargetIntrinsicTriviallyScalarizable(ID); } diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 9a4289e1a30da..70047273c3b9a 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -445,21 +445,22 @@ TransformationMode 
llvm::hasLICMVersioningTransformation(const Loop *L) { } /// Does a BFS from a given node to all of its children inside a given loop. -/// The returned vector of nodes includes the starting point. -SmallVector -llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) { - SmallVector Worklist; +/// The returned vector of basic blocks includes the starting point. +SmallVector llvm::collectChildrenInLoop(DominatorTree *DT, + DomTreeNode *N, + const Loop *CurLoop) { + SmallVector Worklist; auto AddRegionToWorklist = [&](DomTreeNode *DTN) { // Only include subregions in the top level loop. BasicBlock *BB = DTN->getBlock(); if (CurLoop->contains(BB)) - Worklist.push_back(DTN); + Worklist.push_back(DTN->getBlock()); }; AddRegionToWorklist(N); for (size_t I = 0; I < Worklist.size(); I++) { - for (DomTreeNode *Child : Worklist[I]->children()) + for (DomTreeNode *Child : DT->getNode(Worklist[I])->children()) AddRegionToWorklist(Child); } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 2119320566902..0927a3015818f 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -28,7 +28,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS #define SCEV_DEBUG_WITH_TYPE(TYPE, X) DEBUG_WITH_TYPE(TYPE, X) #else #define SCEV_DEBUG_WITH_TYPE(TYPE, X) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 1f2c9389c008b..1ee6c17bdad39 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemorySSA.h" #include 
"llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -3057,16 +3058,17 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, if (auto *LI = dyn_cast(&CurI)) { if (LI->getPointerOperand() == StorePtr && LI->getType() == StoreTy && LI->isSimple() && LI->getAlign() >= StoreToHoist->getAlign()) { - // Local objects (created by an `alloca` instruction) are always - // writable, so once we are past a read from a location it is valid to - // also write to that same location. - // If the address of the local object never escapes the function, that - // means it's never concurrently read or written, hence moving the store - // from under the condition will not introduce a data race. - auto *AI = dyn_cast(getUnderlyingObject(StorePtr)); - if (AI && !PointerMayBeCaptured(AI, false, true)) + Value *Obj = getUnderlyingObject(StorePtr); + bool ExplicitlyDereferenceableOnly; + if (isWritableObject(Obj, ExplicitlyDereferenceableOnly) && + !PointerMayBeCaptured(Obj, /*ReturnCaptures=*/false, + /*StoreCaptures=*/true) && + (!ExplicitlyDereferenceableOnly || + isDereferenceablePointer(StorePtr, StoreTy, + LI->getDataLayout()))) { // Found a previous load, return it. return LI; + } } // The load didn't work out, but we may still find a store. } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 00eec0a6f7b14..f24cd43a93bc7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -435,11 +435,10 @@ class LoopVectorizationPlanner { /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop /// to re-use expansion results generated during main plan execution. /// - /// Returns a mapping of SCEVs to their expanded IR values and a mapping for - /// the reduction resume values. 
Note that this is a temporary workaround - /// needed due to the current epilogue handling. - std::pair, - DenseMap> + /// Returns a mapping of SCEVs to their expanded IR values. + /// Note that this is a temporary workaround needed due to the current + /// epilogue handling. + DenseMap executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b1077d37b4cdc..034765bee40e7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -408,10 +408,11 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// the following procedure: /// 1) Returns exact trip count if it is known. /// 2) Returns expected trip count according to profile data if any. -/// 3) Returns upper bound estimate if it is known. +/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. /// 4) Returns std::nullopt if all of the above failed. -static std::optional getSmallBestKnownTC(ScalarEvolution &SE, - Loop *L) { +static std::optional +getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, + bool CanUseConstantMax = true) { // Check if exact trip count is known. if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) return ExpectedTC; @@ -421,6 +422,9 @@ static std::optional getSmallBestKnownTC(ScalarEvolution &SE, if (auto EstimatedTC = getLoopEstimatedTripCount(L)) return *EstimatedTC; + if (!CanUseConstantMax) + return std::nullopt; + // Check if upper bound estimate is known. if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) return ExpectedTC; @@ -676,27 +680,6 @@ class InnerLoopVectorizer { /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. 
GeneratedRTChecks &RTChecks; - - // Holds the resume values for reductions in the loops, used to set the - // correct start value of reduction PHIs when vectorizing the epilogue. - SmallMapVector - ReductionResumeValues; -}; - -class InnerLoopUnroller : public InnerLoopVectorizer { -public: - InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, - LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ElementCount::getFixed(1), - ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI, Check) {} }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -1954,14 +1937,10 @@ class GeneratedRTChecks { // count. Assume that the outer loop executes at least twice. unsigned BestTripCount = 2; - // If exact trip count is known use that. - if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) - BestTripCount = SmallTC; - else if (LoopVectorizeWithBlockFrequency) { - // Else use profile data if available. - if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) - BestTripCount = *EstimatedTC; - } + // Get the best known TC estimate. + if (auto EstimatedTC = getSmallBestKnownTC( + *SE, OuterLoop, /* CanUseConstantMax = */ false)) + BestTripCount = *EstimatedTC; BestTripCount = std::max(BestTripCount, 1U); InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; @@ -2325,12 +2304,6 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for - // the first lane and part. 
- if (isa(Instr)) - if (!Lane.isFirstLane()) - return; - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2373,6 +2346,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // End if-block. VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); bool IfPredicateInstr = Parent ? Parent->isReplicator() : false; + assert((Parent || all_of(RepRecipe->operands(), + [](VPValue *Op) { + return Op->isDefinedOutsideLoopRegions(); + })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } @@ -6566,8 +6545,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2 = cast(PSE.getSCEV(Op2))->getValue(); } auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && - Legal->isInvariant(Op2)) + auto IsInvariant = [this](Value *Op) { + if (!Legal->isInvariant(Op)) + return false; + // Consider Op2 invariant, if it is not a predicated instruction in the + // loop. In that case, it is not trivially hoistable. + return !isa(Op) || + !TheLoop->contains(cast(Op)) || + !isPredicatedInst(cast(Op)); + }; + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && IsInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); @@ -7448,10 +7435,9 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { } // Check if \p RedResult is a ComputeReductionResult instruction, and if it is -// create a merge phi node for it and add it to \p ReductionResumeValues. +// create a merge phi node for it. 
static void createAndCollectMergePhiForReduction( VPInstruction *RedResult, - DenseMap &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) { if (!RedResult || @@ -7509,13 +7495,9 @@ static void createAndCollectMergePhiForReduction( OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); - - ReductionResumeValues[&RdxDesc] = BCBlockPhi; } -std::pair, - DenseMap> -LoopVectorizationPlanner::executePlan( +DenseMap LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap *ExpandedSCEVs) { @@ -7601,12 +7583,11 @@ LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - DenseMap ReductionResumeValues; auto *ExitVPBB = cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); for (VPRecipeBase &R : *ExitVPBB) { createAndCollectMergePhiForReduction( - dyn_cast(&R), ReductionResumeValues, State, OrigLoop, + dyn_cast(&R), State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); } @@ -7656,7 +7637,7 @@ LoopVectorizationPlanner::executePlan( setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); } - return {State.ExpandedSCEVs, ReductionResumeValues}; + return State.ExpandedSCEVs; } //===--------------------------------------------------------------------===// @@ -8975,6 +8956,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); + auto *MiddleVPBB = + cast(Plan->getVectorLoopRegion()->getSingleSuccessor()); + VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions 
from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -9001,12 +8985,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Operands = {OpRange.begin(), OpRange.end()}; } - // Invariant stores inside loop will be deleted and a single store - // with the final reduction value will be added to the exit block + // The stores with invariant address inside the loop will be deleted, and + // in the exit block, a uniform store recipe will be created for the final + // invariant store of the reduction. StoreInst *SI; if ((SI = dyn_cast(&I)) && - Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { + // Only create recipe for the final invariant store of the reduction. + if (!Legal->isInvariantStoreOfReduction(SI)) + continue; + auto *Recipe = new VPReplicateRecipe( + SI, RecipeBuilder.mapToVPValues(Instr->operands()), + true /* IsUniform */); + Recipe->insertBefore(*MiddleVPBB, MBIP); continue; + } VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); @@ -9175,45 +9168,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); - // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores - // sank outside of the loop would keep the same order as they had in the - // original loop. 
- SmallVector ReductionPHIList; - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast(&R)) - ReductionPHIList.emplace_back(ReductionPhi); - } - bool HasIntermediateStore = false; - stable_sort(ReductionPHIList, - [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, - const VPReductionPHIRecipe *R2) { - auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; - auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; - HasIntermediateStore |= IS1 || IS2; - - // If neither of the recipes has an intermediate store, keep the - // order the same. - if (!IS1 && !IS2) - return false; - - // If only one of the recipes has an intermediate store, then - // move it towards the beginning of the list. - if (IS1 && !IS2) - return true; - - if (!IS1 && IS2) - return false; - - // If both recipes have an intermediate store, then the recipe - // with the later store should be processed earlier. So it - // should go to the beginning of the list. - return DT->dominates(IS2, IS1); - }); - - if (HasIntermediateStore && ReductionPHIList.size() > 1) - for (VPRecipeBase *R : ReductionPHIList) - R->moveBefore(*Header, Header->getFirstNonPhi()); - + VPBasicBlock *MiddleVPBB = + cast(VectorLoopRegion->getSingleSuccessor()); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9232,9 +9188,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert(match(U, m_Binary( - m_VPValue(), m_VPValue())) && - "U must be an ExtractFromEnd VPInstruction"); + assert(UserRecipe->getParent() == MiddleVPBB && + "U must be either in the loop region or the middle block."); continue; } Worklist.insert(UserRecipe); @@ -9339,8 +9294,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPBasicBlock *LatchVPBB = 
VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*LatchVPBB->begin()); - VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9415,12 +9368,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // also modeled in VPlan. auto *FinalReductionResult = new VPInstruction( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); + // Update all users outside the vector region. + OrigExitingVPV->replaceUsesWithIf( + FinalReductionResult, [](VPUser &User, unsigned) { + auto *Parent = cast(&User)->getParent(); + return Parent && !Parent->getParent(); + }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); - OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User, - unsigned) { - return match(&User, m_Binary(m_VPValue(), - m_VPValue())); - }); // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in @@ -10113,8 +10067,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. 
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM, BFI, PSI, Checks); + InnerLoopVectorizer Unroller( + L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), + ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); @@ -10142,8 +10097,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { EPI, &LVL, &CM, BFI, PSI, Checks); std::unique_ptr BestMainPlan(BestPlan.duplicate()); - const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( - EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true); + auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, + *BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10188,10 +10143,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. 
if (auto *ReductionPhi = dyn_cast(&R)) { + ResumeV = cast(ReductionPhi->getUnderlyingInstr()) + ->getIncomingValueForBlock(L->getLoopPreheader()); const RecurrenceDescriptor &RdxDesc = ReductionPhi->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); - ResumeV = ReductionResumeValues.find(&RdxDesc)->second; if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { // VPReductionPHIRecipes for AnyOf reductions expect a boolean as // start value; compare the final value from the main vector loop diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index dee0b7e1f4371..6b8ec55b30426 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4454,7 +4454,9 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { auto *VecTy = getWidenedType(ScalarTy, NumScalars); int NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= NumScalars || - VecTy->getNumElements() % NumParts != 0) + VecTy->getNumElements() % NumParts != 0 || + !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(), + VecTy->getNumElements() / NumParts)) NumParts = 1; SmallVector ExtractMask; SmallVector Mask; @@ -6462,7 +6464,9 @@ static void gatherPossiblyVectorizableLoads( auto *VecTy = getWidenedType(ScalarTy, NumScalars); NumParts = TTI.getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= NumScalars || - VecTy->getNumElements() % NumParts != 0) + VecTy->getNumElements() % NumParts != 0 || + !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), + VecTy->getNumElements() / NumParts)) NumParts = 1; } unsigned VF = PowerOf2Ceil(NumScalars / NumParts); @@ -9365,10 +9369,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool NeedShuffle = count(VL, *It) > 1 && (VL.front() != *It || !all_of(VL.drop_front(), IsaPred)); - if (!NeedShuffle) + if (!NeedShuffle) { + if (isa(ScalarTy)) { + assert(SLPReVec && 
"FixedVectorType is not expected."); + return TTI.getShuffleCost( + TTI::SK_InsertSubvector, VecTy, {}, CostKind, + std::distance(VL.begin(), It) * getNumElements(ScalarTy), + cast(ScalarTy)); + } return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, std::distance(VL.begin(), It), PoisonValue::get(VecTy), *It); + } SmallVector ShuffleMask(VL.size(), PoisonMaskElem); transform(VL, ShuffleMask.begin(), [](Value *V) { @@ -9961,7 +9973,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size() || - MaskVecTy->getNumElements() % NumParts != 0) + MaskVecTy->getNumElements() % NumParts != 0 || + !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(), + MaskVecTy->getNumElements() / NumParts)) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); const auto *It = @@ -9979,7 +9993,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size() || - MaskVecTy->getNumElements() % NumParts != 0) + MaskVecTy->getNumElements() % NumParts != 0 || + !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(), + MaskVecTy->getNumElements() / NumParts)) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); const auto *It = @@ -13633,7 +13649,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size()); unsigned NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size() || - VecTy->getNumElements() % NumParts != 0) + VecTy->getNumElements() % NumParts != 0 || + !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(), + VecTy->getNumElements() / NumParts)) NumParts = 1; if 
(!all_of(GatheredScalars, IsaPred)) { // Check for gathered extracts. diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 0c44d05f0474d..7ebbcabb004df 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/SandboxIR/Instruction.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 161d300e6e9f2..80afcb499a2c2 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -8,8 +8,8 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/PassManager.h" -#include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 4ddbd0d5fafb8..ed0bb13d9425f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -26,7 +26,7 @@ namespace llvm { namespace VPlanPatternMatch { template bool match(Val *V, const Pattern &P) { - return const_cast(P).match(V); + return P.match(V); } template bool match(VPUser *U, const Pattern &P) { @@ -35,7 +35,7 @@ template bool match(VPUser *U, const Pattern &P) { } template struct class_match { - template bool match(ITy *V) { return isa(V); } + template bool 
match(ITy *V) const { return isa(V); } }; /// Match an arbitrary VPValue and ignore it. @@ -46,7 +46,7 @@ template struct bind_ty { bind_ty(Class *&V) : VR(V) {} - template bool match(ITy *V) { + template bool match(ITy *V) const { if (auto *CV = dyn_cast(V)) { VR = CV; return true; @@ -63,7 +63,7 @@ template struct specific_intval { specific_intval(APInt V) : Val(std::move(V)) {} - bool match(VPValue *VPV) { + bool match(VPValue *VPV) const { if (!VPV->isLiveIn()) return false; Value *V = VPV->getLiveInIRValue(); @@ -94,7 +94,7 @@ template struct match_combine_or { match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {} - template bool match(ITy *V) { + template bool match(ITy *V) const { if (L.match(V)) return true; if (R.match(V)) @@ -139,16 +139,16 @@ struct UnaryRecipe_match { UnaryRecipe_match(Op0_t Op0) : Op0(Op0) {} - bool match(const VPValue *V) { + bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); return DefR && match(DefR); } - bool match(const VPSingleDefRecipe *R) { + bool match(const VPSingleDefRecipe *R) const { return match(static_cast(R)); } - bool match(const VPRecipeBase *R) { + bool match(const VPRecipeBase *R) const { if (!detail::MatchRecipeAndOpcode::match(R)) return false; assert(R->getNumOperands() == 1 && @@ -174,16 +174,16 @@ struct BinaryRecipe_match { BinaryRecipe_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {} - bool match(const VPValue *V) { + bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); return DefR && match(DefR); } - bool match(const VPSingleDefRecipe *R) { + bool match(const VPSingleDefRecipe *R) const { return match(static_cast(R)); } - bool match(const VPRecipeBase *R) { + bool match(const VPRecipeBase *R) const { if (!detail::MatchRecipeAndOpcode::match(R)) return false; assert(R->getNumOperands() == 2 && @@ -314,12 +314,14 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { } struct VPCanonicalIVPHI_match { - bool match(const VPValue *V) { + bool 
match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); return DefR && match(DefR); } - bool match(const VPRecipeBase *R) { return isa(R); } + bool match(const VPRecipeBase *R) const { + return isa(R); + } }; inline VPCanonicalIVPHI_match m_CanonicalIV() { @@ -332,12 +334,12 @@ template struct VPScalarIVSteps_match { VPScalarIVSteps_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {} - bool match(const VPValue *V) { + bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); return DefR && match(DefR); } - bool match(const VPRecipeBase *R) { + bool match(const VPRecipeBase *R) const { if (!isa(R)) return false; assert(R->getNumOperands() == 2 && diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7590863853295..f8b0a400a31d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -611,14 +611,6 @@ Value *VPInstruction::generate(VPTransformState &State) { : Builder.CreateZExt(ReducedPartRdx, PhiTy); } - // If there were stores of the reduction value to a uniform memory address - // inside the loop, create the final store here. 
- if (StoreInst *SI = RdxDesc.IntermediateStore) { - auto *NewSI = Builder.CreateAlignedStore( - ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); - propagateMetadata(NewSI, SI); - } - return ReducedPartRdx; } case VPInstruction::ExtractFromEnd: { diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 08d2a86fa80a3..853eef6bcb2e8 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -93,7 +93,7 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) @@ -103,7 +103,7 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call 
<32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -112,8 +112,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x 
i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) @@ -122,8 +122,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV32-NEXT: 
Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -131,9 +131,9 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) @@ -141,29 +141,29 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: 
%vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found 
an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> 
undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -253,7 +253,7 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) @@ -263,7 +263,7 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call 
@llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -272,8 +272,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) @@ -282,8 +282,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) ; 
RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -291,8 +291,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: 
%nxv64i1_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) @@ -301,8 +301,8 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to @@ -310,9 +310,9 @@ define void @sext() { ; RV32-NEXT: Cost 
Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv128i1_nxv128i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv128i1_nxv128i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv128i1_nxv128i32 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) @@ -320,9 +320,9 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 
undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -416,7 +416,7 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> 
@llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) @@ -426,7 +426,7 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -435,8 +435,8 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an 
estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) @@ -445,8 +445,8 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for 
instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -454,9 +454,9 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> 
@llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) @@ -464,29 +464,29 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> ; RV64-NEXT: Cost 
Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 
x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -576,7 +576,7 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) @@ -586,7 +586,7 @@ define void @sext() { ; 
RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -595,8 +595,8 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call 
@llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) @@ -605,8 +605,8 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -614,9 +614,9 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext undef to ; RV64-NEXT: Cost 
Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) @@ -624,18 +624,18 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv128i1_nxv128i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv128i1_nxv128i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv128i1_nxv128i32 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: 
%nxv128i1_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) @@ -643,9 +643,9 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 
= call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1095,7 +1095,7 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) @@ -1105,7 +1105,7 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -1114,8 +1114,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: 
%vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) @@ -1124,8 +1124,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -1133,9 +1133,9 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an 
estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) @@ -1143,29 +1143,29 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: 
%v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> 
undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = 
zext undef to @@ -1255,7 +1255,7 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) @@ -1265,7 +1265,7 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; 
RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -1274,8 +1274,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) @@ -1284,8 +1284,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -1293,8 +1293,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated 
cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) @@ -1303,8 +1303,8 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to @@ -1312,9 +1312,9 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext undef to -; RV32-NEXT: Cost Model: Found an 
estimated cost of 33 for instruction: %nxv128i1_nxv128i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv128i1_nxv128i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv128i1_nxv128i32 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) @@ -1322,9 +1322,9 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( 
undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1418,7 +1418,7 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) @@ -1428,7 +1428,7 @@ define void @zext() { ; RV64-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -1437,8 +1437,8 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> 
undef to <64 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) @@ -1447,8 +1447,8 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost 
Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -1456,9 +1456,9 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for 
instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) @@ -1466,29 +1466,29 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV64-NEXT: 
Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, 
<256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = 
call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to @@ -1578,7 +1578,7 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) @@ -1588,7 +1588,7 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -1597,8 +1597,8 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call 
@llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) @@ -1607,8 +1607,8 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -1616,9 +1616,9 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) @@ -1626,18 +1626,18 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) 
+; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv128i1_nxv128i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv128i1_nxv128i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv128i1_nxv128i32 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) ; RV64-NEXT: 
Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) @@ -1645,9 +1645,9 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -6965,14 +6965,14 @@ define void @oddvec_sizes() { ; infinite recursion between cast costing and scalarization costing. 
define void @legalization_crash() { ; CHECK-LABEL: 'legalization_crash' -; CHECK-NEXT: Cost Model: Found an estimated cost of 19772 for instruction: %1 = bitcast <24 x i8> undef to <192 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16700 for instruction: %1 = bitcast <24 x i8> undef to <192 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %2 = trunc <192 x i8> undef to <192 x i1> -; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %3 = zext <192 x i1> undef to <192 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %4 = sext <192 x i1> undef to <192 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %3 = zext <192 x i1> undef to <192 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %4 = sext <192 x i1> undef to <192 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 948 for instruction: %5 = sitofp <192 x i1> undef to <192 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 948 for instruction: %6 = uitofp <192 x i1> undef to <192 x float> -; CHECK-NEXT: Cost Model: Found an estimated cost of 19964 for instruction: %7 = fptosi <192 x float> undef to <192 x i1> -; CHECK-NEXT: Cost Model: Found an estimated cost of 19964 for instruction: %8 = fptoui <192 x float> undef to <192 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16892 for instruction: %7 = fptosi <192 x float> undef to <192 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16892 for instruction: %8 = fptoui <192 x float> undef to <192 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; bitcast <24 x i8> undef to <192 x i1> diff --git a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll index 82038280f71c6..fce9f71d37634 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll +++ 
b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll @@ -9,28 +9,28 @@ define void @foo_no_vscale_range() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 662 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %res.i32.nxv128i1.zip = call i32 
@llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 335 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 662 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 
@llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 335 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) @@ -75,28 +75,28 @@ define void @foo_vscale_range_2_16() vscale_range(2,16) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; 
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr96656.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr96656.ll deleted file mode 100644 index 5b9833553fa02..0000000000000 --- a/llvm/test/Analysis/LoopAccessAnalysis/pr96656.ll +++ /dev/null @@ -1,49 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s - -define void @false.equal.predicate(ptr %arg, ptr %arg1, i1 %arg2) { -; CHECK-LABEL: 'false.equal.predicate' -; CHECK-NEXT: loop.body: -; CHECK-NEXT: Memory dependences are safe -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: -; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: -; CHECK-NEXT: Equal predicate: %load == 1 -; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: -; CHECK-NEXT: [PSE] %gep10 = getelementptr double, ptr %gep8, i64 %mul: -; CHECK-NEXT: {(8 + %arg1),+,(8 * (sext i32 %load to i64))}<%loop.body> -; CHECK-NEXT: --> {(8 + %arg1),+,8}<%loop.body> -; -entry: - %load = load i32, ptr %arg, align 4 - br i1 %arg2, label %noloop.exit, label %loop.ph - -loop.ph: ; preds = %entry - %sext7 = sext i32 %load to i64 - %gep8 = getelementptr i8, ptr %arg1, i64 8 - br label %loop.body - -loop.body: ; preds = %loop.body, %loop.ph - %phi = phi i64 [ 0, %loop.ph ], [ %add, %loop.body ] - %mul = mul i64 %phi, %sext7 - %gep10 = getelementptr double, ptr %gep8, i64 %mul - %load11 = load double, ptr %gep10, align 8 - store double %load11, ptr %arg1, align 8 - %add = add i64 %phi, 1 - %icmp = icmp eq i64 %phi, 0 - br i1 %icmp, label %loop.exit, label %loop.body - -noloop.exit: ; preds = %entry - %sext = sext i32 %load to i64 - %gep = getelementptr double, ptr %arg1, i64 %sext - %load5 = load double, ptr %gep, align 8 - store 
double %load5, ptr %arg, align 8 - ret void - -loop.exit: ; preds = %loop.body - ret void -} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll index 1e12dbf3bbee3..1585c7b562806 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll @@ -223,6 +223,54 @@ exit: ret void } +define double @single_iteration_unknown_stride(i32 %x, ptr %y, i1 %cond) { +; CHECK-LABEL: 'single_iteration_unknown_stride' +; CHECK-NEXT: loop.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Equal predicate: %x == 1 +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %gep10 = getelementptr double, ptr %gep8, i64 %mul: +; CHECK-NEXT: {(8 + %y),+,(8 * (sext i32 %x to i64))}<%loop.body> +; CHECK-NEXT: --> {(8 + %y),+,8}<%loop.body> +; +entry: + br i1 %cond, label %noloop.exit, label %loop.ph + +loop.ph: ; preds = %entry + %sext7 = sext i32 %x to i64 + %gep8 = getelementptr i8, ptr %y, i64 8 + br label %loop.body + +loop.body: ; preds = %loop.body, %loop.ph + %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop.body ] + %mul = mul i64 %iv, %sext7 + %gep10 = getelementptr double, ptr %gep8, i64 %mul + %load11 = load double, ptr %gep10, align 8 + store double %load11, ptr %y, align 8 + %iv.next = add i64 %iv, 1 + %icmp = icmp eq i64 %iv, 0 + br i1 %icmp, label %loop.exit, label %loop.body + +noloop.exit: ; preds = %entry + %sext = sext i32 %x to i64 + %gep = getelementptr double, ptr %y, i64 %sext + %load5 = load double, ptr %gep, align 8 + ret double %load5 + +loop.exit: ; preds = %loop.body + %sext2 = sext i32 %x to i64 + %gep2 = getelementptr double, ptr %y, i64 %sext2 + %load6 = 
load double, ptr %gep2, align 8 + ret double %load6 +} + ; A loop with two symbolic strides. define void @two_strides(ptr noalias %A, ptr noalias %B, i64 %N, i64 %stride.1, i64 %stride.2) { ; CHECK-LABEL: 'two_strides' diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll index 59e40bfd11433..124d895e5aef5 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -1568,86 +1568,6 @@ exit: ret i32 0 } - -define i32 @ptr_induction_ult_2(ptr %a, ptr %b) { -; CHECK-LABEL: 'ptr_induction_ult_2' -; CHECK-NEXT: Classifying expressions for: @ptr_induction_ult_2 -; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1 -; CHECK-NEXT: --> {(4 + %a),+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_2 -; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. -; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. -; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
-; -entry: - %cmp.6 = icmp ult ptr %a, %b - br i1 %cmp.6, label %loop, label %exit - -loop: - %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] - %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1 - %exitcond = icmp eq ptr %ptr.iv, %b - br i1 %exitcond, label %exit, label %loop - -exit: - ret i32 0 -} - -define i32 @ptr_induction_ult_3_step_6(ptr %a, ptr %b) { -; CHECK-LABEL: 'ptr_induction_ult_3_step_6' -; CHECK-NEXT: Classifying expressions for: @ptr_induction_ult_3_step_6 -; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,6}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 6 -; CHECK-NEXT: --> {(6 + %a),+,6}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_3_step_6 -; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. -; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. -; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
-; -entry: - %cmp.6 = icmp ult ptr %a, %b - br i1 %cmp.6, label %loop, label %exit - -loop: - %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] - %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 6 - %exitcond = icmp eq ptr %ptr.iv, %b - br i1 %exitcond, label %exit, label %loop - -exit: - ret i32 0 -} - -define i32 @ptr_induction_ult_3_step_7(ptr %a, ptr %b) { -; CHECK-LABEL: 'ptr_induction_ult_3_step_7' -; CHECK-NEXT: Classifying expressions for: @ptr_induction_ult_3_step_7 -; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,7}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } -; CHECK-NEXT: %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 7 -; CHECK-NEXT: --> {(7 + %a),+,7}<%loop> U: full-set S: full-set Exits: (7 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } -; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_3_step_7 -; CHECK-NEXT: Loop %loop: backedge-taken count is ((7905747460161236407 * (ptrtoint ptr %b to i64)) + (-7905747460161236407 * (ptrtoint ptr %a to i64))) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -1 -; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((7905747460161236407 * (ptrtoint ptr %b to i64)) + (-7905747460161236407 * (ptrtoint ptr %a to i64))) -; CHECK-NEXT: Loop %loop: Trip multiple is 1 -; -entry: - %cmp.6 = icmp ult ptr %a, %b - br i1 %cmp.6, label %loop, label %exit - -loop: - %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] - %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 7 - %exitcond = icmp eq ptr %ptr.iv, %b - br i1 %exitcond, label %exit, label %loop - -exit: - ret i32 0 -} - define void @ptr_induction_eq_1(ptr %a, ptr %b) { ; CHECK-LABEL: 'ptr_induction_eq_1' ; CHECK-NEXT: Classifying expressions for: @ptr_induction_eq_1 @@ -1705,47 +1625,6 @@ exit: ret 
void } -; TODO: It feels like we should be able to calculate the symbolic max -; exit count for the loop.inc block here, in the same way as -; ptr_induction_eq_1. The problem seems to be in howFarToZero when the -; ControlsOnlyExit is set to false. -define void @ptr_induction_early_exit_eq_1(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: 'ptr_induction_early_exit_eq_1' -; CHECK-NEXT: Classifying expressions for: @ptr_induction_early_exit_eq_1 -; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop.inc ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,8}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: %ld1 = load ptr, ptr %ptr.iv, align 8 -; CHECK-NEXT: --> %ld1 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } -; CHECK-NEXT: %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 8 -; CHECK-NEXT: --> {(8 + %a),+,8}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } -; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_early_exit_eq_1 -; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. -; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** -; CHECK-NEXT: exit count for loop.inc: ***COULDNOTCOMPUTE*** -; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. -; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** -; CHECK-NEXT: symbolic max exit count for loop.inc: ***COULDNOTCOMPUTE*** -; -entry: - %cmp = icmp eq ptr %a, %b - br i1 %cmp, label %exit, label %loop - -loop: - %ptr.iv = phi ptr [ %ptr.iv.next, %loop.inc ], [ %a, %entry ] - %ld1 = load ptr, ptr %ptr.iv, align 8 - %earlyexitcond = icmp eq ptr %ld1, %c - br i1 %earlyexitcond, label %exit, label %loop.inc - -loop.inc: - %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 8 - %exitcond = icmp eq ptr %ptr.iv.next, %b - br i1 %exitcond, label %exit, label %loop - -exit: - ret void -} - define void @ptr_induction_early_exit_eq_2(ptr %a, i64 %n, ptr %c) { ; CHECK-LABEL: 'ptr_induction_early_exit_eq_2' ; CHECK-NEXT: Classifying expressions for: @ptr_induction_early_exit_eq_2 @@ -1786,7 +1665,6 @@ exit: ret void } - define void @gep_addrec_nw(ptr %a) { ; CHECK-LABEL: 'gep_addrec_nw' ; CHECK-NEXT: Classifying expressions for: @gep_addrec_nw diff --git a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll index 3022281658a75..bb97005e8faf4 100644 --- a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll +++ b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll @@ -58,6 +58,15 @@ define void @test_well_defined_infinite_st(i32 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 ; entry: br label %for.body @@ -79,6 +88,15 @@ define void @test_well_defined_infinite_ld(i32 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 ; entry: br label %for.body @@ -100,6 +118,15 @@ define void @test_no_mustprogress(i32 %N) { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 ; entry: br label %for.body @@ -187,6 +214,15 @@ define void @test_abnormal_exit(i32 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 ; entry: br label %for.body @@ -209,10 +245,24 @@ define void @test_other_exit(i32 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. 
; CHECK-NEXT: exit count for for.body: i32 9 ; CHECK-NEXT: exit count for for.latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for for.latch: ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-EMPTY: ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 9 ; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is i32 9 ; CHECK-NEXT: symbolic max exit count for for.body: i32 9 ; CHECK-NEXT: symbolic max exit count for for.latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for for.latch: ((-2 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-EMPTY: +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (9 umin ((-2 + %N) /u 2)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (9 umin ((-2 + %N) /u 2)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i32 %N to i1) to i32) == 0 ; entry: br label %for.body @@ -267,6 +317,18 @@ define void @test_sext(i64 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (%N /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 ; entry: br label %for.body @@ -288,6 +350,21 @@ define void @test_zext_of_sext(i64 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (%N /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %N to i1) to i64) == 0 ; entry: br label %for.body @@ -310,6 +387,18 @@ define void @test_zext_offset(i64 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-21 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-21 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 ; entry: br label %for.body @@ -332,6 +421,18 @@ define void @test_sext_offset(i64 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. ; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-21 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 +; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-21 + %N) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Equal predicate: (zext i1 (true + (trunc i64 %N to i1)) to i64) == 0 ; entry: br label %for.body diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/predicated-max-backedge-taken-count-guard-info.ll new file mode 100644 index 0000000000000..1805b983c8e2e --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/predicated-max-backedge-taken-count-guard-info.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +define i32 @ptr_induction_ult(ptr %a, ptr %b) { +; CHECK-LABEL: 'ptr_induction_ult' +; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is (((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %b to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %b to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is (((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %b to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i2))) to i64) == 0 +; +entry: + %cmp.6 = icmp ult ptr %a, %b + br i1 %cmp.6, label %loop, label %exit + +loop: + %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] + %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1 + %exitcond = icmp eq ptr %ptr.iv, %b + br i1 %exitcond, label %exit, label %loop + +exit: + ret i32 0 +} + +define i32 @ptr_induction_ult_3_step_6(ptr %a, ptr %b) { +; CHECK-LABEL: 'ptr_induction_ult_3_step_6' +; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_3_step_6 +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is (((3074457345618258603 * (ptrtoint ptr %b to i64)) + (-3074457345618258603 * (ptrtoint ptr %a to i64))) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) to i1) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) to i1) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is (((3074457345618258603 * (ptrtoint ptr %b to i64)) + (-3074457345618258603 * (ptrtoint ptr %a to i64))) /u 2) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) to i1) to i64) == 0 +; +entry: + %cmp.6 = icmp ult ptr %a, %b + br i1 %cmp.6, label %loop, label %exit + +loop: + %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 6 + %exitcond = icmp eq ptr %ptr.iv, %b + br i1 %exitcond, label %exit, label %loop + +exit: + ret i32 0 +} + +define i32 @ptr_induction_ult_3_step_7(ptr %a, ptr %b) { +; CHECK-LABEL: 'ptr_induction_ult_3_step_7' +; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_3_step_7 +; CHECK-NEXT: Loop %loop: backedge-taken count is ((7905747460161236407 * (ptrtoint ptr %b to i64)) + (-7905747460161236407 * (ptrtoint ptr %a to i64))) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((7905747460161236407 * (ptrtoint ptr %b to i64)) + (-7905747460161236407 * (ptrtoint ptr %a to i64))) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %cmp.6 = icmp ult ptr %a, %b + br i1 %cmp.6, label %loop, label %exit + +loop: + %ptr.iv = phi ptr [ 
%ptr.iv.next, %loop ], [ %a, %entry ] + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 7 + %exitcond = icmp eq ptr %ptr.iv, %b + br i1 %exitcond, label %exit, label %loop + +exit: + ret i32 0 +} + +; %a and %b may not have the same alignment, so the loop may only via the early +; exit when %ptr.iv > %b. The predicated exit count for the latch can be +; computed by adding a predicate. +define void @ptr_induction_early_exit_eq_1(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: 'ptr_induction_early_exit_eq_1' +; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_early_exit_eq_1 +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for loop.inc: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i3 ((trunc i64 (ptrtoint ptr %b to i64) to i3) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i3))) to i64) == 0 +; CHECK-EMPTY: +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for loop.inc: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i3 ((trunc i64 (ptrtoint ptr %b to i64) to i3) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i3))) to i64) == 0 +; CHECK-EMPTY: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i3 ((trunc i64 (ptrtoint ptr %b to i64) to i3) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i3))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i3 ((trunc i64 (ptrtoint ptr %b to i64) to i3) + (-1 * (trunc i64 (ptrtoint ptr %a to i64) to i3))) to i64) == 0 +; +entry: + %cmp = icmp eq ptr %a, %b + br i1 %cmp, label %exit, label %loop + +loop: + %ptr.iv = phi ptr [ %ptr.iv.next, %loop.inc ], [ %a, %entry ] + %ld1 = load ptr, ptr %ptr.iv, align 8 + %earlyexitcond = icmp eq ptr %ld1, %c + br i1 %earlyexitcond, label %exit, label %loop.inc + +loop.inc: + %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 8 + %exitcond = icmp eq ptr %ptr.iv.next, %b + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + + diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-urem.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-urem.ll new file mode 100644 index 0000000000000..d24655f6ae5c1 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-urem.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 
| FileCheck %s + +declare void @foo() + +define void @test_trip_count_expr_contains_urem(i32 %N) { +; CHECK-LABEL: 'test_trip_count_expr_contains_urem' +; CHECK-NEXT: Determining loop execution counts for: @test_trip_count_expr_contains_urem +; CHECK-NEXT: Loop %loop: backedge-taken count is ((1 + (-1 * (zext i4 (1 + (trunc i32 %N to i4)) to i32)) + %N) /u 16) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 268435455 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((1 + (-1 * (zext i4 (1 + (trunc i32 %N to i4)) to i32)) + %N) /u 16) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %n.rnd.up = add i32 %N, 1 + %n.mod.vf = urem i32 %n.rnd.up, 16 + %n.vec = sub i32 %n.rnd.up, %n.mod.vf + br label %loop + +loop: + %index = phi i32 [ 0, %entry ], [ %index.next, %loop ] + %index.next = add i32 %index, 16 + call void @foo() + %ec = icmp eq i32 %index, %n.vec + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 43ac246055da7..584c0ef7cfeb7 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -35,6 +35,15 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) +declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) +declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) +declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) +declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) +declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) +declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) +declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) +declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 
%a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -156,3 +165,29 @@ define void @rotate(i32 %a, i64 %b) { %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8) ret void } + +; CHECK-LABEL: @addrspacecast +define void @addrspacecast(ptr %p0) { +; CHECK: %1 = addrspacecast ptr %p0 to ptr addrspace(1) +; CHECK: %2 = addrspacecast ptr addrspace(1) %1 to ptr +; CHECK: %3 = addrspacecast ptr %2 to ptr addrspace(3) +; CHECK: %4 = addrspacecast ptr addrspace(3) %3 to ptr +; CHECK: %5 = addrspacecast ptr %4 to ptr addrspace(4) +; CHECK: %6 = addrspacecast ptr addrspace(4) %5 to ptr +; CHECK: %7 = addrspacecast ptr %6 to ptr addrspace(5) +; CHECK: %8 = addrspacecast ptr addrspace(5) %7 to ptr +; + %p1 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %p0) + %p2 = call ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1) %p1) + + %p3 = call ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr %p2) + %p4 = call ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3) %p3) + + %p5 = call ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr %p4) + %p6 = call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) %p5) + + %p7 = call ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr %p6) + %p8 = call ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5) %p7) + + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll new file mode 100644 index 0000000000000..cf490021026e0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -aarch64-disable-multivector-spill-fill -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR +; 
RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR + +declare void @my_func() +declare void @my_func2( %v) + +define void @fbyte( %v) #0{ +; NOPAIR-LABEL: fbyte: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; NOPAIR-NEXT: cntd x9 +; NOPAIR-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-18 +; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 
16-byte Folded Spill +; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-1 +; NOPAIR-NEXT: str z0, [sp] // 16-byte Folded Spill +; NOPAIR-NEXT: bl __arm_sme_state +; NOPAIR-NEXT: and x19, x0, #0x1 +; NOPAIR-NEXT: tbz w19, #0, .LBB0_2 +; NOPAIR-NEXT: // %bb.1: +; NOPAIR-NEXT: smstop sm +; NOPAIR-NEXT: .LBB0_2: +; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; NOPAIR-NEXT: bl my_func2 +; NOPAIR-NEXT: tbz w19, #0, .LBB0_4 +; NOPAIR-NEXT: // %bb.3: +; NOPAIR-NEXT: smstart sm +; NOPAIR-NEXT: .LBB0_4: +; NOPAIR-NEXT: addvl sp, sp, #1 +; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; 
NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #18 +; NOPAIR-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload +; NOPAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: fbyte: +; PAIR: // %bb.0: +; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; PAIR-NEXT: cntd x9 +; PAIR-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-18 +; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte 
Folded Spill +; PAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-1 +; PAIR-NEXT: str z0, [sp] // 16-byte Folded Spill +; PAIR-NEXT: bl __arm_sme_state +; PAIR-NEXT: and x19, x0, #0x1 +; PAIR-NEXT: tbz w19, #0, .LBB0_2 +; PAIR-NEXT: // %bb.1: +; PAIR-NEXT: smstop sm +; PAIR-NEXT: .LBB0_2: +; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; PAIR-NEXT: bl my_func2 +; PAIR-NEXT: tbz w19, #0, .LBB0_4 +; PAIR-NEXT: // %bb.3: +; PAIR-NEXT: smstart sm +; PAIR-NEXT: .LBB0_4: +; PAIR-NEXT: addvl sp, sp, #1 +; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z13, 
[sp, #12, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #18 +; PAIR-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload +; PAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void @my_func2( %v) + ret void +} + +define void @fhalf( %v) #1{ +; NOPAIR-LABEL: fhalf: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; NOPAIR-NEXT: cntd x9 +; NOPAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-18 +; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: 
smstop sm +; NOPAIR-NEXT: bl my_func +; NOPAIR-NEXT: smstart sm +; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #18 +; NOPAIR-NEXT: ldp x29, x30, 
[sp], #32 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: fhalf: +; PAIR: // %bb.0: +; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; PAIR-NEXT: cntd x9 +; PAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-18 +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: smstop sm +; PAIR-NEXT: bl my_func +; PAIR-NEXT: smstart sm +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; 
PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #18 +; PAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void @my_func() + ret void +} + +define void @ffloat( %v) #2 { +; NOPAIR-LABEL: ffloat: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; NOPAIR-NEXT: rdsvl x9, #1 +; NOPAIR-NEXT: lsr x9, x9, #3 +; NOPAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; NOPAIR-NEXT: cntd x9 +; NOPAIR-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; NOPAIR-NEXT: addsvl sp, sp, #-18 +; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, 
#16, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: smstart sm +; NOPAIR-NEXT: smstop sm +; NOPAIR-NEXT: bl my_func +; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; 
NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addsvl sp, sp, #18 +; NOPAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: ffloat: +; PAIR: // %bb.0: +; PAIR-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; PAIR-NEXT: rdsvl x9, #1 +; PAIR-NEXT: lsr x9, x9, #3 +; PAIR-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; PAIR-NEXT: cntd x9 +; PAIR-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; PAIR-NEXT: addsvl sp, sp, #-18 +; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z12, 
[sp, #13, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: smstart sm +; PAIR-NEXT: smstop sm +; PAIR-NEXT: bl my_func +; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; 
PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addsvl sp, sp, #18 +; PAIR-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void @my_func() + ret void +} + + + +attributes #0 = { nounwind "aarch64_pstate_sm_compatible" } +attributes #1 = { nounwind "aarch64_pstate_sm_enabled" } +attributes #2 = { nounwind "aarch64_pstate_sm_body" } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll new file mode 100644 index 0000000000000..778f31194baf4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define {, , , } @test_luti4_zt_i8( %v0, %v1) #0 { +; CHECK-LABEL: test_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + ret {, , , } %res +} + +attributes #0 = { "target-features"="+sme2,+sme-lutv2"} diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll index c9d216935edbf..30a8396d85ab7 100644 --- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -1,13 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-disable-multivector-spill-fill -verify-machineinstrs -force-streaming < 
%s | FileCheck %s --check-prefixes=NOPAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-disable-multivector-spill-fill -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -force-streaming < %s | FileCheck %s --check-prefixes=NOPAIR ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s --check-prefixes=PAIR ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR - +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -verify-machineinstrs -force-streaming < %s | FileCheck %s --check-prefixes=PAIR declare void @my_func() -define void @fbyte( %v) { +define void @fbyte( %v){ ; NOPAIR-LABEL: fbyte: ; NOPAIR: // %bb.0: ; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -558,3 +561,4 @@ define aarch64_sve_vector_pcs void @test_clobbers_p_reg_negative() { call void asm sideeffect "", "~{p10}"() ret void } + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 50d40368dd107..f2ff022308cc6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1510,4 +1510,243 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8 +; GFX9-NEXT: scratch_load_dword v2, off, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8 +; GFX10-NEXT: scratch_load_dword v2, off, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8 +; GFX940-NEXT: scratch_load_dword v2, off, s0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8 +; GFX11-NEXT: scratch_load_b32 v2, off, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: 
s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_and_b32 s0, s2, -4 +; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; GFX9-NEXT: scratch_load_dword v2, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset_split: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_and_b32 s0, s2, -4 +; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset_split: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_and_b32 s0, s0, -4 +; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset_split: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_and_b32 s0, s0, -4 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset_split: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_and_b32 s0, s0, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8 +; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5) + %sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32 + %sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292 + %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5) + %split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728 + %load = load volatile i32, ptr addrspace(5) %split_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) { +; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; 
GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX940: ; %bb.0: ; %bb +; GFX940-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8 +; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm +bb: + %add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, 65512 + %gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2 + store volatile i32 15, ptr addrspace(5) %gep, align 4 + ret void +} + +define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) { +; GFX9-LABEL: sgpr_base_negative_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; 
GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8 +; GFX9-NEXT: scratch_load_dword v2, off, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_negative_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_negative_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8 +; GFX940-NEXT: scratch_load_dword v2, off, s0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_negative_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_negative_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24 + %0 = load i32, ptr addrspace(5) %scevgep28, align 4 + store i32 %0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index de973481f8230..e9e7360733581 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -12,97 +12,90 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; CHECK-NEXT: v_mov_b32_e32 v8, v0 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v13, v3 -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v9, v7 -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v2, v15 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v4, v13 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v6, v11 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v14, v1 +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v12, v3 +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v11, v4 +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v10, v5 +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v9, v6 +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v16, s4, 0 +; CHECK-NEXT: v_writelane_b32 v16, s5, 1 +; CHECK-NEXT: v_writelane_b32 v16, s6, 2 +; CHECK-NEXT: v_writelane_b32 v16, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, s5 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 
s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 +; CHECK-NEXT: v_writelane_b32 v16, s4, 4 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 
4-byte Folded Reload ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s12, v8 -; CHECK-NEXT: v_readfirstlane_b32 s10, v7 -; CHECK-NEXT: v_readfirstlane_b32 s9, v6 -; CHECK-NEXT: v_readfirstlane_b32 s8, v5 -; CHECK-NEXT: v_readfirstlane_b32 s7, v4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v3 -; CHECK-NEXT: v_readfirstlane_b32 s5, v2 -; CHECK-NEXT: 
v_readfirstlane_b32 s4, v1 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_readfirstlane_b32 s12, v7 +; CHECK-NEXT: v_readfirstlane_b32 s10, v6 +; CHECK-NEXT: v_readfirstlane_b32 s9, v5 +; CHECK-NEXT: v_readfirstlane_b32 s8, v4 +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: v_readfirstlane_b32 s6, v2 +; CHECK-NEXT: v_readfirstlane_b32 s5, v1 +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -111,59 +104,59 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_writelane_b32 v16, s12, 5 +; CHECK-NEXT: v_writelane_b32 v16, s13, 6 +; CHECK-NEXT: v_writelane_b32 v16, s14, 7 +; CHECK-NEXT: v_writelane_b32 v16, s15, 8 +; CHECK-NEXT: v_writelane_b32 v16, s16, 9 +; CHECK-NEXT: v_writelane_b32 v16, s17, 10 +; CHECK-NEXT: v_writelane_b32 v16, s18, 11 +; CHECK-NEXT: v_writelane_b32 v16, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v8 ; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v8, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v10 ; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v12 ; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v14 ; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; 
CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: v_writelane_b32 v16, s4, 13 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 -; CHECK-NEXT: v_readlane_b32 s16, v2, 0 -; CHECK-NEXT: v_readlane_b32 s17, v2, 1 -; CHECK-NEXT: v_readlane_b32 s18, v2, 2 -; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 
; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v16, 13 +; CHECK-NEXT: v_readlane_b32 s8, v16, 5 +; CHECK-NEXT: v_readlane_b32 s9, v16, 6 +; CHECK-NEXT: v_readlane_b32 s10, v16, 7 +; CHECK-NEXT: v_readlane_b32 s11, v16, 8 +; CHECK-NEXT: v_readlane_b32 s12, v16, 9 +; CHECK-NEXT: v_readlane_b32 s13, v16, 10 +; CHECK-NEXT: v_readlane_b32 s14, v16, 11 +; CHECK-NEXT: v_readlane_b32 s15, v16, 12 +; CHECK-NEXT: v_readlane_b32 s16, v16, 0 +; CHECK-NEXT: v_readlane_b32 s17, v16, 1 +; CHECK-NEXT: v_readlane_b32 s18, v16, 2 +; CHECK-NEXT: v_readlane_b32 s19, v16, 3 ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -171,24 +164,19 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 +; CHECK-NEXT: v_readlane_b32 s4, v16, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: ; kill: killed $vgpr4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:84 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir index 9794130d2b000..c91b686697b9d 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir @@ -20,7 +20,7 @@ body: | ; GFX908-LABEL: name: agpr32_restore_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -36,7 +36,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -514,7 +514,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr32_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -531,7 +531,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ 
$}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -1038,7 +1038,7 @@ body: | ; GFX908-LABEL: name: agpr64_restore_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -1056,7 +1056,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -1535,7 +1535,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr64_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -1554,7 +1554,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -2061,7 +2061,7 @@ body: | ; GFX908-LABEL: name: agpr96_restore_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -2081,7 +2081,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -2561,7 +2561,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr96_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -2582,7 +2582,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -3089,7 +3089,7 @@ body: | ; GFX908-LABEL: name: agpr32_save_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ 
-3105,7 +3105,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -3583,7 +3583,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr32_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -3600,7 +3600,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -4106,7 +4106,7 @@ body: | ; GFX908-LABEL: name: agpr64_save_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -4124,7 +4124,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -4603,7 +4603,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr64_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -4622,7 +4622,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -5127,7 +5127,7 @@ body: | ; GFX908-LABEL: name: agpr96_save_clobber_scc ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -5147,7 +5147,7 @@ body: | ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -5627,7 +5627,7 @@ body: | ; GFX908-FLATSCR-LABEL: name: agpr96_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -5648,7 +5648,7 @@ body: | ; GFX908-FLATSCR-NEXT: S_NOP 0 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: bb.2: - ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index 8d87b53efb4e7..0e16ea10c019a 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -154,25 +154,28 @@ bb: declare void @undef_func() ; GCN-LABEL: {{^}}kernel_call_undef_func: -; GFX908: .amdhsa_next_free_vgpr 32 -; GFX90A: .amdhsa_next_free_vgpr 64 -; GFX90A: .amdhsa_accum_offset 32 -; GCN: NumVgprs: 32 -; GCN: NumAgprs: 32 -; GFX908: TotalNumVgprs: 32 -; GFX90A: TotalNumVgprs: 64 -; GFX908: VGPRBlocks: 7 -; GFX90A: VGPRBlocks: 7 -; GFX908: NumVGPRsForWavesPerEU: 32 -; GFX90A: NumVGPRsForWavesPerEU: 64 -; GFX90A: AccumOffset: 32 -; GFX908: 
Occupancy: 8 -; GFX90A: Occupancy: 8 -; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7 +; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0) +; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 +; GCN: .set kernel_call_undef_func.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: NumVgprs: kernel_call_undef_func.num_vgpr +; GCN: NumAgprs: kernel_call_undef_func.num_agpr +; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr) +; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1 +; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1 +; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0) +; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4 +; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() ret void } +; GCN: .set amdgpu.max_num_vgpr, 32 
+; GCN-NEXT: .set amdgpu.max_num_agpr, 32 +; GCN-NEXT: .set amdgpu.max_num_sgpr, 34 + attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir index 80923dfc6f522..3c3c9839755a2 100644 --- a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir +++ b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,2 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s --- # GCN-LABEL: name: alloc_vgpr_64 diff --git a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir index 0f0cd0e8171d1..c42b570b40812 100644 --- a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -start-before=greedy,0 -stop-after=virtregrewriter,2 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s # Using the unaligned vector tuples are OK as long as they aren't used # in a real instruction. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index c8ba6722d9d85..122fc42ef9b62 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -11,7 +11,7 @@ ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: ; ASM: .amdhsa_user_sgpr_count 10 ; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; NumSgprs: 16 +; ASM: ; TotalNumSgprs: 16 ; ASM: ; NumSGPRsForWavesPerEU: 16 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT @@ -31,7 +31,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2: ; ASM: .amdhsa_user_sgpr_count 10 ; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; NumSgprs: 16 +; ASM: ; TotalNumSgprs: 16 ; ASM: ; NumSGPRsForWavesPerEU: 16 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2 @@ -47,7 +47,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2: ; ASM: .amdhsa_user_sgpr_count 3 ; ASM: .amdhsa_next_free_sgpr 3 -; ASM: ; NumSgprs: 9 +; ASM: ; TotalNumSgprs: 9 ; ASM: ; NumSGPRsForWavesPerEU: 9 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD. @@ -62,7 +62,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2: ; ASM: .amdhsa_user_sgpr_count 2 ; ASM: .amdhsa_next_free_sgpr 0 -; ASM: ; NumSgprs: 6 +; ASM: ; TotalNumSgprs: 6 ; ASM: ; NumSGPRsForWavesPerEU: 6 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll index 99a7ae37e0e78..8f4cb364751d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll @@ -60,7 +60,9 @@ bb: declare void @undef_func() ; CHECK: .type kernel_call_undef_func -; CHECK: NumAgprs: 32 +; CHECK: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr) +; CHECK: NumAgprs: kernel_call_undef_func.num_agpr +; CHECK: .set amdgpu.max_num_agpr, 32 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll index e4d427a0b826f..d45e116beb4e3 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 { attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" } ; GCN-LABEL: {{^}}f512: -; GFX9: NumVgprs: 128 -; GFX90A: NumVgprs: 128 -; GFX90A: NumAgprs: 128 -; GFX90A: TotalNumVgprs: 256 -; GFX10WGP-WAVE32: NumVgprs: 256 -; GFX10WGP-WAVE64: NumVgprs: 256 -; GFX10CU-WAVE32: NumVgprs: 128 -; GFX10CU-WAVE64: NumVgprs: 128 -; GFX11WGP-WAVE32: NumVgprs: 256 -; GFX11WGP-WAVE64: NumVgprs: 256 -; GFX11CU-WAVE32: NumVgprs: 192 -; GFX11CU-WAVE64: NumVgprs: 192 +; GFX9: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX90A: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX90A: .set f512.num_agpr, max(128, amdgpu.max_num_agpr) +; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10CU-WAVE64: .set 
f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GCN: NumVgprs: f512.num_vgpr +; GFX90A: NumAgprs: f512.num_agpr +; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr) define amdgpu_kernel void @f512() #512 { call void @foo() call void @use256vgprs() @@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 { attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" } ; GCN-LABEL: {{^}}f1024: -; GFX9: NumVgprs: 64 -; GFX90A: NumAgprs: 64 -; GFX90A: TotalNumVgprs: 128 -; GFX10WGP-WAVE32: NumVgprs: 128 -; GFX10WGP-WAVE64: NumVgprs: 128 -; GFX10CU-WAVE32: NumVgprs: 64 -; GFX10CU-WAVE64: NumVgprs: 64 -; GFX11WGP-WAVE32: NumVgprs: 192 -; GFX11WGP-WAVE64: NumVgprs: 192 -; GFX11CU-WAVE32: NumVgprs: 96 -; GFX11CU-WAVE64: NumVgprs: 96 +; GFX9: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX90A: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX90A: .set f1024.num_agpr, max(64, amdgpu.max_num_agpr) +; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr) +; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr) +; GCN: NumVgprs: f1024.num_vgpr +; GFX90A: NumAgprs: f1024.num_agpr +; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr) define amdgpu_kernel void @f1024() #1024 { call void @foo() call void 
@use256vgprs() diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll index 3ed2cb856eaea..2b98f61748066 100644 --- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,1 -o - %s | FileCheck -check-prefix=REGALLOC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,2 -o - %s | FileCheck -check-prefix=REGALLOC %s ; Test to check if the bb prolog spills are inserted correctly during regalloc. define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { @@ -8,22 +8,20 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; REGALLOC-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; REGALLOC-NEXT: {{ $}} - ; REGALLOC-NEXT: renamable $vgpr3 = IMPLICIT_DEF ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; REGALLOC-NEXT: renamable $vgpr1 = COPY $vgpr0 - ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49 - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr1, killed $sgpr4, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec ; REGALLOC-NEXT: renamable $sgpr6 = IMPLICIT_DEF - ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr6 - ; REGALLOC-NEXT: 
SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; REGALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr6 + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, killed renamable $sgpr6_sgpr7, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 0, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 1, $vgpr0, implicit killed $sgpr6_sgpr7 - ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; REGALLOC-NEXT: $vgpr63 = IMPLICIT_DEF + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 0, $vgpr63, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 1, $vgpr63, implicit killed $sgpr6_sgpr7 + ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; REGALLOC-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5 ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; REGALLOC-NEXT: S_BRANCH %bb.3 @@ -31,16 +29,16 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: bb.1.Flow: ; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; REGALLOC-NEXT: {{ $}} - ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, 
implicit-def $sgpr4_sgpr5 - ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 + ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1 ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr5, 3, $vgpr0, implicit $sgpr4_sgpr5 - ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr63, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr5, 3, $vgpr63, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; REGALLOC-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; 
REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; REGALLOC-NEXT: S_BRANCH %bb.2 @@ -64,13 +62,12 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: S_BRANCH %bb.1 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.4.bb.3: - ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 - ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 - ; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 3 + ; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec - ; REGALLOC-NEXT: KILL killed renamable $vgpr1 ; REGALLOC-NEXT: SI_RETURN implicit killed $vgpr0 bb.0: %cmp = icmp slt i32 %arg0, 50 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index adfc177c8bf74..0047b6b0ee934 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -971,12 +971,12 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v1, s98, 3 ; CHECK-NEXT: v_writelane_b32 v0, s92, 61 ; CHECK-NEXT: v_writelane_b32 v1, s99, 4 +; CHECK-NEXT: s_mov_b32 s49, s12 ; CHECK-NEXT: v_writelane_b32 v0, s93, 62 ; CHECK-NEXT: v_writelane_b32 v1, s100, 5 -; CHECK-NEXT: s_mov_b32 s49, s12 +; 
CHECK-NEXT: s_cmp_eq_u32 s49, 0 ; CHECK-NEXT: v_writelane_b32 v0, s94, 63 ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 -; CHECK-NEXT: s_cmp_eq_u32 s49, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index a795e99560341..e8898d6a7001c 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s ; CallGraphAnalysis, which CodeGenSCC order depends on, does not look @@ -8,12 +8,13 @@ @alias = hidden alias void (), ptr @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 32 -; GFX908-NEXT: .amdhsa_next_free_sgpr 33 +; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0) +; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)) +; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 -; GFX90A: .amdhsa_next_free_vgpr 59 -; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 32 +; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr) +; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr) +; ALL-NEXT: .set kernel.numbered_sgpr, max(33, aliasee_default.numbered_sgpr) define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 @@ -25,6 +26,9 @@ bb: call void asm sideeffect "; clobber a26 ", "~{a26}"() ret void } +; ALL: .set 
aliasee_default.num_vgpr, 0 +; ALL-NEXT: .set aliasee_default.num_agpr, 27 +; ALL-NEXT: .set aliasee_default.numbered_sgpr, 32 attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll index c976cc3d53b5e..a01268625cedb 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -7,14 +7,18 @@ @alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102 ; CHECK-LABEL: {{^}}kernel0: -; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel0.numbered_sgpr, max(33, aliasee_default_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel0() #0 { bb: call void @alias0() #2 ret void } +; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53 +; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.numbered_sgpr, 32 define internal void @aliasee_default_vgpr64_sgpr102() #1 { bb: call void asm sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index edef71ef143df..86defe3ba7ec0 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,8 +9,12 @@ ; The parent kernel has a higher VGPR usage than the possible callees. 
; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 41 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)) + +; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr) +; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr) +; CHECK-NEXT: .set kernel1.numbered_sgpr, max(33, aliasee_vgpr32_sgpr76.numbered_sgpr) define amdgpu_kernel void @kernel1() #0 { bb: call void asm sideeffect "; clobber v40 ", "~{v40}"() @@ -18,6 +22,9 @@ bb: ret void } +; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27 +; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.numbered_sgpr, 32 define internal void @aliasee_vgpr32_sgpr76() #1 { bb: call void asm sideeffect "; clobber v26 ", "~{v26}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index bb34ef1a15d2b..6b1fbd9b6e16a 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -7,14 +7,21 @@ @alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102 ; CHECK-LABEL: {{^}}kernel2: -; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)) + +; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel2.numbered_sgpr, 
max(33, aliasee_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel2() #0 { bb: call void @alias2() #2 ret void } +; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53 +; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.numbered_sgpr, 32 define internal void @aliasee_vgpr64_sgpr102() #1 { bb: call void asm sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index 8a88eb7e51ad7..c81181cd82667 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -7,14 +7,21 @@ @alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102 ; CHECK-LABEL: {{^}}kernel3: -; CHECK: .amdhsa_next_free_vgpr 253 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)) + +; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel3.numbered_sgpr, max(33, aliasee_vgpr256_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel3() #0 { bb: call void @alias3() #2 ret void } +; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253 +; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.numbered_sgpr, 33 define internal void @aliasee_vgpr256_sgpr102() #1 { bb: call void asm sideeffect "; clobber v252 ", "~{v252}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 6af45035d394f..dbd00f09943c0 100644 --- 
a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -7,7 +7,7 @@ ; Make sure to run a GPU with the SGPR allocation bug. ; GCN-LABEL: {{^}}use_vcc: -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 0 define void @use_vcc() #1 { call void asm sideeffect "", "~{vcc}" () #0 @@ -25,7 +25,7 @@ define void @use_vcc() #1 { ; GCN: v_readlane_b32 s4, v40, 2 ; GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] -; GCN: ; NumSgprs: 36 +; GCN: ; TotalNumSgprs: 36 ; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() @@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: -; CI: ; NumSgprs: 38 -; VI-NOBUG: ; NumSgprs: 40 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 38 +; VI-NOBUG: ; TotalNumSgprs: 40 +; VI-BUG: ; TotalNumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() @@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) } ; GCN-LABEL: {{^}}use_flat_scratch: -; CI: ; NumSgprs: 36 -; VI: ; NumSgprs: 38 +; CI: ; TotalNumSgprs: 36 +; VI: ; TotalNumSgprs: 38 ; GCN: ; NumVgprs: 0 define void @use_flat_scratch() #1 { call void asm sideeffect "", "~{flat_scratch}" () #0 @@ -52,8 +52,8 @@ define void @use_flat_scratch() #1 { } ; GCN-LABEL: {{^}}indirect_use_flat_scratch: -; CI: ; NumSgprs: 38 -; VI: ; NumSgprs: 40 +; CI: ; TotalNumSgprs: 38 +; VI: ; TotalNumSgprs: 40 ; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() @@ -61,9 +61,9 @@ define void @indirect_use_flat_scratch() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: -; CI: ; NumSgprs: 38 -; VI-NOBUG: ; NumSgprs: 40 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 38 +; VI-NOBUG: ; TotalNumSgprs: 40 +; VI-BUG: ; TotalNumSgprs: 96 ; GCN: ; NumVgprs: 41 define 
amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_flat_scratch() @@ -107,23 +107,23 @@ define void @indirect_use_50_vgpr() #0 { } ; GCN-LABEL: {{^}}use_80_sgpr: -; GCN: ; NumSgprs: 80 +; GCN: ; TotalNumSgprs: 80 define void @use_80_sgpr() #1 { call void asm sideeffect "", "~{s79}"() #0 ret void } ; GCN-LABEL: {{^}}indirect_use_80_sgpr: -; GCN: ; NumSgprs: 82 +; GCN: ; TotalNumSgprs: 82 define void @indirect_use_80_sgpr() #1 { call void @use_80_sgpr() ret void } ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: -; CI: ; NumSgprs: 84 -; VI-NOBUG: ; NumSgprs: 86 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 84 +; VI-NOBUG: ; TotalNumSgprs: 86 +; VI-BUG: ; TotalNumSgprs: 96 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void @@ -176,7 +176,7 @@ define amdgpu_kernel void @multi_call_use_use_stack() #0 { declare void @external() #0 ; GCN-LABEL: {{^}}usage_external: -; NumSgprs: 48 +; TotalNumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 ; @@ -190,7 +190,7 @@ define amdgpu_kernel void @usage_external() #0 { declare void @external_recurse() #2 ; GCN-LABEL: {{^}}usage_external_recurse: -; NumSgprs: 48 +; TotalNumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 ; @@ -234,10 +234,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { ; Make sure there's no assert when a sgpr96 is used. 
; GCN-LABEL: {{^}}count_use_sgpr96_external_call ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr define amdgpu_kernel void @count_use_sgpr96_external_call() { entry: tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 @@ -248,10 +249,11 @@ entry: ; Make sure there's no assert when a sgpr160 is used. ; GCN-LABEL: {{^}}count_use_sgpr160_external_call ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr define amdgpu_kernel void @count_use_sgpr160_external_call() { entry: tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 @@ -262,10 +264,11 @@ entry: ; Make sure there's no assert when a vgpr160 is used. 
; GCN-LABEL: {{^}}count_use_vgpr160_external_call ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, amdgpu.max_num_vgpr) +; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr define amdgpu_kernel void @count_use_vgpr160_external_call() { entry: tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 @@ -273,6 +276,27 @@ entry: ret void } +; GCN: .set amdgpu.max_num_vgpr, 50 +; GCN: .set amdgpu.max_num_agpr, 0 +; GCN: .set amdgpu.max_num_sgpr, 80 + +; GCN-LABEL: amdhsa.kernels: +; GCN: .name: count_use_sgpr96_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 +; GCN: .name: count_use_sgpr160_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 +; GCN: .name: count_use_vgpr160_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 + attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index b46cdb8ab3ba0..3e25904aa044d 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -36,66 +36,56 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[2:3], 0xa ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec ; GCN_DBG-NEXT: s_mov_b64 exec, -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: 
buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm entry: %cmp = icmp eq i32 %n, -1 @@ -144,53 +134,48 @@ define amdgpu_kernel void 
@loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; 
GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 @@ -232,53 +217,48 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; 
GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz 
.LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 @@ -321,51 +301,46 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, 
s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 @@ -422,66 +397,61 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: ds_read_u8 v1, v1 +; GCN_DBG-NEXT: ds_read_u8 v0, v0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1 +; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 ; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_xor_b64 
s[0:1], s[0:1], s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 -; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 +; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; 
GCN_DBG-NEXT: s_mov_b32 s4, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index 643f2619840a2..ede57f1a0a04c 100644 --- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -33,6 +33,7 @@ bb2: ; GCN-LABEL: {{^}}preserve_condition_undef_flag: ; GCN-NOT: vcc +; GCN: s_endpgm define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 9d93609b1e881..f198833059572 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s -; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 < %s | FileCheck %s -; CHECK: function must have been generated already +; CHECK-NOT: func define internal i32 @func() { ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 67a084068941a..7cec15ea5be87 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -48,72 +48,67 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte 
Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: 
v_writelane_b32 v4, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec @@ -130,26 +125,25 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB0_3: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 5 ; 
GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -222,72 +216,67 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: 
buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: 
v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec @@ -305,27 +294,27 @@ define 
amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: .LBB1_3: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s2, v4, 4 +; GCN-O0-NEXT: v_readlane_b32 s3, v4, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec @@ -340,14 +329,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; 
GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -433,19 +418,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v4, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v4, s3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -453,42 +433,43 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed 
$sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; 
GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v4, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v4, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 @@ -496,31 +477,30 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_2: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: v_writelane_b32 v4, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v4, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; 
GCN-O0-NEXT: s_cbranch_execz .LBB2_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec @@ -536,16 +516,15 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB2_5 ; GCN-O0-NEXT: .LBB2_4: ; %bb.else +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; 
GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec @@ -562,26 +541,25 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -681,51 +659,46 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 
s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0 ; GCN-O0-NEXT: s_mov_b32 s1, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v1 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 ; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v4 -; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] -; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3] +; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v5, v1 +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; 
GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-O0-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v6, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 @@ -733,28 +706,28 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v6, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v6, 1 ; 
GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v6, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -763,23 +736,24 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; 
GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v6, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v6, s1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 @@ -789,19 +763,18 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 ; GCN-O0-NEXT: s_branch .LBB3_7 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: 
s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -810,22 +783,23 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s5, s0 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v6, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v6, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded 
Reload ; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 @@ -835,43 +809,41 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 ; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v6, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v6, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_1 ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v6, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v6, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: 
buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v6, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v6, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -935,44 +907,39 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v3, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v3, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], 
v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v3, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v3, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 ; GCN-O0-NEXT: ; %bb.1: ; %bb.then +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v3, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v3, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -983,14 +950,13 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: .LBB4_2: ; %bb.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; 
GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v3, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v3, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_barrier -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1082,91 +1048,84 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0: ; %bb.0: ; %bb ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GCN-O0-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 0 +; GCN-O0-NEXT: 
v_writelane_b32 v6, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s8, v6, 2 +; GCN-O0-NEXT: v_readlane_b32 s9, v6, 3 +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 0 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 5 ; GCN-O0-NEXT: s_mov_b32 s4, 0x207 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4 +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4 ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 7 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 1 
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 8 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: v_mov_b32_e32 
v3, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s11 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 10 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 11 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1175,31 +1134,31 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-O0-NEXT: 
v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4 +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s11 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 13 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 12 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 13 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1217,7 +1176,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; 
GCN-O0-NEXT: s_mov_b32 s5, s10 ; GCN-O0-NEXT: s_mov_b32 s6, s9 ; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 @@ -1229,69 +1188,64 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 11 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 13 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 
4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(3) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9 +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 8 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 9 ; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 15 ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 16 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 17 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1300,103 +1254,99 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 
14 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 15 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 -; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15 -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_waitcnt expcnt(4) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s8, v6, 16 +; GCN-O0-NEXT: v_readlane_b32 s9, v6, 17 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 15 ; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; GCN-O0-NEXT: s_or_b64 s[4:5], 
s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 ; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0 -; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v6, s8, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s9, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 3 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 18 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 19 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(3) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(4) +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 18 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 
19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt expcnt(3) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt expcnt(2) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v3 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: 
buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 789150f690d52..7c09fec908f93 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -46,6 +46,9 @@ ; VMEM: [[ENDIF]]: +; Restore val +; VGPR: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload + ; Reload and restore exec mask ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -58,7 +61,7 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] ; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload +; VMEM: 
buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] @@ -120,6 +123,7 @@ endif: ; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: +; VGPR: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -129,7 +133,8 @@ endif: ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload + +; VMEM: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] @@ -189,6 +194,7 @@ end: ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow +; VGPR: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -200,7 +206,7 @@ end: ; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 
0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Regular spill value restored after exec modification ; Followed by spill @@ -234,6 +240,7 @@ end: ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: +; VGPR: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] @@ -245,7 +252,7 @@ end: ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir index d5cdf584a75de..a14d515688a8b 100644 --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -14,10 +14,10 @@ body: | ; CHECK-LABEL: name: def_csr_sgpr ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr42, $sgpr43, $sgpr46, $sgpr47, $vgpr0 + ; CHECK-NEXT: liveins: $sgpr42, $sgpr43, $sgpr46, $sgpr47 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed 
$sgpr4_sgpr5 ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr42, 0, $vgpr0 ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr43, 1, $vgpr0 @@ -26,8 +26,6 @@ body: | ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr42 = S_MOV_B32 0 ; CHECK-NEXT: $sgpr43 = S_MOV_B32 1 ; CHECK-NEXT: $sgpr46_sgpr47 = S_MOV_B64 2 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index b541be9f5aa44..6686742e449f5 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -220,334 +220,327 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
v_mov_b32_e32 v20, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v19, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s10, 2 +; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, s10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v3, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v14, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v20, vcc +; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v20, v1, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v14, v1, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v18, vcc, s10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v8, v9, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v13, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, 
v7, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v7, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v14, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[21:22], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[20:21], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v15, v8, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v13, v8, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v14, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v12, v7, 
s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v15, v15, v20 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_xor_b32_e64 v14, v14, v19 +; GFX9-O0-NEXT: v_xor_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14] -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[12:13] +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_or_b32_e64 
v15, v13, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v8, v8, v9 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 ; GFX9-O0-NEXT: s_mov_b32 s12, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 
def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 -; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 ; GFX9-O0-NEXT: s_mov_b32 s18, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s18 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def 
$vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v5, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: 
v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 ; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[12:13], v11, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 
killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; 
GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -555,11 +548,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; 
GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -588,20 +581,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -648,13 +640,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; 
GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -663,9 +648,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -679,92 +670,87 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; 
GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -784,22 +770,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 @@ -815,66 +801,66 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 ; 
GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -912,52 +898,52 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], v12, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 ; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, 
v24 +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], v5, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s7 @@ -976,12 +962,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 @@ -993,7 +979,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -1006,10 +992,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -1037,201 +1024,194 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 
; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 
4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 -; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] -; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed 
$vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1444,258 +1424,252 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte 
Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; 
GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr3_vgpr4 killed $exec ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v12, v3, v8 -; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec -; 
GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v2, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v1, v2 -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v3, v8 -; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec -; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v2, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v12, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v10, v4 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[6:7], v2, v12, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v4, v10, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[6:7], v3, v10, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v11, v0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v9, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v9, v2 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[6:7], v0, v11 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[6:7], v1, v11, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v3, v9, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[6:7], v2, v9, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v14 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v5 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v9, v4 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v8, s[6:7], v8, v11 -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v5, v11, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v7, v9, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v4, s[6:7], v4, v9, s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12 -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12 -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v9, v10 -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10 -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v8, v7 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v5, v4 -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] -; 
GFX9-G-O0-NEXT: v_or_b32_e64 v9, v1, v6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v2, v3 -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v10, v4 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v10, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v8, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v8, v3 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v7, s[6:7], v7, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v4, s[6:7], v4, v10, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v6, v8, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[6:7], v3, v8, s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v12, v10, v11 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v10, v10, v11 +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v10, v8, v9 +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v4, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def 
$vgpr8_vgpr9 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], v[10:11] +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v1, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 -; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 -; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v8 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-G-O0-NEXT: v_add_u32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v4, v7 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v3, v3 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v3, v3, v6 +; 
GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], v[9:10] -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v2 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v1 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v4, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 -; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v5, v7 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 -; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 -; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v8 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[8:9] +; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v7, s[8:9], v4, v5 -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v3, v4 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v4, s[8:9], v4, v5, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[8:9], v3, v4, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 -; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f -; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v4, s6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v7, v9 -; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v8 -; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[7:8], v[9:10] -; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; 
GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v2, v4, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v3, v3, v7 ; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec 
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s4, 0 +; 
GFX9-G-O0-NEXT: v_writelane_b32 v34, s5, 1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1703,11 +1677,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_branch .LBB0_8 ; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 3 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -1736,24 +1710,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_5 ; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_9 ; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit @@ -1813,13 +1784,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_3 ; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload @@ -1828,13 +1792,17 @@ define i128 
@v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 @@ -1844,41 +1812,39 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_branch .LBB0_4 ; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 -; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 
; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 @@ -1897,9 +1863,9 @@ 
define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 ; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 -; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr23_vgpr24 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v25 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v26 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr22_vgpr23 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v25 ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 @@ -1911,47 +1877,44 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v15 ; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 ; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v24 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v25 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v25 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[27:28], v0, v[2:3] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[26:27], v0, v[2:3] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[14:15] ; GFX9-G-O0-NEXT: ; kill: def $vgpr14 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v23, v2, v3 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v22, v2, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v28, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v25, v33 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v26, v34 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v27 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v28 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v27 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v23 ; GFX9-G-O0-NEXT: v_or_b32_e64 v15, v1, v15 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v25 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v26 -; GFX9-G-O0-NEXT: v_or3_b32 v14, v14, v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v25 +; GFX9-G-O0-NEXT: v_or3_b32 v14, v14, v22, v23 ; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v15 ; GFX9-G-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v13, s[8:9], v13, v4 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9] ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9] @@ -1968,15 +1931,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_and_b32_e64 v14, v10, s8 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, s5 -; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v23, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, s4 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 ; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v10, v11 -; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v10, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v10, v21 ; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 -; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v20 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] @@ -1985,60 +1948,60 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[8:9], v11, v16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v17, s[8:9], v10, v11, s[8:9] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; 
GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v9, v10, s[8:9] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 -; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 -; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 -; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v18 +; GFX9-G-O0-NEXT: v_or_b32_e64 v16, v16, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v18, v17, v18 +; GFX9-G-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[18:19] ; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 3 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, 
s7, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 @@ -2072,87 +2035,88 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload 
-; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v17 -; GFX9-G-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v16 +; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v19, v4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v18, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v19 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v18 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v6 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v18, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v19, v[21:22] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v19, v[23:24] -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] -; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v26 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v18, v[20:21] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[25:26], v18, v[22:23] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v5, v[20:21] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v25 -; GFX9-G-O0-NEXT: v_or_b32_e64 v20, v20, v23 -; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v5, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v18, v5, v18 ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v19, s[4:5] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[6:7] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[20:21], v4, v[20:21] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v21 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v18, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v17, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v16, v5, v16, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v17, v17, v18, 
s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[4:5] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v17 ; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v15, s[4:5], v15, v16 +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s10 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] -; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s7 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v12, s[4:5], v12, v13, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 @@ -2180,165 +2144,157 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_6 ; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v3, v5 -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v6, v8, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v7, v8, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v7, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4 +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7] +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v7, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v1, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v7 -; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v9, s[6:7], v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v8, s[6:7], v1, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 -; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v15, v1, v9 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v13, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v8, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v14, v0, v8 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v9, v1 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v9, v[13:14] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v15, v[13:14] -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v9, v[11:12] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v8, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v0 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v8, v[12:13] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[17:18], v14, v[12:13] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[15:16], v8, v[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 -; GFX9-G-O0-NEXT: v_or_b32_e64 v12, v12, v15 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v9, v11 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[12:13], v2, v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] -; GFX9-G-O0-NEXT: ; 
kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 
4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 -; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 -; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17) +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-O0-NEXT: s_branch .LBB0_7 ; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v8 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-G-O0-NEXT: v_xor_b32_e64 
v2, v2, v6 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v5 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v8 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v7, s[4:5] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v6, s[4:5] -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] -; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v4 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v7 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v6, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v5, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v4, s[4:5] ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] @@ -2533,246 +2489,238 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; 
GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 
killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 ; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, 
v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 -; GFX9-O0-NEXT: v_min_u32_e64 
v6, v5, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX9-O0-NEXT: 
v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 
v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -2780,11 +2728,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -2813,20 +2761,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2873,13 +2820,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2888,9 +2828,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2904,92 +2850,87 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v23, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -3009,22 +2950,22 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 @@ -3040,66 +2981,66 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, 
v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-O0-NEXT: buffer_store_dword v16, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3137,52 +3078,52 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: 
buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], v12, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 ; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], v5, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def 
$vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s7 @@ -3201,12 +3142,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 @@ -3218,7 +3159,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3231,10 +3172,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 +; GFX9-O0-NEXT: 
v_writelane_b32 v30, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3262,165 +3204,158 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], 
s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 
killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def 
$vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; 
GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[7:8] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -3610,83 +3545,94 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte 
Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-G-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:48 ; 
4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v8, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v9, v10 +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v8, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v9, v10 +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 
v12, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v5, v4, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v6 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] @@ -3697,7 +3643,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 ; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 @@ -3708,130 +3653,106 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] -; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 -; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 -; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 -; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 -; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[8:9], v4, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[8:9], v4, v6, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] -; GFX9-G-O0-NEXT: v_mov_b32_e32 
v15, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 -; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-G-O0-NEXT: v_or_b32_e64 
v8, v7, v8 -; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v5, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[5:6], v[7:8] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def 
$vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-G-O0-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s5, 1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -3839,68 +3760,65 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_branch .LBB1_8 ; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 3 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, 
s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; 
GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_5 ; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_9 ; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword 
v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 @@ -3949,77 +3867,72 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_3 ; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; 
GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_4 ; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte 
Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[20:21], v2, v[0:1] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec @@ -4043,8 +3956,8 @@ define i128 @v_udiv_i128_vv(i128 
%lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v21 ; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 ; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 @@ -4052,7 +3965,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[22:23], v0, v[2:3] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] ; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec @@ -4064,22 +3977,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v28, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v23 ; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 ; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 -; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v21 ; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 ; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec @@ -4087,7 +3998,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] @@ -4109,18 +4019,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v24 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v26 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v23 ; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 ; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v21 ; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 -; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v20 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] ; GFX9-G-O0-NEXT: 
v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] @@ -4129,351 +4039,344 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[8:9], v11, v16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v17, s[8:9], v10, v11, s[8:9] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v9, v10, s[8:9] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 -; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 -; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 -; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 
v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v18 +; GFX9-G-O0-NEXT: v_or_b32_e64 v16, v16, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v18, v17, v18 +; GFX9-G-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[18:19] ; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 
0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 3 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], 
s32 offset:212 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-G-O0-NEXT: s_branch .LBB1_1 ; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: 
s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v12 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v12, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v12, v[20:21] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[25:26], v12, v[14:15] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v5, v[20:21] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 -; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 -; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v13, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v12, v5, v12 ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 -; 
GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[20:21], v4, v[20:21] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v21 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v13, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v13, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v12, v5, v12, s[6:7] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v18 
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v15, s[4:5], v15, v16 ; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s10 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s7 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v12, s[4:5], v12, v13, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, 
s6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte 
Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_6 ; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 -; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v1, v4 +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7] +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v0, v2, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v3, s[6:7], v0, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v3, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v8, v0, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] -; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] 
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v3, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v0 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v3, v[12:13] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[17:18], v8, v[12:13] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[15:16], v3, v[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 -; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v3, v3, v8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[12:13], v2, v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] -; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 
v10, v3 -; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 -; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 -; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 -; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 
; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17) +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-G-O0-NEXT: s_branch .LBB1_7 ; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 -; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32-wave32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32-wave32.mir index 18828bb461892..a680b63a34b9a 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32-wave32.mir @@ -16,10 +16,33 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW32-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0 - ; MUBUFW32: renamable $vgpr0, $vcc_lo = V_ADD_CO_U32_e64 12, $sgpr32, 0, implicit $exec + ; MUBUFW32: renamable $vgpr0, dead $vcc_lo = V_ADD_CO_U32_e64 12, $sgpr32, 0, implicit $exec ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr0 ; ; FLATSCRW32-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0 + ; FLATSCRW32: renamable $vgpr0, dead $vcc_lo = V_ADD_CO_U32_e64 12, $sgpr32, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr0 + renamable $vgpr0, dead $vcc_lo = V_ADD_CO_U32_e64 12, %stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $sgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 4, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW32-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc + ; MUBUFW32: renamable $vgpr0, $vcc_lo = V_ADD_CO_U32_e64 12, $sgpr32, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc ; FLATSCRW32: renamable $vgpr0, $vcc_lo = V_ADD_CO_U32_e64 12, $sgpr32, 0, implicit $exec ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr0 renamable $vgpr0, $vcc_lo = V_ADD_CO_U32_e64 12, %stack.0, 0, implicit $exec @@ -271,7 +294,7 @@ body: | liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; MUBUFW32-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; MUBUFW32: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUFW32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; MUBUFW32-NEXT: renamable $vgpr0, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec @@ -279,7 +302,7 @@ body: | ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW32-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; FLATSCRW32: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; FLATSCRW32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; FLATSCRW32-NEXT: {{ $}} ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; FLATSCRW32-NEXT: renamable $vgpr0, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 6ec296144bf19..fa442aa849d17 100644 --- 
a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -1192,7 +1192,7 @@ body: | liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1204,7 +1204,7 @@ body: | ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1216,7 +1216,7 @@ body: | ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1227,7 +1227,7 @@ body: | ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX90A-NEXT: 
{{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1238,7 +1238,7 @@ body: | ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1249,7 +1249,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX940: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX940-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc @@ -1260,7 +1260,7 @@ body: | ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX11: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX11: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec @@ -1268,7 +1268,7 @@ body: | ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX12: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX12: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX12-NEXT: 
{{ $}} ; GFX12-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec @@ -1296,7 +1296,7 @@ body: | liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1308,7 +1308,7 @@ body: | ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1320,7 +1320,7 @@ body: | ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1331,7 +1331,7 @@ body: | ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX90A-NEXT: 
{{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1342,7 +1342,7 @@ body: | ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1353,7 +1353,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required - ; FLATSCRW64: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; FLATSCRW64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc @@ -1384,7 +1384,7 @@ body: | liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1396,7 +1396,7 @@ body: | ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: 
v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1408,7 +1408,7 @@ body: | ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1419,7 +1419,7 @@ body: | ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1430,7 +1430,7 @@ body: | ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec @@ -1441,7 +1441,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: 
v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required - ; FLATSCRW64: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; FLATSCRW64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir index 3bf7e7b8c5696..2f43c8264bf90 100644 --- a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir @@ -25,13 +25,12 @@ body: | ; GCN-LABEL: name: test_single_block ; GCN: liveins: $sgpr4, $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr63 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr63 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, [[V_MOV_B32_e32_]], 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 @@ -63,32 +62,31 @@ body: | ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: 
%bb.3(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr6, 0, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr63 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr6, 0, killed $vgpr63 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr6, $sgpr10_sgpr11 @@ -135,52 +133,50 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: 
%bb.2(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr63 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr63 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, [[V_MOV_B32_e32_1]], implicit $exec ; GCN-NEXT: $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN-NEXT: S_CBRANCH_SCC1 %bb.5, implicit $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr6_sgpr7 ; GCN-NEXT: {{ $}} - ; 
GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; GCN-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 1, [[V_MOV_B32_e32_1]], implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[V_SUB_U32_e32_]], implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr6_sgpr7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr4, $sgpr10_sgpr11 @@ -239,26 +235,24 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr63 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr63 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, [[V_MOV_B32_e32_]], 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: liveins: $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: liveins: $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = 
V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr4, $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 431b7d5400f43..798cd6239d262 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -108,255 +108,114 @@ define amdgpu_kernel void @kernel_calls_no_stack() { } define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { -; FLAT_SCR_OPT-LABEL: test: -; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s0, 0 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s1, 1 -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 -; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr2_sgpr3 -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; 
FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: 
;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, vcc_lo -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: ;;#ASMSTART -; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 -; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] -; FLAT_SCR_OPT-NEXT: s_endpgm -; -; FLAT_SCR_ARCH-LABEL: test: -; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s0, 0 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s1, 1 -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill -; 
FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 -; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr2_sgpr3 -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: 
;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, vcc_lo -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART -; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: ; kill: 
killed $vgpr1 -; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1] -; FLAT_SCR_ARCH-NEXT: s_endpgm +; GCN-LABEL: test: +; GCN: ; %bb.0: +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; 
GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: global_store_dword v2, v1, s[0:1] +; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{s[0:7]}" () call void asm sideeffect "", "~{s[8:15]}" () call void asm sideeffect "", "~{s[16:23]}" () @@ -371,7 +230,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { call void asm sideeffect "", "~{s[88:95]}" () call void asm sideeffect "", "~{s[96:103]}" () call void asm sideeffect "", "~{s[104:105]}" () - call void asm sideeffect "", "~{v[0:7]}" () + call void asm sideeffect "", "~{v[1:7]}" () call void asm sideeffect "", "~{v[8:15]}" () call void asm sideeffect "", "~{v[16:23]}" () call void asm sideeffect "", "~{v[24:31]}" () diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index e4ffedd686ac9..02eb1ad945329 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -25,11 +25,11 @@ ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; 
NumSgprs: 8 +; CI: ; TotalNumSgprs: 8 +; VI-NOXNACK: ; TotalNumSgprs: 8 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -42,11 +42,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 10 +; VI-NOXNACK: ; TotalNumSgprs: 10 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -59,11 +59,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 8 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -76,11 +76,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -96,11 +96,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: 
.amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -113,11 +113,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -130,11 +130,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 9653f8fdacac6..ef9590b3fd33f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4891,5 +4891,449 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_i32 s2, s2, 0xffe8 +; GFX9-NEXT: scratch_load_dword v2, off, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_i32 s2, s2, 0xffe8 +; GFX10-NEXT: scratch_load_dword v2, off, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX11-NEXT: scratch_load_b32 v2, off, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: sgpr_base_large_offset: +; GFX9-PAL: ; %bb.0: ; %entry +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s8 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-PAL-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: 
+; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX940-NEXT: scratch_load_dword v2, off, s0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: sgpr_base_large_offset: +; GFX10-PAL: ; %bb.0: ; %entry +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s8 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-PAL-NEXT: s_endpgm +; +; GFX11-PAL-LABEL: sgpr_base_large_offset: +; GFX11-PAL: ; %bb.0: ; %entry +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0 +; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-PAL-NEXT: s_nop 0 +; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: sgpr_base_large_offset: +; GFX12-PAL: ; %bb.0: ; %entry +; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-PAL-NEXT: s_nop 0 +; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-PAL-NEXT: s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: 
sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_and_b32 s0, s2, -4 +; GFX9-NEXT: s_add_i32 s0, s0, 0x100f000 +; GFX9-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset_split: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_and_b32 s0, s2, -4 +; GFX10-NEXT: s_add_i32 s0, s0, 0x100f800 +; GFX10-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset_split: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v2, 0x100f000 +; GFX11-NEXT: s_and_b32 s0, s0, -4 +; GFX11-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset_split: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: v_mov_b32_e32 v2, 0x1000000 +; GFX12-NEXT: s_and_b32 s0, s0, -4 +; GFX12-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: sgpr_base_large_offset_split: +; GFX9-PAL: ; %bb.0: ; %entry +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s8 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 
s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, -4 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0x100f000 +; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-PAL-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset_split: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_and_b32 s0, s0, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x100f000 +; GFX940-NEXT: scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: sgpr_base_large_offset_split: +; GFX10-PAL: ; %bb.0: ; %entry +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s8 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_and_b32 s0, s0, -4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0x100f800 +; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-PAL-NEXT: s_endpgm +; +; GFX11-PAL-LABEL: sgpr_base_large_offset_split: +; GFX11-PAL: ; %bb.0: ; %entry +; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0x100f000 +; GFX11-PAL-NEXT: s_and_b32 s0, s0, -4 +; GFX11-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc +; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-PAL-NEXT: s_nop 0 +; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: 
sgpr_base_large_offset_split: +; GFX12-PAL: ; %bb.0: ; %entry +; GFX12-PAL-NEXT: v_mov_b32_e32 v2, 0x1000000 +; GFX12-PAL-NEXT: s_and_b32 s0, s0, -4 +; GFX12-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-PAL-NEXT: s_nop 0 +; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-PAL-NEXT: s_endpgm +entry: + ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5) + %sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32 + %sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292 + %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5) + %split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728 + %load = load volatile i32, ptr addrspace(5) %split_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) { +; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffe8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s8 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffe8, v0 +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX940: ; %bb.0: ; %bb +; GFX940-NEXT: s_add_i32 s0, s0, s1 +; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0xffe8, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s8 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; 
GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_endpgm +; +; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX11-PAL: ; %bb.0: ; %bb +; GFX11-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: s_endpgm +bb: + %add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, 65512 + %gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2 + store volatile i32 15, ptr addrspace(5) %gep, align 4 + ret void +} + +define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) { +; GFX9-LABEL: sgpr_base_negative_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_addk_i32 s2, 0xffe8 +; GFX9-NEXT: scratch_load_dword v2, off, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_negative_offset: +; GFX10: ; %bb.0: ; 
%entry +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_negative_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_negative_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: sgpr_base_negative_offset: +; GFX9-PAL: ; %bb.0: ; %entry +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s8 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_addk_i32 s0, 0xffe8 +; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-PAL-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_negative_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_addk_i32 s0, 0xffe8 +; GFX940-NEXT: scratch_load_dword v2, off, s0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: sgpr_base_negative_offset: +; GFX10-PAL: ; %bb.0: ; %entry +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: 
s_mov_b32 s2, s8 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 offset:-24 +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-PAL-NEXT: s_endpgm +; +; GFX11-PAL-LABEL: sgpr_base_negative_offset: +; GFX11-PAL: ; %bb.0: ; %entry +; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-PAL-NEXT: s_nop 0 +; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: sgpr_base_negative_offset: +; GFX12-PAL: ; %bb.0: ; %entry +; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-PAL-NEXT: s_nop 0 +; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-PAL-NEXT: s_endpgm +entry: + %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24 + %0 = load i32, ptr addrspace(5) %scevgep28, align 4 + store i32 %0, ptr addrspace(1) %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index ba619a659f1b0..5f36d5403ebcf 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,13 +12,13 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, 
implicit-def $exec_lo + ; CHECK: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ -37,13 +37,13 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi + ; CHECK: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -62,16 +62,16 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $exec + ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def 
$sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -93,12 +93,12 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def 
$exec_lo @@ -116,12 +116,12 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -139,15 +139,15 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def 
$sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir index 1c7896fcb4f14..1c2436bd6b6cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,13 +13,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $m0 + ; CHECK: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -43,12 +43,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: renamable $vgpr0 = 
V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir index f388aeb047029..0309a156171d7 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir @@ -166,7 +166,7 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $sgpr10 ; GCN-LABEL: name: func_add_constant_to_fi_uniform_live_SCC_i32 - ; GCN: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 $sgpr10, 4, implicit-def $scc ; GCN-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll new file mode 100644 index 0000000000000..9e3264eb9c07f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -0,0 +1,531 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s + +; Functions that don't make calls should have constants as its resource usage as no resource information has to be propagated. 
+ +; GCN-LABEL: {{^}}use_vcc: +; GCN: .set use_vcc.num_vgpr, 0 +; GCN: .set use_vcc.num_agpr, 0 +; GCN: .set use_vcc.numbered_sgpr, 32 +; GCN: .set use_vcc.private_seg_size, 0 +; GCN: .set use_vcc.uses_vcc, 1 +; GCN: .set use_vcc.uses_flat_scratch, 0 +; GCN: .set use_vcc.has_dyn_sized_stack, 0 +; GCN: .set use_vcc.has_recursion, 0 +; GCN: .set use_vcc.has_indirect_call, 0 +; GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 +define void @use_vcc() #1 { + call void asm sideeffect "", "~{vcc}" () #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_vcc: +; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr) +; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr) +; GCN: .set indirect_use_vcc.numbered_sgpr, max(34, use_vcc.numbered_sgpr) +; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size)) +; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc) +; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch) +; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack) +; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion) +; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_vcc() #1 { + call void @use_vcc() + ret void +} + +; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: +; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr) +; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr) +; GCN: .set indirect_2level_use_vcc_kernel.numbered_sgpr, max(33, indirect_use_vcc.numbered_sgpr) +; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size)) +; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc) +; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, 
indirect_use_vcc.uses_flat_scratch) +; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack) +; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion) +; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { + call void @indirect_use_vcc() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scratch: +; GCN: .set use_flat_scratch.num_vgpr, 0 +; GCN: .set use_flat_scratch.num_agpr, 0 +; GCN: .set use_flat_scratch.numbered_sgpr, 32 +; GCN: .set use_flat_scratch.private_seg_size, 0 +; GCN: .set use_flat_scratch.uses_vcc, 0 +; GCN: .set use_flat_scratch.uses_flat_scratch, 1 +; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0 +; GCN: .set use_flat_scratch.has_recursion, 0 +; GCN: .set use_flat_scratch.has_indirect_call, 0 +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 +define void @use_flat_scratch() #1 { + call void asm sideeffect "", "~{flat_scratch}" () #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_flat_scratch: +; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr) +; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr) +; GCN: .set indirect_use_flat_scratch.numbered_sgpr, max(34, use_flat_scratch.numbered_sgpr) +; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size)) +; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc) +; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch) +; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack) +; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion) +; GCN: .set 
indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_flat_scratch() #1 { + call void @use_flat_scratch() + ret void +} + +; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: +; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.numbered_sgpr, max(33, indirect_use_flat_scratch.numbered_sgpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size)) +; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc) +; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { + call void @indirect_use_flat_scratch() + ret void +} + +; GCN-LABEL: {{^}}use_10_vgpr: +; GCN: .set use_10_vgpr.num_vgpr, 10 +; GCN: .set use_10_vgpr.num_agpr, 0 +; GCN: .set use_10_vgpr.numbered_sgpr, 32 +; GCN: .set use_10_vgpr.private_seg_size, 0 +; GCN: .set use_10_vgpr.uses_vcc, 0 +; GCN: .set use_10_vgpr.uses_flat_scratch, 0 +; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0 +; GCN: .set use_10_vgpr.has_recursion, 0 +; GCN: .set use_10_vgpr.has_indirect_call, 0 +; 
GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 10 +; GCN: ScratchSize: 0 +define void @use_10_vgpr() #1 { + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0 + call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_10_vgpr: +; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr) +; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr) +; GCN: .set indirect_use_10_vgpr.numbered_sgpr, max(34, use_10_vgpr.numbered_sgpr) +; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size)) +; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc) +; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch) +; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion) +; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_10_vgpr() #0 { + call void @use_10_vgpr() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: +; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr) +; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr) +; GCN: .set indirect_2_level_use_10_vgpr.numbered_sgpr, max(33, indirect_use_10_vgpr.numbered_sgpr) +; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size)) +; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc) +; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch) +; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack) +; GCN: .set 
indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion) +; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { + call void @indirect_use_10_vgpr() + ret void +} + +; GCN-LABEL: {{^}}use_50_vgpr: +; GCN: .set use_50_vgpr.num_vgpr, 50 +; GCN: .set use_50_vgpr.num_agpr, 0 +; GCN: .set use_50_vgpr.numbered_sgpr, 32 +; GCN: .set use_50_vgpr.private_seg_size, 0 +; GCN: .set use_50_vgpr.uses_vcc, 0 +; GCN: .set use_50_vgpr.uses_flat_scratch, 0 +; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0 +; GCN: .set use_50_vgpr.has_recursion, 0 +; GCN: .set use_50_vgpr.has_indirect_call, 0 +; GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 50 +; GCN: ScratchSize: 0 +define void @use_50_vgpr() #1 { + call void asm sideeffect "", "~{v49}"() #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_50_vgpr: +; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr) +; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr) +; GCN: .set indirect_use_50_vgpr.numbered_sgpr, max(34, use_50_vgpr.numbered_sgpr) +; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size)) +; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc) +; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch) +; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion) +; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 50 +; GCN: ScratchSize: 16 +define void @indirect_use_50_vgpr() #0 { + call void @use_50_vgpr() + ret void +} + +; GCN-LABEL: {{^}}use_80_sgpr: +; GCN: .set use_80_sgpr.num_vgpr, 1 +; GCN: .set 
use_80_sgpr.num_agpr, 0 +; GCN: .set use_80_sgpr.numbered_sgpr, 80 +; GCN: .set use_80_sgpr.private_seg_size, 8 +; GCN: .set use_80_sgpr.uses_vcc, 0 +; GCN: .set use_80_sgpr.uses_flat_scratch, 0 +; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0 +; GCN: .set use_80_sgpr.has_recursion, 0 +; GCN: .set use_80_sgpr.has_indirect_call, 0 +; GCN: TotalNumSgprs: 84 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 8 +define void @use_80_sgpr() #1 { + call void asm sideeffect "", "~{s79}"() #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_80_sgpr: +; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr) +; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr) +; GCN: .set indirect_use_80_sgpr.numbered_sgpr, max(34, use_80_sgpr.numbered_sgpr) +; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size)) +; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc) +; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch) +; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion) +; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call) +; GCN: TotalNumSgprs: 84 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 24 +define void @indirect_use_80_sgpr() #1 { + call void @use_80_sgpr() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: +; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr) +; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr) +; GCN: .set indirect_2_level_use_80_sgpr.numbered_sgpr, max(33, indirect_use_80_sgpr.numbered_sgpr) +; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size)) +; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc) +; GCN: .set 
indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch) +; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack) +; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion) +; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call) +; GCN: TotalNumSgprs: 86 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 24 +define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { + call void @indirect_use_80_sgpr() + ret void +} + +; GCN-LABEL: {{^}}use_stack0: +; GCN: .set use_stack0.num_vgpr, 1 +; GCN: .set use_stack0.num_agpr, 0 +; GCN: .set use_stack0.numbered_sgpr, 33 +; GCN: .set use_stack0.private_seg_size, 2052 +; GCN: .set use_stack0.uses_vcc, 0 +; GCN: .set use_stack0.uses_flat_scratch, 0 +; GCN: .set use_stack0.has_dyn_sized_stack, 0 +; GCN: .set use_stack0.has_recursion, 0 +; GCN: .set use_stack0.has_indirect_call, 0 +; GCN: TotalNumSgprs: 37 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 2052 +define void @use_stack0() #1 { + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + ret void +} + +; GCN-LABEL: {{^}}use_stack1: +; GCN: .set use_stack1.num_vgpr, 1 +; GCN: .set use_stack1.num_agpr, 0 +; GCN: .set use_stack1.numbered_sgpr, 33 +; GCN: .set use_stack1.private_seg_size, 404 +; GCN: .set use_stack1.uses_vcc, 0 +; GCN: .set use_stack1.uses_flat_scratch, 0 +; GCN: .set use_stack1.has_dyn_sized_stack, 0 +; GCN: .set use_stack1.has_recursion, 0 +; GCN: .set use_stack1.has_indirect_call, 0 +; GCN: TotalNumSgprs: 37 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 404 +define void @use_stack1() #1 { + %alloca = alloca [100 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_stack: +; GCN: .set indirect_use_stack.num_vgpr, max(41, 
use_stack0.num_vgpr) +; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr) +; GCN: .set indirect_use_stack.numbered_sgpr, max(34, use_stack0.numbered_sgpr) +; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size)) +; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc) +; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch) +; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack) +; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion) +; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2132 +define void @indirect_use_stack() #1 { + %alloca = alloca [16 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + call void @use_stack0() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_stack: +; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr) +; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr) +; GCN: .set indirect_2_level_use_stack.numbered_sgpr, max(33, indirect_use_stack.numbered_sgpr) +; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size)) +; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc) +; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch) +; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack) +; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion) +; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2132 +define amdgpu_kernel void @indirect_2_level_use_stack() #0 
{ + call void @indirect_use_stack() + ret void +} + + +; Should be maximum of callee usage +; GCN-LABEL: {{^}}multi_call_use_use_stack: +; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr) +; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr) +; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(42, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) +; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) +; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc) +; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch) +; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack) +; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion) +; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call) +; GCN: TotalNumSgprs: 48 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2052 +define amdgpu_kernel void @multi_call_use_use_stack() #0 { + call void @use_stack0() + call void @use_stack1() + ret void +} + +declare void @external() #0 + +; GCN-LABEL: {{^}}multi_call_with_external: +; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr) +; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set multi_call_with_external.numbered_sgpr, max(42, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external.private_seg_size, 0 +; GCN: .set multi_call_with_external.uses_vcc, 1 +; GCN: .set multi_call_with_external.uses_flat_scratch, 1 +; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1 +; GCN: .set multi_call_with_external.has_recursion, 0 +; GCN: .set multi_call_with_external.has_indirect_call, 
1 +; GCN: TotalNumSgprs: multi_call_with_external.numbered_sgpr+6 +; GCN: NumVgprs: multi_call_with_external.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @multi_call_with_external() #0 { + call void @use_stack0() + call void @use_stack1() + call void @external() + ret void +} + +; GCN-LABEL: {{^}}usage_external: +; GCN: .set usage_external.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set usage_external.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set usage_external.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set usage_external.private_seg_size, 0 +; GCN: .set usage_external.uses_vcc, 1 +; GCN: .set usage_external.uses_flat_scratch, 1 +; GCN: .set usage_external.has_dyn_sized_stack, 1 +; GCN: .set usage_external.has_recursion, 0 +; GCN: .set usage_external.has_indirect_call, 1 +; GCN: TotalNumSgprs: usage_external.numbered_sgpr+6 +; GCN: NumVgprs: usage_external.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @usage_external() #0 { + call void @external() + ret void +} + +declare void @external_recurse() #2 + +; GCN-LABEL: {{^}}usage_external_recurse: +; GCN: .set usage_external_recurse.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set usage_external_recurse.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set usage_external_recurse.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set usage_external_recurse.private_seg_size, 0 +; GCN: .set usage_external_recurse.uses_vcc, 1 +; GCN: .set usage_external_recurse.uses_flat_scratch, 1 +; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1 +; GCN: .set usage_external_recurse.has_recursion, 1 +; GCN: .set usage_external_recurse.has_indirect_call, 1 +; GCN: TotalNumSgprs: usage_external_recurse.numbered_sgpr+6 +; GCN: NumVgprs: usage_external_recurse.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @usage_external_recurse() #0 { + call void @external_recurse() + ret void +} + +; GCN-LABEL: {{^}}direct_recursion_use_stack: +; GCN: .set 
direct_recursion_use_stack.num_vgpr, 41 +; GCN: .set direct_recursion_use_stack.num_agpr, 0 +; GCN: .set direct_recursion_use_stack.numbered_sgpr, 36 +; GCN: .set direct_recursion_use_stack.private_seg_size, 2064 +; GCN: .set direct_recursion_use_stack.uses_vcc, 1 +; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0 +; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0 +; GCN: .set direct_recursion_use_stack.has_recursion, 1 +; GCN: .set direct_recursion_use_stack.has_indirect_call, 0 +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2064 +define void @direct_recursion_use_stack(i32 %val) #2 { + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %ret, label %call + +call: + %val.sub1 = sub i32 %val, 1 + call void @direct_recursion_use_stack(i32 %val.sub1) + br label %ret + +ret: + ret void +} + +; GCN-LABEL: {{^}}usage_direct_recursion: +; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr) +; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr) +; GCN: .set usage_direct_recursion.numbered_sgpr, max(33, direct_recursion_use_stack.numbered_sgpr) +; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size)) +; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc) +; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch) +; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack) +; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion) +; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call) +; GCN: TotalNumSgprs: 42 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2064 +define 
amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { + call void @direct_recursion_use_stack(i32 %n) + ret void +} + +; Make sure there's no assert when a sgpr96 is used. +; GCN-LABEL: {{^}}count_use_sgpr96_external_call +; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0 +; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1 +; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_sgpr96_external_call.has_recursion, 0 +; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_sgpr96_external_call() { +entry: + tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 + call void @external() + ret void +} + +; Make sure there's no assert when a sgpr160 is used. 
+; GCN-LABEL: {{^}}count_use_sgpr160_external_call +; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0 +; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1 +; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_sgpr160_external_call.has_recursion, 0 +; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_sgpr160_external_call() { +entry: + tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 + call void @external() + ret void +} + +; Make sure there's no assert when a vgpr160 is used. 
+; GCN-LABEL: {{^}}count_use_vgpr160_external_call +; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0 +; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1 +; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_vgpr160_external_call.has_recursion, 0 +; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_vgpr160_external_call() { +entry: + tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 + call void @external() + ret void +} + +; Added at the of the .s are the module level maximums +; GCN: .set amdgpu.max_num_vgpr, 50 +; GCN: .set amdgpu.max_num_agpr, 0 +; GCN: .set amdgpu.max_num_sgpr, 80 + +attributes #0 = { nounwind noinline norecurse } +attributes #1 = { nounwind noinline norecurse } +attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 3b078c41f4a84..7d07641f455e3 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2635,7 +2635,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2775,25 +2775,25 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:528 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:532 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:540 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], 
s33 offset:1564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 @@ -2861,13 +2861,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -2890,7 +2890,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: 
v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 @@ -2904,7 +2904,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -3046,28 +3046,28 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: 
buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x15 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 @@ -3134,14 +3134,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 -; GFX10-NEXT: buffer_load_dword v8, off, 
s[0:3], s33 offset:1564 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1568 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1564 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 @@ -3165,7 +3165,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_readlane_b32 s31, v63, 1 ; GFX10-NEXT: v_readlane_b32 s30, v63, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 @@ -3181,7 +3181,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3267,7 +3267,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill +; GFX11-NEXT: 
scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 @@ -3277,13 +3277,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 @@ -3333,13 +3333,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 -; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 -; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 +; GFX11-NEXT: scratch_load_b128 
v[17:20], off, s33 offset:1568 +; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 +; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 @@ -3360,7 +3360,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_readlane_b32 s31, v60, 1 ; GFX11-NEXT: v_readlane_b32 s30, v60, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s34 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir index 742498cdd8bd1..c76a84cb1c5d4 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir @@ -21,14 +21,10 @@ body: | ; CHECK-LABEL: name: split_instruction_subranges ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1 - ; CHECK-NEXT: 
[[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR1]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 ; CHECK-NEXT: S_ENDPGM 0 %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) @@ -61,23 +57,13 @@ body: | ; CHECK-LABEL: name: split_instruction_subranges_use_is_subreg_def ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 - ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, 
align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0 - ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY]].sub1 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY2]].sub0 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY [[COPY2]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR1]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub0 ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 - ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64 = COPY [[COPY1]].sub0 - ; CHECK-NEXT: S_NOP 0, implicit [[COPY4]].sub0 - ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY3]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit [[COPY5]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR1]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 ; CHECK-NEXT: S_ENDPGM 0 %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 5abd4c9069c91..a4a8f43646d4b 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,209 +6,209 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; 
CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v8, s30, 0 -; CHECK-NEXT: v_writelane_b32 v8, s31, 1 -; CHECK-NEXT: v_writelane_b32 v8, s36, 2 -; CHECK-NEXT: v_writelane_b32 v8, s37, 3 -; CHECK-NEXT: v_writelane_b32 v8, s38, 4 -; CHECK-NEXT: v_writelane_b32 v8, s39, 5 -; CHECK-NEXT: v_writelane_b32 v8, s40, 6 -; CHECK-NEXT: v_writelane_b32 v8, s41, 7 -; CHECK-NEXT: v_writelane_b32 v8, s42, 8 -; CHECK-NEXT: v_writelane_b32 v8, s43, 9 -; CHECK-NEXT: v_writelane_b32 v8, s44, 10 -; CHECK-NEXT: v_writelane_b32 v8, s45, 11 -; CHECK-NEXT: v_writelane_b32 v8, s46, 12 -; CHECK-NEXT: v_writelane_b32 v8, s47, 13 -; CHECK-NEXT: v_writelane_b32 v8, s48, 14 -; CHECK-NEXT: v_writelane_b32 v8, s49, 15 +; CHECK-NEXT: v_writelane_b32 v5, s30, 0 +; CHECK-NEXT: v_writelane_b32 v5, s31, 1 +; CHECK-NEXT: v_writelane_b32 v5, s36, 2 +; CHECK-NEXT: v_writelane_b32 v5, s37, 3 +; CHECK-NEXT: v_writelane_b32 v5, s38, 4 +; CHECK-NEXT: v_writelane_b32 v5, s39, 5 +; CHECK-NEXT: v_writelane_b32 v5, s40, 6 +; CHECK-NEXT: v_writelane_b32 v5, s41, 7 +; CHECK-NEXT: v_writelane_b32 v5, s42, 8 +; CHECK-NEXT: v_writelane_b32 v5, s43, 9 +; CHECK-NEXT: v_writelane_b32 v5, s44, 10 +; CHECK-NEXT: v_writelane_b32 v5, s45, 11 +; CHECK-NEXT: v_writelane_b32 v5, s46, 12 +; CHECK-NEXT: v_writelane_b32 v5, s47, 13 +; CHECK-NEXT: v_writelane_b32 v5, s48, 14 +; CHECK-NEXT: v_writelane_b32 v5, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v8, s50, 16 +; CHECK-NEXT: v_writelane_b32 
v5, s50, 16 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s51, 17 +; CHECK-NEXT: v_writelane_b32 v5, s51, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s20, 0x130 ; CHECK-NEXT: s_mov_b32 s21, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s36, 0 -; CHECK-NEXT: v_writelane_b32 v4, s37, 1 -; CHECK-NEXT: v_writelane_b32 v4, s38, 2 -; CHECK-NEXT: v_writelane_b32 v4, s39, 3 -; CHECK-NEXT: v_writelane_b32 v4, s40, 4 -; CHECK-NEXT: v_writelane_b32 v4, s41, 5 -; CHECK-NEXT: v_writelane_b32 v4, s42, 6 -; CHECK-NEXT: v_writelane_b32 v4, s43, 7 -; CHECK-NEXT: v_writelane_b32 v4, s44, 8 -; CHECK-NEXT: v_writelane_b32 v4, s45, 9 -; CHECK-NEXT: v_writelane_b32 v4, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s38, 2 +; CHECK-NEXT: v_writelane_b32 v7, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s43, 7 +; CHECK-NEXT: v_writelane_b32 v7, s44, 8 +; CHECK-NEXT: v_writelane_b32 v7, s45, 9 +; CHECK-NEXT: v_writelane_b32 v7, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v4, s47, 11 -; CHECK-NEXT: v_writelane_b32 v4, s48, 12 -; CHECK-NEXT: v_writelane_b32 v4, s49, 13 +; CHECK-NEXT: v_writelane_b32 v7, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s48, 12 ; CHECK-NEXT: s_mov_b32 s20, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v4, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v5, s28 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_writelane_b32 v7, s49, 13 +; CHECK-NEXT: v_mov_b32_e32 v2, s28 +; 
CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v4, s51, 15 +; CHECK-NEXT: v_writelane_b32 v7, s50, 14 +; CHECK-NEXT: v_writelane_b32 v7, s51, 15 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 16 -; CHECK-NEXT: v_writelane_b32 v4, s5, 17 -; CHECK-NEXT: v_writelane_b32 v4, s6, 18 -; CHECK-NEXT: v_writelane_b32 v4, s7, 19 -; CHECK-NEXT: v_writelane_b32 v4, s8, 20 -; CHECK-NEXT: v_writelane_b32 v4, s9, 21 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v4, s10, 22 -; CHECK-NEXT: v_writelane_b32 v4, s11, 23 -; CHECK-NEXT: v_writelane_b32 v4, s12, 24 -; CHECK-NEXT: v_writelane_b32 v4, s13, 25 -; CHECK-NEXT: v_writelane_b32 v4, s14, 26 -; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 -; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 -; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: v_writelane_b32 v7, s4, 16 +; CHECK-NEXT: v_writelane_b32 v7, s5, 17 +; CHECK-NEXT: v_writelane_b32 v7, s6, 18 +; CHECK-NEXT: v_writelane_b32 v7, s7, 19 +; CHECK-NEXT: v_writelane_b32 v7, s8, 20 +; CHECK-NEXT: v_writelane_b32 v7, s9, 21 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s10, 22 +; CHECK-NEXT: v_writelane_b32 v7, s11, 23 +; CHECK-NEXT: v_writelane_b32 v7, s12, 24 +; CHECK-NEXT: v_writelane_b32 v7, s13, 25 +; CHECK-NEXT: v_writelane_b32 v7, s14, 26 +; CHECK-NEXT: v_writelane_b32 v7, s15, 27 +; CHECK-NEXT: v_writelane_b32 v5, s52, 18 +; CHECK-NEXT: v_writelane_b32 v7, s16, 28 +; CHECK-NEXT: 
v_writelane_b32 v5, s53, 19 +; CHECK-NEXT: v_writelane_b32 v7, s17, 29 +; CHECK-NEXT: v_writelane_b32 v5, s54, 20 +; CHECK-NEXT: v_writelane_b32 v7, s18, 30 ; CHECK-NEXT: s_mov_b32 s26, 48 ; CHECK-NEXT: s_mov_b32 s27, s24 -; CHECK-NEXT: v_writelane_b32 v8, s55, 21 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 +; CHECK-NEXT: v_writelane_b32 v5, s55, 21 +; CHECK-NEXT: v_writelane_b32 v7, s19, 31 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0 -; CHECK-NEXT: v_writelane_b32 v8, s56, 22 -; CHECK-NEXT: v_writelane_b32 v8, s57, 23 -; CHECK-NEXT: v_writelane_b32 v8, s58, 24 -; CHECK-NEXT: v_writelane_b32 v8, s59, 25 -; CHECK-NEXT: v_writelane_b32 v8, s60, 26 +; CHECK-NEXT: v_writelane_b32 v5, s56, 22 +; CHECK-NEXT: v_writelane_b32 v5, s57, 23 +; CHECK-NEXT: v_writelane_b32 v5, s58, 24 +; CHECK-NEXT: v_writelane_b32 v5, s59, 25 +; CHECK-NEXT: v_writelane_b32 v5, s60, 26 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 32 -; CHECK-NEXT: v_writelane_b32 v8, s61, 27 -; CHECK-NEXT: v_writelane_b32 v4, s5, 33 -; CHECK-NEXT: v_writelane_b32 v8, s62, 28 -; CHECK-NEXT: v_writelane_b32 v4, s6, 34 -; CHECK-NEXT: v_writelane_b32 v8, s63, 29 -; CHECK-NEXT: v_writelane_b32 v4, s7, 35 -; CHECK-NEXT: v_writelane_b32 v8, s64, 30 -; CHECK-NEXT: v_writelane_b32 v4, s8, 36 -; CHECK-NEXT: v_writelane_b32 v8, s65, 31 -; CHECK-NEXT: v_writelane_b32 v4, s9, 37 -; CHECK-NEXT: v_writelane_b32 v8, s66, 32 +; CHECK-NEXT: v_writelane_b32 v7, s4, 32 +; CHECK-NEXT: v_writelane_b32 v5, s61, 27 +; CHECK-NEXT: v_writelane_b32 v7, s5, 33 +; CHECK-NEXT: v_writelane_b32 v5, s62, 28 +; CHECK-NEXT: v_writelane_b32 v7, s6, 34 +; CHECK-NEXT: v_writelane_b32 v5, s63, 29 +; CHECK-NEXT: v_writelane_b32 v7, s7, 35 +; CHECK-NEXT: v_writelane_b32 v5, s64, 30 +; CHECK-NEXT: v_writelane_b32 v7, s8, 36 +; CHECK-NEXT: v_writelane_b32 v5, s65, 31 +; CHECK-NEXT: v_writelane_b32 v7, s9, 37 +; CHECK-NEXT: v_writelane_b32 v5, s66, 32 ; CHECK-NEXT: s_movk_i32 s28, 0x1f0 ; CHECK-NEXT: s_movk_i32 s30, 
0x2f0 ; CHECK-NEXT: s_mov_b32 s29, s24 ; CHECK-NEXT: s_mov_b32 s31, s24 -; CHECK-NEXT: v_writelane_b32 v4, s10, 38 -; CHECK-NEXT: v_writelane_b32 v8, s67, 33 -; CHECK-NEXT: v_writelane_b32 v4, s11, 39 +; CHECK-NEXT: v_writelane_b32 v7, s10, 38 +; CHECK-NEXT: v_writelane_b32 v5, s67, 33 +; CHECK-NEXT: v_writelane_b32 v7, s11, 39 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 ; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] ; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 +; CHECK-NEXT: v_readlane_b32 s36, v7, 0 +; CHECK-NEXT: v_readlane_b32 s44, v7, 8 +; CHECK-NEXT: v_readlane_b32 s45, v7, 9 +; CHECK-NEXT: v_readlane_b32 s46, v7, 10 +; CHECK-NEXT: v_readlane_b32 s47, v7, 11 +; CHECK-NEXT: v_readlane_b32 s48, v7, 12 +; CHECK-NEXT: v_readlane_b32 s49, v7, 13 +; CHECK-NEXT: v_readlane_b32 s50, v7, 14 +; CHECK-NEXT: v_readlane_b32 s51, v7, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s37, v7, 1 +; CHECK-NEXT: 
v_readlane_b32 s38, v7, 2 +; CHECK-NEXT: v_readlane_b32 s39, v7, 3 +; CHECK-NEXT: v_readlane_b32 s40, v7, 4 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: v_readlane_b32 s41, v7, 5 +; CHECK-NEXT: v_readlane_b32 s42, v7, 6 +; CHECK-NEXT: v_readlane_b32 s43, v7, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 32 -; CHECK-NEXT: v_readlane_b32 s40, v4, 36 -; CHECK-NEXT: v_readlane_b32 s41, v4, 37 -; CHECK-NEXT: v_readlane_b32 s42, v4, 38 -; CHECK-NEXT: v_readlane_b32 s43, v4, 39 +; CHECK-NEXT: v_readlane_b32 s36, v7, 32 +; CHECK-NEXT: v_readlane_b32 s40, v7, 36 +; CHECK-NEXT: v_readlane_b32 s41, v7, 37 +; CHECK-NEXT: v_readlane_b32 s42, v7, 38 +; CHECK-NEXT: v_readlane_b32 s43, v7, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 +; CHECK-NEXT: v_readlane_b32 s37, v7, 33 +; CHECK-NEXT: v_readlane_b32 s38, v7, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s39, v7, 35 ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v3 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v4, 32 -; CHECK-NEXT: 
v_readlane_b32 s13, v4, 33 -; CHECK-NEXT: v_readlane_b32 s14, v4, 34 -; CHECK-NEXT: v_readlane_b32 s15, v4, 35 -; CHECK-NEXT: v_readlane_b32 s16, v4, 36 -; CHECK-NEXT: v_readlane_b32 s17, v4, 37 -; CHECK-NEXT: v_readlane_b32 s18, v4, 38 -; CHECK-NEXT: v_readlane_b32 s19, v4, 39 -; CHECK-NEXT: v_writelane_b32 v4, s4, 40 -; CHECK-NEXT: v_writelane_b32 v4, s5, 41 -; CHECK-NEXT: v_writelane_b32 v4, s6, 42 -; CHECK-NEXT: v_writelane_b32 v4, s7, 43 -; CHECK-NEXT: v_writelane_b32 v4, s8, 44 -; CHECK-NEXT: v_writelane_b32 v4, s9, 45 -; CHECK-NEXT: v_writelane_b32 v4, s10, 46 -; CHECK-NEXT: v_writelane_b32 v4, s11, 47 -; CHECK-NEXT: v_writelane_b32 v4, s12, 48 -; CHECK-NEXT: v_writelane_b32 v4, s13, 49 -; CHECK-NEXT: v_writelane_b32 v4, s14, 50 -; CHECK-NEXT: v_writelane_b32 v4, s15, 51 -; CHECK-NEXT: v_writelane_b32 v4, s16, 52 -; CHECK-NEXT: v_writelane_b32 v4, s17, 53 -; CHECK-NEXT: v_writelane_b32 v4, s18, 54 -; CHECK-NEXT: v_writelane_b32 v4, s19, 55 -; CHECK-NEXT: v_writelane_b32 v4, s52, 56 -; CHECK-NEXT: v_writelane_b32 v3, s60, 0 -; CHECK-NEXT: v_writelane_b32 v4, s53, 57 -; CHECK-NEXT: v_writelane_b32 v3, s61, 1 -; CHECK-NEXT: v_writelane_b32 v4, s54, 58 -; CHECK-NEXT: v_writelane_b32 v3, s62, 2 -; CHECK-NEXT: v_writelane_b32 v4, s55, 59 -; CHECK-NEXT: v_writelane_b32 v3, s63, 3 -; CHECK-NEXT: v_writelane_b32 v4, s56, 60 -; CHECK-NEXT: v_writelane_b32 v3, s64, 4 -; CHECK-NEXT: v_writelane_b32 v4, s57, 61 -; CHECK-NEXT: v_writelane_b32 v3, s65, 5 -; CHECK-NEXT: v_writelane_b32 v4, s58, 62 -; CHECK-NEXT: v_writelane_b32 v3, s66, 6 -; CHECK-NEXT: v_writelane_b32 v4, s59, 63 -; CHECK-NEXT: v_writelane_b32 v3, s67, 7 +; CHECK-NEXT: v_readlane_b32 s12, v7, 32 +; CHECK-NEXT: v_readlane_b32 s13, v7, 33 +; CHECK-NEXT: v_readlane_b32 s14, v7, 34 +; CHECK-NEXT: v_readlane_b32 s15, v7, 35 +; CHECK-NEXT: v_readlane_b32 s16, v7, 36 +; CHECK-NEXT: v_readlane_b32 s17, v7, 37 +; CHECK-NEXT: v_readlane_b32 s18, v7, 38 +; CHECK-NEXT: v_readlane_b32 s19, v7, 39 +; CHECK-NEXT: 
v_writelane_b32 v7, s4, 40 +; CHECK-NEXT: v_writelane_b32 v7, s5, 41 +; CHECK-NEXT: v_writelane_b32 v7, s6, 42 +; CHECK-NEXT: v_writelane_b32 v7, s7, 43 +; CHECK-NEXT: v_writelane_b32 v7, s8, 44 +; CHECK-NEXT: v_writelane_b32 v7, s9, 45 +; CHECK-NEXT: v_writelane_b32 v7, s10, 46 +; CHECK-NEXT: v_writelane_b32 v7, s11, 47 +; CHECK-NEXT: v_writelane_b32 v7, s12, 48 +; CHECK-NEXT: v_writelane_b32 v7, s13, 49 +; CHECK-NEXT: v_writelane_b32 v7, s14, 50 +; CHECK-NEXT: v_writelane_b32 v7, s15, 51 +; CHECK-NEXT: v_writelane_b32 v7, s16, 52 +; CHECK-NEXT: v_writelane_b32 v7, s17, 53 +; CHECK-NEXT: v_writelane_b32 v7, s18, 54 +; CHECK-NEXT: v_writelane_b32 v7, s19, 55 +; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v7, s52, 56 +; CHECK-NEXT: v_writelane_b32 v6, s60, 0 +; CHECK-NEXT: v_writelane_b32 v7, s53, 57 +; CHECK-NEXT: v_writelane_b32 v6, s61, 1 +; CHECK-NEXT: v_writelane_b32 v7, s54, 58 +; CHECK-NEXT: v_writelane_b32 v6, s62, 2 +; CHECK-NEXT: v_writelane_b32 v7, s55, 59 +; CHECK-NEXT: v_writelane_b32 v6, s63, 3 +; CHECK-NEXT: v_writelane_b32 v7, s56, 60 +; CHECK-NEXT: v_writelane_b32 v6, s64, 4 +; CHECK-NEXT: v_writelane_b32 v7, s57, 61 +; CHECK-NEXT: v_writelane_b32 v6, s65, 5 +; CHECK-NEXT: v_writelane_b32 v7, s58, 62 +; CHECK-NEXT: v_writelane_b32 v6, s66, 6 +; CHECK-NEXT: v_writelane_b32 v7, s59, 63 +; CHECK-NEXT: v_writelane_b32 v6, s67, 7 ; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 @@ -219,68 +219,68 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s36, v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; 
CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 16 -; CHECK-NEXT: v_readlane_b32 s44, v4, 24 -; CHECK-NEXT: v_readlane_b32 s45, v4, 25 -; CHECK-NEXT: v_readlane_b32 s46, v4, 26 -; CHECK-NEXT: v_readlane_b32 s47, v4, 27 -; CHECK-NEXT: v_readlane_b32 s48, v4, 28 -; CHECK-NEXT: v_readlane_b32 s49, v4, 29 -; CHECK-NEXT: v_readlane_b32 s50, v4, 30 -; CHECK-NEXT: v_readlane_b32 s51, v4, 31 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_readlane_b32 s37, v4, 17 -; CHECK-NEXT: v_readlane_b32 s38, v4, 18 -; CHECK-NEXT: v_readlane_b32 s39, v4, 19 +; CHECK-NEXT: v_readlane_b32 s37, v7, 1 +; CHECK-NEXT: v_readlane_b32 s38, v7, 2 +; CHECK-NEXT: v_readlane_b32 s39, v7, 3 +; CHECK-NEXT: v_readlane_b32 s40, v7, 4 +; CHECK-NEXT: v_readlane_b32 s41, v7, 5 +; CHECK-NEXT: v_readlane_b32 s42, v7, 6 +; CHECK-NEXT: v_readlane_b32 s43, v7, 7 +; CHECK-NEXT: v_readlane_b32 s44, v7, 8 +; CHECK-NEXT: v_readlane_b32 s45, v7, 9 +; CHECK-NEXT: v_readlane_b32 s46, v7, 10 +; CHECK-NEXT: v_readlane_b32 s47, v7, 11 +; CHECK-NEXT: v_readlane_b32 s48, v7, 12 +; CHECK-NEXT: v_readlane_b32 s49, v7, 13 +; CHECK-NEXT: v_readlane_b32 s50, v7, 14 +; CHECK-NEXT: v_readlane_b32 s51, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s36, v7, 16 +; CHECK-NEXT: v_readlane_b32 s44, v7, 24 +; CHECK-NEXT: v_readlane_b32 s45, v7, 25 +; CHECK-NEXT: v_readlane_b32 s46, v7, 
26 +; CHECK-NEXT: v_readlane_b32 s47, v7, 27 +; CHECK-NEXT: v_readlane_b32 s48, v7, 28 +; CHECK-NEXT: v_readlane_b32 s49, v7, 29 +; CHECK-NEXT: v_readlane_b32 s50, v7, 30 +; CHECK-NEXT: v_readlane_b32 s51, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s37, v7, 17 +; CHECK-NEXT: v_readlane_b32 s38, v7, 18 +; CHECK-NEXT: v_readlane_b32 s39, v7, 19 ; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v4, 20 -; CHECK-NEXT: v_readlane_b32 s41, v4, 21 -; CHECK-NEXT: v_readlane_b32 s42, v4, 22 -; CHECK-NEXT: v_readlane_b32 s43, v4, 23 +; CHECK-NEXT: v_readlane_b32 s40, v7, 20 +; CHECK-NEXT: v_readlane_b32 s41, v7, 21 +; CHECK-NEXT: v_readlane_b32 s42, v7, 22 +; CHECK-NEXT: v_readlane_b32 s43, v7, 23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v4, 40 -; CHECK-NEXT: v_readlane_b32 s53, v4, 41 -; CHECK-NEXT: v_readlane_b32 s54, v4, 42 -; CHECK-NEXT: v_readlane_b32 s55, v4, 43 -; CHECK-NEXT: v_readlane_b32 s56, v4, 44 -; CHECK-NEXT: v_readlane_b32 s57, v4, 45 -; CHECK-NEXT: v_readlane_b32 s58, v4, 46 -; CHECK-NEXT: v_readlane_b32 s59, v4, 47 -; CHECK-NEXT: v_readlane_b32 s60, v4, 48 -; CHECK-NEXT: v_readlane_b32 s61, v4, 49 -; CHECK-NEXT: v_readlane_b32 s62, v4, 50 -; CHECK-NEXT: v_readlane_b32 s63, v4, 51 -; CHECK-NEXT: v_readlane_b32 s64, v4, 52 -; CHECK-NEXT: v_readlane_b32 s65, v4, 53 -; CHECK-NEXT: v_readlane_b32 s66, v4, 54 -; CHECK-NEXT: v_readlane_b32 s67, v4, 55 +; CHECK-NEXT: v_readlane_b32 s52, v7, 40 +; CHECK-NEXT: v_readlane_b32 s53, v7, 41 +; CHECK-NEXT: v_readlane_b32 s54, v7, 42 +; CHECK-NEXT: 
v_readlane_b32 s55, v7, 43 +; CHECK-NEXT: v_readlane_b32 s56, v7, 44 +; CHECK-NEXT: v_readlane_b32 s57, v7, 45 +; CHECK-NEXT: v_readlane_b32 s58, v7, 46 +; CHECK-NEXT: v_readlane_b32 s59, v7, 47 +; CHECK-NEXT: v_readlane_b32 s60, v7, 48 +; CHECK-NEXT: v_readlane_b32 s61, v7, 49 +; CHECK-NEXT: v_readlane_b32 s62, v7, 50 +; CHECK-NEXT: v_readlane_b32 s63, v7, 51 +; CHECK-NEXT: v_readlane_b32 s64, v7, 52 +; CHECK-NEXT: v_readlane_b32 s65, v7, 53 +; CHECK-NEXT: v_readlane_b32 s66, v7, 54 +; CHECK-NEXT: v_readlane_b32 s67, v7, 55 ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader @@ -288,32 +288,32 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_readlane_b32 s36, v4, 56 +; CHECK-NEXT: v_readlane_b32 s36, v7, 56 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 ; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: v_readlane_b32 s37, v4, 57 -; CHECK-NEXT: v_readlane_b32 s38, v4, 58 -; CHECK-NEXT: v_readlane_b32 s39, v4, 59 -; CHECK-NEXT: v_readlane_b32 s40, v4, 60 -; CHECK-NEXT: v_readlane_b32 s41, v4, 61 -; CHECK-NEXT: v_readlane_b32 s42, v4, 62 -; CHECK-NEXT: v_readlane_b32 s43, v4, 63 +; CHECK-NEXT: v_readlane_b32 s37, v7, 57 +; CHECK-NEXT: v_readlane_b32 s38, v7, 58 +; CHECK-NEXT: v_readlane_b32 s39, v7, 59 +; CHECK-NEXT: v_readlane_b32 s40, v7, 60 +; CHECK-NEXT: v_readlane_b32 s41, v7, 61 +; CHECK-NEXT: v_readlane_b32 s42, v7, 62 +; CHECK-NEXT: v_readlane_b32 s43, v7, 63 ; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 ; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; 
CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v3, 0 -; CHECK-NEXT: v_readlane_b32 s45, v3, 1 -; CHECK-NEXT: v_readlane_b32 s46, v3, 2 -; CHECK-NEXT: v_readlane_b32 s47, v3, 3 -; CHECK-NEXT: v_readlane_b32 s48, v3, 4 -; CHECK-NEXT: v_readlane_b32 s49, v3, 5 -; CHECK-NEXT: v_readlane_b32 s50, v3, 6 -; CHECK-NEXT: v_readlane_b32 s51, v3, 7 +; CHECK-NEXT: v_readlane_b32 s44, v6, 0 +; CHECK-NEXT: v_readlane_b32 s45, v6, 1 +; CHECK-NEXT: v_readlane_b32 s46, v6, 2 +; CHECK-NEXT: v_readlane_b32 s47, v6, 3 +; CHECK-NEXT: v_readlane_b32 s48, v6, 4 +; CHECK-NEXT: v_readlane_b32 s49, v6, 5 +; CHECK-NEXT: v_readlane_b32 s50, v6, 6 +; CHECK-NEXT: v_readlane_b32 s51, v6, 7 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] @@ -321,7 +321,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB0_8: ; %bb33 @@ -334,46 +334,44 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v8, 33 -; CHECK-NEXT: v_readlane_b32 s66, v8, 32 -; CHECK-NEXT: v_readlane_b32 s65, v8, 31 -; CHECK-NEXT: v_readlane_b32 s64, v8, 30 -; CHECK-NEXT: v_readlane_b32 s63, v8, 29 -; CHECK-NEXT: v_readlane_b32 s62, v8, 28 -; CHECK-NEXT: v_readlane_b32 s61, v8, 27 -; CHECK-NEXT: v_readlane_b32 s60, v8, 26 -; CHECK-NEXT: v_readlane_b32 s59, v8, 25 -; CHECK-NEXT: v_readlane_b32 s58, v8, 24 -; CHECK-NEXT: v_readlane_b32 s57, v8, 23 -; CHECK-NEXT: v_readlane_b32 s56, v8, 22 -; CHECK-NEXT: v_readlane_b32 s55, v8, 21 -; CHECK-NEXT: 
v_readlane_b32 s54, v8, 20 -; CHECK-NEXT: v_readlane_b32 s53, v8, 19 -; CHECK-NEXT: v_readlane_b32 s52, v8, 18 -; CHECK-NEXT: v_readlane_b32 s51, v8, 17 -; CHECK-NEXT: v_readlane_b32 s50, v8, 16 -; CHECK-NEXT: v_readlane_b32 s49, v8, 15 -; CHECK-NEXT: v_readlane_b32 s48, v8, 14 -; CHECK-NEXT: v_readlane_b32 s47, v8, 13 -; CHECK-NEXT: v_readlane_b32 s46, v8, 12 -; CHECK-NEXT: v_readlane_b32 s45, v8, 11 -; CHECK-NEXT: v_readlane_b32 s44, v8, 10 -; CHECK-NEXT: v_readlane_b32 s43, v8, 9 -; CHECK-NEXT: v_readlane_b32 s42, v8, 8 -; CHECK-NEXT: v_readlane_b32 s41, v8, 7 -; CHECK-NEXT: v_readlane_b32 s40, v8, 6 -; CHECK-NEXT: v_readlane_b32 s39, v8, 5 -; CHECK-NEXT: v_readlane_b32 s38, v8, 4 -; CHECK-NEXT: v_readlane_b32 s37, v8, 3 -; CHECK-NEXT: v_readlane_b32 s36, v8, 2 -; CHECK-NEXT: v_readlane_b32 s31, v8, 1 -; CHECK-NEXT: v_readlane_b32 s30, v8, 0 -; CHECK-NEXT: ; kill: killed $vgpr4 -; CHECK-NEXT: ; kill: killed $vgpr3 +; CHECK-NEXT: v_readlane_b32 s67, v5, 33 +; CHECK-NEXT: v_readlane_b32 s66, v5, 32 +; CHECK-NEXT: v_readlane_b32 s65, v5, 31 +; CHECK-NEXT: v_readlane_b32 s64, v5, 30 +; CHECK-NEXT: v_readlane_b32 s63, v5, 29 +; CHECK-NEXT: v_readlane_b32 s62, v5, 28 +; CHECK-NEXT: v_readlane_b32 s61, v5, 27 +; CHECK-NEXT: v_readlane_b32 s60, v5, 26 +; CHECK-NEXT: v_readlane_b32 s59, v5, 25 +; CHECK-NEXT: v_readlane_b32 s58, v5, 24 +; CHECK-NEXT: v_readlane_b32 s57, v5, 23 +; CHECK-NEXT: v_readlane_b32 s56, v5, 22 +; CHECK-NEXT: v_readlane_b32 s55, v5, 21 +; CHECK-NEXT: v_readlane_b32 s54, v5, 20 +; CHECK-NEXT: v_readlane_b32 s53, v5, 19 +; CHECK-NEXT: v_readlane_b32 s52, v5, 18 +; CHECK-NEXT: v_readlane_b32 s51, v5, 17 +; CHECK-NEXT: v_readlane_b32 s50, v5, 16 +; CHECK-NEXT: v_readlane_b32 s49, v5, 15 +; CHECK-NEXT: v_readlane_b32 s48, v5, 14 +; CHECK-NEXT: v_readlane_b32 s47, v5, 13 +; CHECK-NEXT: v_readlane_b32 s46, v5, 12 +; CHECK-NEXT: v_readlane_b32 s45, v5, 11 +; CHECK-NEXT: v_readlane_b32 s44, v5, 10 +; CHECK-NEXT: v_readlane_b32 s43, v5, 9 +; CHECK-NEXT: 
v_readlane_b32 s42, v5, 8 +; CHECK-NEXT: v_readlane_b32 s41, v5, 7 +; CHECK-NEXT: v_readlane_b32 s40, v5, 6 +; CHECK-NEXT: v_readlane_b32 s39, v5, 5 +; CHECK-NEXT: v_readlane_b32 s38, v5, 4 +; CHECK-NEXT: v_readlane_b32 s37, v5, 3 +; CHECK-NEXT: v_readlane_b32 s36, v5, 2 +; CHECK-NEXT: v_readlane_b32 s31, v5, 1 +; CHECK-NEXT: v_readlane_b32 s30, v5, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir b/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir index 96fb7cfeb2775..40089ed82b5db 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -start-before=greedy,0 -stop-after=virtregrewriter,1 -o - 2>%t.err %s | FileCheck %s +# RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -start-before=greedy,0 -stop-after=virtregrewriter,2 -o - 2>%t.err %s | FileCheck %s # RUN: FileCheck -check-prefix=ERR %s < %t.err # This testcase cannot be compiled. 
An attempted eviction legality diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 60946956547a7..f1f4abe580c00 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1510,12 +1510,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s20, s20, s9 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_mov_b32_e32 v1, v0 -; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 @@ -1526,11 +1521,11 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: s_waitcnt vmcnt(1) -; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 -; NOOPT-NEXT: v_writelane_b32 v0, s2, 2 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 3 +; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 3 ; NOOPT-NEXT: s_mov_b32 s0, 16 ; NOOPT-NEXT: s_mov_b32 s1, 15 ; NOOPT-NEXT: s_mov_b32 s2, 14 @@ -1548,126 +1543,130 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_mov_b32 s14, 1 ; NOOPT-NEXT: s_mov_b32 s15, 0 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: 
v_mov_b32_e32 v1, s15 -; NOOPT-NEXT: v_mov_b32_e32 v31, s14 -; NOOPT-NEXT: v_mov_b32_e32 v30, s13 -; NOOPT-NEXT: v_mov_b32_e32 v29, s12 -; NOOPT-NEXT: v_mov_b32_e32 v28, s11 -; NOOPT-NEXT: v_mov_b32_e32 v27, s10 -; NOOPT-NEXT: v_mov_b32_e32 v26, s9 -; NOOPT-NEXT: v_mov_b32_e32 v25, s8 -; NOOPT-NEXT: v_mov_b32_e32 v24, s7 -; NOOPT-NEXT: v_mov_b32_e32 v23, s6 -; NOOPT-NEXT: v_mov_b32_e32 v22, s5 -; NOOPT-NEXT: v_mov_b32_e32 v21, s4 -; NOOPT-NEXT: v_mov_b32_e32 v20, s3 -; NOOPT-NEXT: v_mov_b32_e32 v19, s2 -; NOOPT-NEXT: v_mov_b32_e32 v18, s1 -; NOOPT-NEXT: v_mov_b32_e32 v17, s0 -; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v2, v31 -; NOOPT-NEXT: v_mov_b32_e32 v3, v30 -; NOOPT-NEXT: v_mov_b32_e32 v4, v29 -; NOOPT-NEXT: v_mov_b32_e32 v5, v28 -; NOOPT-NEXT: v_mov_b32_e32 v6, v27 -; NOOPT-NEXT: v_mov_b32_e32 v7, v26 -; NOOPT-NEXT: v_mov_b32_e32 v8, v25 -; NOOPT-NEXT: v_mov_b32_e32 v9, v24 -; NOOPT-NEXT: v_mov_b32_e32 v10, v23 -; NOOPT-NEXT: v_mov_b32_e32 v11, v22 -; NOOPT-NEXT: v_mov_b32_e32 v12, v21 -; NOOPT-NEXT: v_mov_b32_e32 v13, v20 -; NOOPT-NEXT: v_mov_b32_e32 v14, v19 -; NOOPT-NEXT: v_mov_b32_e32 v15, v18 -; NOOPT-NEXT: v_mov_b32_e32 v16, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte 
Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v0, s15 +; NOOPT-NEXT: v_mov_b32_e32 v30, s14 +; NOOPT-NEXT: v_mov_b32_e32 v29, s13 +; NOOPT-NEXT: v_mov_b32_e32 v28, s12 +; NOOPT-NEXT: v_mov_b32_e32 v27, s11 +; NOOPT-NEXT: v_mov_b32_e32 v26, s10 +; NOOPT-NEXT: v_mov_b32_e32 v25, s9 +; NOOPT-NEXT: v_mov_b32_e32 v24, s8 +; NOOPT-NEXT: v_mov_b32_e32 v23, s7 +; NOOPT-NEXT: v_mov_b32_e32 v22, s6 +; NOOPT-NEXT: v_mov_b32_e32 v21, s5 +; NOOPT-NEXT: v_mov_b32_e32 v20, s4 +; NOOPT-NEXT: v_mov_b32_e32 v19, s3 +; NOOPT-NEXT: v_mov_b32_e32 v18, s2 +; NOOPT-NEXT: v_mov_b32_e32 v17, s1 +; NOOPT-NEXT: v_mov_b32_e32 v16, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, 
v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 5 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: ; implicit-def: $vgpr0 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB5_1: ; =>This Inner 
Loop Header: Depth=1 +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 +; NOOPT-NEXT: v_readlane_b32 s1, 
v31, 7 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 -; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 7 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB5_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 5 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: 
s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 -; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: extract_neg_offset_vgpr: @@ -4022,7 +4021,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s20, s20, s9 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -4034,10 +4032,11 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 -; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 -; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 3 ; NOOPT-NEXT: s_mov_b32 s0, 16 ; NOOPT-NEXT: s_mov_b32 s1, 15 ; NOOPT-NEXT: s_mov_b32 s2, 14 @@ -4056,37 +4055,37 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_mov_b32 s15, 1 ; 
NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, s15 -; NOOPT-NEXT: v_mov_b32_e32 v31, s14 -; NOOPT-NEXT: v_mov_b32_e32 v30, s13 -; NOOPT-NEXT: v_mov_b32_e32 v29, s12 -; NOOPT-NEXT: v_mov_b32_e32 v28, s11 -; NOOPT-NEXT: v_mov_b32_e32 v27, s10 -; NOOPT-NEXT: v_mov_b32_e32 v26, s9 -; NOOPT-NEXT: v_mov_b32_e32 v25, s8 -; NOOPT-NEXT: v_mov_b32_e32 v24, s7 -; NOOPT-NEXT: v_mov_b32_e32 v23, s6 -; NOOPT-NEXT: v_mov_b32_e32 v22, s5 -; NOOPT-NEXT: v_mov_b32_e32 v21, s4 -; NOOPT-NEXT: v_mov_b32_e32 v20, s3 -; NOOPT-NEXT: v_mov_b32_e32 v19, s2 -; NOOPT-NEXT: v_mov_b32_e32 v18, s1 -; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: v_mov_b32_e32 v30, s14 +; NOOPT-NEXT: v_mov_b32_e32 v29, s13 +; NOOPT-NEXT: v_mov_b32_e32 v28, s12 +; NOOPT-NEXT: v_mov_b32_e32 v27, s11 +; NOOPT-NEXT: v_mov_b32_e32 v26, s10 +; NOOPT-NEXT: v_mov_b32_e32 v25, s9 +; NOOPT-NEXT: v_mov_b32_e32 v24, s8 +; NOOPT-NEXT: v_mov_b32_e32 v23, s7 +; NOOPT-NEXT: v_mov_b32_e32 v22, s6 +; NOOPT-NEXT: v_mov_b32_e32 v21, s5 +; NOOPT-NEXT: v_mov_b32_e32 v20, s4 +; NOOPT-NEXT: v_mov_b32_e32 v19, s3 +; NOOPT-NEXT: v_mov_b32_e32 v18, s2 +; NOOPT-NEXT: v_mov_b32_e32 v17, s1 +; NOOPT-NEXT: v_mov_b32_e32 v16, s0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v31 -; NOOPT-NEXT: v_mov_b32_e32 v2, v30 -; NOOPT-NEXT: v_mov_b32_e32 v3, v29 -; NOOPT-NEXT: v_mov_b32_e32 v4, v28 -; NOOPT-NEXT: v_mov_b32_e32 v5, v27 -; NOOPT-NEXT: v_mov_b32_e32 v6, v26 -; NOOPT-NEXT: v_mov_b32_e32 v7, v25 -; NOOPT-NEXT: v_mov_b32_e32 v8, v24 -; NOOPT-NEXT: v_mov_b32_e32 v9, v23 -; NOOPT-NEXT: v_mov_b32_e32 v10, v22 -; NOOPT-NEXT: v_mov_b32_e32 v11, v21 -; NOOPT-NEXT: v_mov_b32_e32 v12, v20 -; NOOPT-NEXT: v_mov_b32_e32 v13, v19 -; NOOPT-NEXT: v_mov_b32_e32 v14, v18 -; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; 
NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill @@ -4103,202 +4102,195 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill -; NOOPT-NEXT: v_mov_b32_e32 v17, 33 -; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v16, 33 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 5 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: 
buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: 
buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 -; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill -; 
NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, 
off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 7 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB14_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 5 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword 
v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte 
Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) -; NOOPT-NEXT: v_mov_b32_e32 v5, v19 -; NOOPT-NEXT: v_mov_b32_e32 v6, v18 -; NOOPT-NEXT: v_mov_b32_e32 v7, v17 -; NOOPT-NEXT: v_mov_b32_e32 v1, v16 -; NOOPT-NEXT: s_waitcnt vmcnt(8) -; NOOPT-NEXT: v_mov_b32_e32 v2, v23 -; NOOPT-NEXT: v_mov_b32_e32 v3, v22 -; NOOPT-NEXT: v_mov_b32_e32 v4, v21 -; NOOPT-NEXT: v_mov_b32_e32 v8, v20 -; NOOPT-NEXT: s_waitcnt vmcnt(4) -; NOOPT-NEXT: v_mov_b32_e32 v13, v27 -; NOOPT-NEXT: v_mov_b32_e32 v14, v26 -; NOOPT-NEXT: v_mov_b32_e32 v15, v25 -; NOOPT-NEXT: v_mov_b32_e32 v9, v24 -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v10, v31 -; NOOPT-NEXT: v_mov_b32_e32 v11, v30 -; NOOPT-NEXT: v_mov_b32_e32 v12, v29 -; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 +; NOOPT-NEXT: 
v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v17, v12 -; NOOPT-NEXT: v_mov_b32_e32 v18, v11 -; NOOPT-NEXT: v_mov_b32_e32 v19, v10 -; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v10, v15 -; NOOPT-NEXT: v_mov_b32_e32 v11, v14 -; NOOPT-NEXT: v_mov_b32_e32 v12, v13 -; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; 
NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v9, v4 -; NOOPT-NEXT: v_mov_b32_e32 v10, v3 -; NOOPT-NEXT: v_mov_b32_e32 v11, v2 -; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v2, v7 -; NOOPT-NEXT: v_mov_b32_e32 v3, v6 -; NOOPT-NEXT: v_mov_b32_e32 v4, v5 -; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 -; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_neg_offset_vgpr: @@ -4512,7 +4504,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s20, s20, s9 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -4524,10 +4515,11 @@ define amdgpu_kernel void 
@insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 -; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 -; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 3 ; NOOPT-NEXT: s_mov_b32 s0, 16 ; NOOPT-NEXT: s_mov_b32 s1, 15 ; NOOPT-NEXT: s_mov_b32 s2, 14 @@ -4546,37 +4538,37 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_mov_b32 s15, 1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, s15 -; NOOPT-NEXT: v_mov_b32_e32 v31, s14 -; NOOPT-NEXT: v_mov_b32_e32 v30, s13 -; NOOPT-NEXT: v_mov_b32_e32 v29, s12 -; NOOPT-NEXT: v_mov_b32_e32 v28, s11 -; NOOPT-NEXT: v_mov_b32_e32 v27, s10 -; NOOPT-NEXT: v_mov_b32_e32 v26, s9 -; NOOPT-NEXT: v_mov_b32_e32 v25, s8 -; NOOPT-NEXT: v_mov_b32_e32 v24, s7 -; NOOPT-NEXT: v_mov_b32_e32 v23, s6 -; NOOPT-NEXT: v_mov_b32_e32 v22, s5 -; NOOPT-NEXT: v_mov_b32_e32 v21, s4 -; NOOPT-NEXT: v_mov_b32_e32 v20, s3 -; NOOPT-NEXT: v_mov_b32_e32 v19, s2 -; NOOPT-NEXT: v_mov_b32_e32 v18, s1 -; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: v_mov_b32_e32 v30, s14 +; NOOPT-NEXT: v_mov_b32_e32 v29, s13 +; NOOPT-NEXT: v_mov_b32_e32 v28, s12 +; NOOPT-NEXT: v_mov_b32_e32 v27, s11 +; NOOPT-NEXT: v_mov_b32_e32 v26, s10 +; NOOPT-NEXT: v_mov_b32_e32 v25, s9 +; NOOPT-NEXT: v_mov_b32_e32 v24, s8 +; NOOPT-NEXT: v_mov_b32_e32 v23, s7 +; NOOPT-NEXT: v_mov_b32_e32 v22, s6 +; NOOPT-NEXT: v_mov_b32_e32 v21, s5 +; NOOPT-NEXT: v_mov_b32_e32 v20, s4 +; NOOPT-NEXT: v_mov_b32_e32 v19, s3 +; NOOPT-NEXT: v_mov_b32_e32 v18, s2 +; NOOPT-NEXT: v_mov_b32_e32 v17, s1 +; NOOPT-NEXT: v_mov_b32_e32 v16, s0 ; NOOPT-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v31 -; NOOPT-NEXT: v_mov_b32_e32 v2, v30 -; NOOPT-NEXT: v_mov_b32_e32 v3, v29 -; NOOPT-NEXT: v_mov_b32_e32 v4, v28 -; NOOPT-NEXT: v_mov_b32_e32 v5, v27 -; NOOPT-NEXT: v_mov_b32_e32 v6, v26 -; NOOPT-NEXT: v_mov_b32_e32 v7, v25 -; NOOPT-NEXT: v_mov_b32_e32 v8, v24 -; NOOPT-NEXT: v_mov_b32_e32 v9, v23 -; NOOPT-NEXT: v_mov_b32_e32 v10, v22 -; NOOPT-NEXT: v_mov_b32_e32 v11, v21 -; NOOPT-NEXT: v_mov_b32_e32 v12, v20 -; NOOPT-NEXT: v_mov_b32_e32 v13, v19 -; NOOPT-NEXT: v_mov_b32_e32 v14, v18 -; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill @@ -4593,202 +4585,195 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill -; NOOPT-NEXT: v_mov_b32_e32 v17, 0x1f4 -; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 
4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v16, 0x1f4 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 5 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: 
buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, 
s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, -16 -; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: 
buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill -; 
NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v15, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 7 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz 
.LBB15_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 5 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload ; 
NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) -; NOOPT-NEXT: v_mov_b32_e32 v5, v19 -; NOOPT-NEXT: v_mov_b32_e32 v6, v18 -; NOOPT-NEXT: 
v_mov_b32_e32 v7, v17 -; NOOPT-NEXT: v_mov_b32_e32 v1, v16 -; NOOPT-NEXT: s_waitcnt vmcnt(8) -; NOOPT-NEXT: v_mov_b32_e32 v2, v23 -; NOOPT-NEXT: v_mov_b32_e32 v3, v22 -; NOOPT-NEXT: v_mov_b32_e32 v4, v21 -; NOOPT-NEXT: v_mov_b32_e32 v8, v20 -; NOOPT-NEXT: s_waitcnt vmcnt(4) -; NOOPT-NEXT: v_mov_b32_e32 v13, v27 -; NOOPT-NEXT: v_mov_b32_e32 v14, v26 -; NOOPT-NEXT: v_mov_b32_e32 v15, v25 -; NOOPT-NEXT: v_mov_b32_e32 v9, v24 -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v10, v31 -; NOOPT-NEXT: v_mov_b32_e32 v11, v30 -; NOOPT-NEXT: v_mov_b32_e32 v12, v29 -; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v17, v12 -; NOOPT-NEXT: v_mov_b32_e32 v18, v11 -; NOOPT-NEXT: v_mov_b32_e32 v19, v10 -; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: 
v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v10, v15 -; NOOPT-NEXT: v_mov_b32_e32 v11, v14 -; NOOPT-NEXT: v_mov_b32_e32 v12, v13 -; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v9, v4 -; NOOPT-NEXT: v_mov_b32_e32 v10, v3 -; NOOPT-NEXT: v_mov_b32_e32 v11, v2 -; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v2, v7 -; NOOPT-NEXT: v_mov_b32_e32 v3, v6 -; NOOPT-NEXT: v_mov_b32_e32 v4, v5 -; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 -; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_neg_inline_offset_vgpr: @@ -5053,13 +5038,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s39, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s36, s36, s9 ; NOOPT-NEXT: s_addc_u32 s37, s37, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] -; NOOPT-NEXT: v_mov_b32_e32 v1, v0 -; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -5071,32 +5051,32 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s5, s8 ; NOOPT-NEXT: s_mov_b32 s6, s3 ; NOOPT-NEXT: s_mov_b32 s7, s2 -; NOOPT-NEXT: s_waitcnt vmcnt(1) -; NOOPT-NEXT: v_writelane_b32 v0, s4, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s5, 1 -; NOOPT-NEXT: v_writelane_b32 v0, s6, 2 -; NOOPT-NEXT: v_writelane_b32 v0, s7, 3 +; NOOPT-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v18, s4, 0 +; NOOPT-NEXT: v_writelane_b32 v18, s5, 1 +; NOOPT-NEXT: v_writelane_b32 v18, s6, 2 +; NOOPT-NEXT: v_writelane_b32 v18, s7, 3 ; NOOPT-NEXT: s_mov_b32 s4, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s4, 4 +; NOOPT-NEXT: v_writelane_b32 v18, s4, 4 ; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; NOOPT-NEXT: s_mov_b32 s5, s2 ; NOOPT-NEXT: ; kill: def $sgpr0_sgpr1 killed 
$sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; NOOPT-NEXT: s_mov_b32 s4, 2 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; NOOPT-NEXT: s_mov_b32 s4, 0 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: v_mov_b32_e32 v3, 0 -; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v2, v3 -; NOOPT-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc +; NOOPT-NEXT: v_mov_b32_e32 v2, 0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b32 s0, 1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_add_i32_e64 v1, s[0:1], v1, s0 -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: v_add_i32_e64 v0, s[0:1], v0, s0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b32 s16, 16 ; NOOPT-NEXT: s_mov_b32 s17, 15 ; NOOPT-NEXT: s_mov_b32 s18, 14 @@ -5125,255 +5105,266 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s13, s18 ; NOOPT-NEXT: s_mov_b32 s14, s17 ; NOOPT-NEXT: s_mov_b32 s15, s16 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 5 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 6 -; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 -; NOOPT-NEXT: v_writelane_b32 v0, s4, 9 -; NOOPT-NEXT: v_writelane_b32 v0, s5, 10 -; NOOPT-NEXT: v_writelane_b32 v0, s6, 11 -; NOOPT-NEXT: v_writelane_b32 v0, s7, 12 -; NOOPT-NEXT: v_writelane_b32 v0, s8, 13 -; NOOPT-NEXT: v_writelane_b32 v0, s9, 14 -; 
NOOPT-NEXT: v_writelane_b32 v0, s10, 15 -; NOOPT-NEXT: v_writelane_b32 v0, s11, 16 -; NOOPT-NEXT: v_writelane_b32 v0, s12, 17 -; NOOPT-NEXT: v_writelane_b32 v0, s13, 18 -; NOOPT-NEXT: v_writelane_b32 v0, s14, 19 -; NOOPT-NEXT: v_writelane_b32 v0, s15, 20 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 5 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 6 +; NOOPT-NEXT: v_writelane_b32 v18, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v18, s3, 8 +; NOOPT-NEXT: v_writelane_b32 v18, s4, 9 +; NOOPT-NEXT: v_writelane_b32 v18, s5, 10 +; NOOPT-NEXT: v_writelane_b32 v18, s6, 11 +; NOOPT-NEXT: v_writelane_b32 v18, s7, 12 +; NOOPT-NEXT: v_writelane_b32 v18, s8, 13 +; NOOPT-NEXT: v_writelane_b32 v18, s9, 14 +; NOOPT-NEXT: v_writelane_b32 v18, s10, 15 +; NOOPT-NEXT: v_writelane_b32 v18, s11, 16 +; NOOPT-NEXT: v_writelane_b32 v18, s12, 17 +; NOOPT-NEXT: v_writelane_b32 v18, s13, 18 +; NOOPT-NEXT: v_writelane_b32 v18, s14, 19 +; NOOPT-NEXT: v_writelane_b32 v18, s15, 20 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v16, s15 -; NOOPT-NEXT: v_mov_b32_e32 v15, s14 -; NOOPT-NEXT: v_mov_b32_e32 v14, s13 -; NOOPT-NEXT: v_mov_b32_e32 v13, s12 -; NOOPT-NEXT: v_mov_b32_e32 v12, s11 -; NOOPT-NEXT: v_mov_b32_e32 v11, s10 -; NOOPT-NEXT: v_mov_b32_e32 v10, s9 -; NOOPT-NEXT: v_mov_b32_e32 v9, s8 -; NOOPT-NEXT: v_mov_b32_e32 v8, s7 -; NOOPT-NEXT: v_mov_b32_e32 v7, s6 -; NOOPT-NEXT: v_mov_b32_e32 v6, s5 -; NOOPT-NEXT: v_mov_b32_e32 v5, s4 -; NOOPT-NEXT: v_mov_b32_e32 v4, s3 -; NOOPT-NEXT: v_mov_b32_e32 v3, s2 -; NOOPT-NEXT: v_mov_b32_e32 v2, s1 -; NOOPT-NEXT: v_mov_b32_e32 v1, s0 -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill 
-; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v1, s1 +; NOOPT-NEXT: v_mov_b32_e32 v2, s2 +; NOOPT-NEXT: v_mov_b32_e32 v3, s3 +; NOOPT-NEXT: v_mov_b32_e32 v4, s4 +; NOOPT-NEXT: v_mov_b32_e32 v5, s5 +; NOOPT-NEXT: v_mov_b32_e32 v6, s6 +; NOOPT-NEXT: v_mov_b32_e32 v7, s7 +; NOOPT-NEXT: v_mov_b32_e32 v8, s8 +; NOOPT-NEXT: v_mov_b32_e32 v9, s9 +; NOOPT-NEXT: v_mov_b32_e32 v10, s10 +; NOOPT-NEXT: v_mov_b32_e32 v11, s11 +; NOOPT-NEXT: v_mov_b32_e32 v12, s12 +; NOOPT-NEXT: v_mov_b32_e32 v13, s13 +; NOOPT-NEXT: v_mov_b32_e32 v14, s14 +; NOOPT-NEXT: v_mov_b32_e32 v15, s15 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v5, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 21 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 22 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 21 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 22 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: ; implicit-def: $vgpr0 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:16 ; 4-byte Folded 
Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 23 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 24 -; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded 
Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 23 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 24 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, 
s[36:39], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 23 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 24 +; NOOPT-NEXT: v_writelane_b32 v18, s2, 23 +; NOOPT-NEXT: v_writelane_b32 v18, s3, 24 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB16_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 21 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 22 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 21 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 22 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: ;;#ASMSTART ; NOOPT-NEXT: s_mov_b32 s4, 17 ; NOOPT-NEXT: ;;#ASMEND ; NOOPT-NEXT: s_mov_b32 s16, s4 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 7 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 8 -; NOOPT-NEXT: v_readlane_b32 s4, v0, 9 -; NOOPT-NEXT: v_readlane_b32 s5, v0, 10 -; NOOPT-NEXT: v_readlane_b32 s6, v0, 11 -; NOOPT-NEXT: v_readlane_b32 s7, v0, 12 -; NOOPT-NEXT: v_readlane_b32 s8, v0, 13 -; NOOPT-NEXT: v_readlane_b32 s9, 
v0, 14 -; NOOPT-NEXT: v_readlane_b32 s10, v0, 15 -; NOOPT-NEXT: v_readlane_b32 s11, v0, 16 -; NOOPT-NEXT: v_readlane_b32 s12, v0, 17 -; NOOPT-NEXT: v_readlane_b32 s13, v0, 18 -; NOOPT-NEXT: v_readlane_b32 s14, v0, 19 -; NOOPT-NEXT: v_readlane_b32 s15, v0, 20 -; NOOPT-NEXT: v_writelane_b32 v0, s16, 25 -; NOOPT-NEXT: v_mov_b32_e32 v16, s15 -; NOOPT-NEXT: v_mov_b32_e32 v15, s14 -; NOOPT-NEXT: v_mov_b32_e32 v14, s13 -; NOOPT-NEXT: v_mov_b32_e32 v13, s12 -; NOOPT-NEXT: v_mov_b32_e32 v12, s11 -; NOOPT-NEXT: v_mov_b32_e32 v11, s10 -; NOOPT-NEXT: v_mov_b32_e32 v10, s9 -; NOOPT-NEXT: v_mov_b32_e32 v9, s8 -; NOOPT-NEXT: v_mov_b32_e32 v8, s7 -; NOOPT-NEXT: v_mov_b32_e32 v7, s6 -; NOOPT-NEXT: v_mov_b32_e32 v6, s5 -; NOOPT-NEXT: v_mov_b32_e32 v5, s4 -; NOOPT-NEXT: v_mov_b32_e32 v4, s3 -; NOOPT-NEXT: v_mov_b32_e32 v3, s2 -; NOOPT-NEXT: v_mov_b32_e32 v2, s1 -; NOOPT-NEXT: v_mov_b32_e32 v1, s0 -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, 
s[36:39], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: v_readlane_b32 s0, v18, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 6 +; NOOPT-NEXT: v_readlane_b32 s2, v18, 7 +; NOOPT-NEXT: v_readlane_b32 s3, v18, 8 +; NOOPT-NEXT: v_readlane_b32 s4, v18, 9 +; NOOPT-NEXT: v_readlane_b32 s5, v18, 10 +; NOOPT-NEXT: v_readlane_b32 s6, v18, 11 +; NOOPT-NEXT: v_readlane_b32 s7, v18, 12 +; NOOPT-NEXT: v_readlane_b32 s8, v18, 13 +; NOOPT-NEXT: v_readlane_b32 s9, v18, 14 +; NOOPT-NEXT: v_readlane_b32 s10, v18, 15 +; NOOPT-NEXT: v_readlane_b32 s11, v18, 16 +; NOOPT-NEXT: v_readlane_b32 s12, v18, 17 +; NOOPT-NEXT: v_readlane_b32 s13, v18, 18 +; NOOPT-NEXT: v_readlane_b32 s14, v18, 19 +; NOOPT-NEXT: v_readlane_b32 s15, v18, 20 +; NOOPT-NEXT: v_writelane_b32 v18, s16, 25 +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v1, s1 +; NOOPT-NEXT: v_mov_b32_e32 v2, s2 +; NOOPT-NEXT: v_mov_b32_e32 v3, s3 +; NOOPT-NEXT: v_mov_b32_e32 v4, s4 +; NOOPT-NEXT: v_mov_b32_e32 v5, s5 +; NOOPT-NEXT: v_mov_b32_e32 v6, s6 +; NOOPT-NEXT: v_mov_b32_e32 v7, s7 +; NOOPT-NEXT: v_mov_b32_e32 v8, s8 +; NOOPT-NEXT: v_mov_b32_e32 v9, s9 +; NOOPT-NEXT: v_mov_b32_e32 v10, s10 +; NOOPT-NEXT: v_mov_b32_e32 v11, s11 +; NOOPT-NEXT: v_mov_b32_e32 v12, s12 +; NOOPT-NEXT: v_mov_b32_e32 v13, s13 +; NOOPT-NEXT: v_mov_b32_e32 v14, s14 +; NOOPT-NEXT: v_mov_b32_e32 v15, s15 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword 
v4, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 26 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 27 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 26 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 27 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: ; implicit-def: $vgpr0 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload 
+; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 28 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 29 -; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:88 ; 4-byte 
Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 28 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 29 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:152 
; 4-byte Folded Spill +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:152 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 28 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 29 +; NOOPT-NEXT: v_writelane_b32 v18, s2, 28 +; NOOPT-NEXT: v_writelane_b32 v18, s3, 29 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB16_4 ; NOOPT-NEXT: ; %bb.5: ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 26 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 27 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 26 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 27 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 -; NOOPT-NEXT: 
v_readlane_b32 s6, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 -; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_store_dword v3, off, s[4:7], 0 -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v18, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v18, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v18, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v18, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v18, 3 ; NOOPT-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 30 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 31 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 30 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 31 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] @@ -5381,10 +5372,10 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: ; %bb.7: ; %bb1 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s4, v0, 25 +; NOOPT-NEXT: v_readlane_b32 s4, v18, 25 ; NOOPT-NEXT: ; 
implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 @@ -5401,13 +5392,12 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: .LBB16_8: ; %bb2 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 30 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 31 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 30 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 31 ; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] -; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block: @@ -5827,7 +5817,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s31, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s28, s28, s9 ; NOOPT-NEXT: s_addc_u32 s29, s29, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x9 ; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xd @@ -5841,12 +5830,13 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s21, s24 ; NOOPT-NEXT: s_mov_b32 s22, s19 ; NOOPT-NEXT: s_mov_b32 s23, s18 -; NOOPT-NEXT: v_writelane_b32 v16, s20, 0 -; NOOPT-NEXT: v_writelane_b32 v16, s21, 1 -; NOOPT-NEXT: v_writelane_b32 v16, s22, 2 -; NOOPT-NEXT: v_writelane_b32 v16, s23, 3 +; NOOPT-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v32, s20, 0 +; NOOPT-NEXT: v_writelane_b32 v32, s21, 1 +; NOOPT-NEXT: v_writelane_b32 v32, s22, 2 +; NOOPT-NEXT: v_writelane_b32 v32, s23, 3 ; NOOPT-NEXT: s_mov_b32 s20, 0 -; NOOPT-NEXT: 
v_writelane_b32 v16, s20, 4 +; NOOPT-NEXT: v_writelane_b32 v32, s20, 4 ; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 ; NOOPT-NEXT: s_mov_b32 s21, s18 ; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19 @@ -5890,115 +5880,113 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_mov_b32_e32 v14, s14 ; NOOPT-NEXT: v_mov_b32_e32 v15, s15 ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v16, s0, 5 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 6 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 5 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 6 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 
offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, 
s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 -; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded 
Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 8 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], 
s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded 
Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 +; NOOPT-NEXT: v_writelane_b32 v32, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v32, s3, 8 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; 
NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB17_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 6 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload @@ -6018,16 +6006,16 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: v_mov_b32_e32 v17, 63 -; NOOPT-NEXT: buffer_store_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v16, 63 +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec ; NOOPT-NEXT: s_waitcnt vmcnt(1) -; NOOPT-NEXT: v_writelane_b32 v16, s0, 9 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 10 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 9 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 
-; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill @@ -6047,193 +6035,186 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 -; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload -; 
NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 12 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill -; 
NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 
offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 11 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 12 +; NOOPT-NEXT: v_writelane_b32 v32, s2, 11 +; NOOPT-NEXT: v_writelane_b32 v32, s3, 12 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB17_4 ; NOOPT-NEXT: ; %bb.5: ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 10 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:220 
; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 -; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:220 ; 
4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) -; NOOPT-NEXT: v_mov_b32_e32 v6, v20 -; NOOPT-NEXT: v_mov_b32_e32 v7, v19 -; NOOPT-NEXT: v_mov_b32_e32 v8, v18 -; NOOPT-NEXT: v_mov_b32_e32 v2, v17 -; NOOPT-NEXT: s_waitcnt vmcnt(8) -; NOOPT-NEXT: v_mov_b32_e32 v3, v24 -; NOOPT-NEXT: v_mov_b32_e32 v4, v23 -; NOOPT-NEXT: v_mov_b32_e32 v5, v22 -; NOOPT-NEXT: v_mov_b32_e32 v9, v21 -; NOOPT-NEXT: s_waitcnt vmcnt(4) -; NOOPT-NEXT: v_mov_b32_e32 v14, v28 -; NOOPT-NEXT: v_mov_b32_e32 v15, v27 -; NOOPT-NEXT: v_mov_b32_e32 v16, v26 -; NOOPT-NEXT: v_mov_b32_e32 v10, v25 -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v11, v32 -; NOOPT-NEXT: 
v_mov_b32_e32 v12, v31 -; NOOPT-NEXT: v_mov_b32_e32 v13, v30 -; NOOPT-NEXT: v_mov_b32_e32 v17, v29 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v32, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v32, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v32, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v32, 3 +; NOOPT-NEXT: v_mov_b32_e32 v5, v19 +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v16 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v18, v13 -; NOOPT-NEXT: v_mov_b32_e32 v19, v12 -; NOOPT-NEXT: v_mov_b32_e32 v20, v11 -; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v17, v12 +; NOOPT-NEXT: v_mov_b32_e32 v18, v11 +; NOOPT-NEXT: v_mov_b32_e32 v19, v10 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v11, v16 -; NOOPT-NEXT: v_mov_b32_e32 v12, v15 -; NOOPT-NEXT: v_mov_b32_e32 v13, 
v14 -; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v10, v15 +; NOOPT-NEXT: v_mov_b32_e32 v11, v14 +; NOOPT-NEXT: v_mov_b32_e32 v12, v13 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v10, v5 -; NOOPT-NEXT: v_mov_b32_e32 v11, v4 -; NOOPT-NEXT: v_mov_b32_e32 v12, v3 -; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v9, v4 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v3, v8 -; NOOPT-NEXT: v_mov_b32_e32 v4, v7 -; NOOPT-NEXT: v_mov_b32_e32 v5, v6 -; NOOPT-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: v_mov_b32_e32 v4, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: 
v_writelane_b32 v0, s0, 13 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 14 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 13 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 14 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execz .LBB17_8 ; NOOPT-NEXT: ; %bb.7: ; %bb1 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s6, s1 @@ -6251,13 +6232,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: .LBB17_8: ; %bb2 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 13 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 14 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 13 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 14 ; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] -; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block: @@ -7279,28 +7259,28 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s12, s12, s9 ; NOOPT-NEXT: s_addc_u32 s13, s13, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_load_dword s2, s[2:3], 0x9 ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr3 ; NOOPT-NEXT: s_mov_b32 s3, 0 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: 
s_cmp_lg_u32 s2, s3 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v4, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v4, s1, 1 ; NOOPT-NEXT: s_mov_b64 s[8:9], exec ; NOOPT-NEXT: s_mov_b64 exec, -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: s_cbranch_scc1 .LBB19_3 ; NOOPT-NEXT: .LBB19_1: ; %Flow ; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr2 ; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 @@ -7330,7 +7310,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: .LBB19_3: ; %bb4 ; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s6, s1 @@ -7342,24 +7322,21 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; 
NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ;;#ASMSTART -; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ; reg use v[0:3] ; NOOPT-NEXT: ;;#ASMEND ; NOOPT-NEXT: s_mov_b64 s[0:1], 0 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v4, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v4, s1, 1 ; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: s_branch .LBB19_1 ; NOOPT-NEXT: .LBB19_4: ; %bb7 -; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s7, s1 @@ -7371,10 +7348,9 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b32 s1, s7 ; NOOPT-NEXT: s_mov_b32 s2, s6 ; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: v_mov_b32_e32 v1, s4 -; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: extract_adjacent_blocks: @@ -7525,7 +7501,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s16, s16, s9 ; NOOPT-NEXT: s_addc_u32 s17, s17, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] ; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x9 ; NOOPT-NEXT: s_load_dword s0, s[0:1], 0xa @@ -7534,21 +7509,22 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: s_mov_b32 s3, 0 ; 
NOOPT-NEXT: s_cmp_lg_u32 s2, s3 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v4, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v4, s1, 1 ; NOOPT-NEXT: s_mov_b64 s[12:13], exec ; NOOPT-NEXT: s_mov_b64 exec, -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_cbranch_scc1 .LBB20_3 ; NOOPT-NEXT: .LBB20_1: ; %Flow ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 @@ -7579,7 +7555,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: .LBB20_3: ; %bb4 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s6, s1 @@ -7591,25 +7567,22 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc ; NOOPT-NEXT: 
s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ;;#ASMSTART -; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ; reg use v[0:3] ; NOOPT-NEXT: ;;#ASMEND ; NOOPT-NEXT: s_mov_b64 s[0:1], 0 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v4, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v4, s1, 1 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_branch .LBB20_1 ; NOOPT-NEXT: .LBB20_4: ; %bb7 -; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s10, s1 @@ -7621,13 +7594,12 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b32 s1, s10 ; NOOPT-NEXT: s_mov_b32 s2, s9 ; NOOPT-NEXT: s_mov_b32 s3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v1, s4 -; NOOPT-NEXT: v_mov_b32_e32 v2, s5 -; NOOPT-NEXT: v_mov_b32_e32 v3, s6 -; NOOPT-NEXT: v_mov_b32_e32 v4, s7 -; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v1, s5 +; NOOPT-NEXT: v_mov_b32_e32 v2, s6 +; NOOPT-NEXT: v_mov_b32_e32 v3, s7 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_adjacent_blocks: @@ -9084,49 +9056,48 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000 ; 
NOOPT-NEXT: s_add_u32 s24, s24, s9 ; NOOPT-NEXT: s_addc_u32 s25, s25, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_load_dword s1, s[2:3], 0x9 ; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xa +; NOOPT-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: v_writelane_b32 v0, s1, 0 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 0 ; NOOPT-NEXT: s_mov_b32 s1, 8 -; NOOPT-NEXT: v_writelane_b32 v0, s0, 1 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 1 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, 8 -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: .LBB26_1: ; %bb2 ; NOOPT-NEXT: ; =>This Loop Header: Depth=1 ; NOOPT-NEXT: ; Child Loop BB26_3 Depth 2 -; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s2, v0, 0 -; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: v_readlane_b32 s2, v18, 0 ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v1, s2 -; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, s2 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 ; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] -; NOOPT-NEXT: buffer_store_dword v1, off, 
s[24:27], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: v_writelane_b32 v18, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 3 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_cbranch_vccnz .LBB26_6 ; NOOPT-NEXT: ; %bb.2: ; %bb4 ; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 ; NOOPT-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3 ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -9137,7 +9108,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_mov_b32 s5, s3 ; NOOPT-NEXT: s_mov_b32 s6, s2 ; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:80 ; 4-byte Folded Spill @@ -9159,13 +9129,13 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: v_mov_b32_e32 v13, s17 ; NOOPT-NEXT: v_mov_b32_e32 v14, s18 ; NOOPT-NEXT: v_mov_b32_e32 v15, s19 -; NOOPT-NEXT: v_mov_b32_e32 v17, s0 -; NOOPT-NEXT: buffer_store_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v16, s0 +; NOOPT-NEXT: buffer_store_dword v16, 
off, s[24:27], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v18, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 5 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill @@ -9186,146 +9156,139 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1 ; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2 -; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 
4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt 
expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 7 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill -; 
NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:72 ; 
4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v18, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v18, s3, 7 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB26_3 ; NOOPT-NEXT: ; %bb.4: ; in Loop: Header=BB26_1 Depth=1 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v18, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 5 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; 
%bb.5: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, 
off, s[24:27], 0 offset:96 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 s[0:1], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(14) -; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v18, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v18, s1, 3 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: .LBB26_6: ; %Flow ; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v1, 
off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v1, 2 -; NOOPT-NEXT: v_readlane_b32 s1, v1, 3 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: v_readlane_b32 s0, v18, 2 +; NOOPT-NEXT: v_readlane_b32 s1, v18, 3 ; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 ; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: s_cbranch_vccnz .LBB26_1 ; NOOPT-NEXT: ; %bb.7: ; %bb8 -; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: broken_phi_bb: @@ -9570,13 +9533,13 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s16, s16, s5 ; NOOPT-NEXT: s_addc_u32 s17, s17, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_writelane_b32 v16, s4, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v33, s4, 0 ; NOOPT-NEXT: s_mov_b32 s4, s1 -; NOOPT-NEXT: v_readlane_b32 s1, v16, 0 -; NOOPT-NEXT: v_writelane_b32 v16, s4, 1 +; NOOPT-NEXT: v_readlane_b32 
s1, v33, 0 +; NOOPT-NEXT: v_writelane_b32 v33, s4, 1 ; NOOPT-NEXT: s_mov_b32 s4, s0 -; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 1 ; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill ; NOOPT-NEXT: v_mov_b32_e32 v2, v1 ; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 @@ -9591,17 +9554,17 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b32 s8, 0xf000 ; NOOPT-NEXT: s_mov_b32 s0, 0 -; NOOPT-NEXT: v_writelane_b32 v16, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v33, s0, 2 ; NOOPT-NEXT: s_mov_b32 s2, s0 ; NOOPT-NEXT: s_mov_b32 s3, s8 ; NOOPT-NEXT: s_mov_b32 s8, s0 ; NOOPT-NEXT: s_mov_b32 s9, s0 ; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 ; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] -; NOOPT-NEXT: v_writelane_b32 v16, s8, 3 -; NOOPT-NEXT: v_writelane_b32 v16, s9, 4 -; NOOPT-NEXT: v_writelane_b32 v16, s10, 5 -; NOOPT-NEXT: v_writelane_b32 v16, s11, 6 +; NOOPT-NEXT: v_writelane_b32 v33, s8, 3 +; NOOPT-NEXT: v_writelane_b32 v33, s9, 4 +; NOOPT-NEXT: v_writelane_b32 v33, s10, 5 +; NOOPT-NEXT: v_writelane_b32 v33, s11, 6 ; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 ; NOOPT-NEXT: s_waitcnt expcnt(1) @@ -9611,7 +9574,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 -; NOOPT-NEXT: v_mov_b32_e32 v31, s0 ; NOOPT-NEXT: v_mov_b32_e32 v30, s0 ; NOOPT-NEXT: v_mov_b32_e32 v29, s0 ; NOOPT-NEXT: v_mov_b32_e32 v28, s0 @@ -9626,22 +9588,23 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: v_mov_b32_e32 
v19, s0 ; NOOPT-NEXT: v_mov_b32_e32 v18, s0 ; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: v_mov_b32_e32 v16, s0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v31 -; NOOPT-NEXT: v_mov_b32_e32 v2, v30 -; NOOPT-NEXT: v_mov_b32_e32 v3, v29 -; NOOPT-NEXT: v_mov_b32_e32 v4, v28 -; NOOPT-NEXT: v_mov_b32_e32 v5, v27 -; NOOPT-NEXT: v_mov_b32_e32 v6, v26 -; NOOPT-NEXT: v_mov_b32_e32 v7, v25 -; NOOPT-NEXT: v_mov_b32_e32 v8, v24 -; NOOPT-NEXT: v_mov_b32_e32 v9, v23 -; NOOPT-NEXT: v_mov_b32_e32 v10, v22 -; NOOPT-NEXT: v_mov_b32_e32 v11, v21 -; NOOPT-NEXT: v_mov_b32_e32 v12, v20 -; NOOPT-NEXT: v_mov_b32_e32 v13, v19 -; NOOPT-NEXT: v_mov_b32_e32 v14, v18 -; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill @@ -9659,207 +9622,200 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: 
v_writelane_b32 v16, s0, 7 -; NOOPT-NEXT: v_writelane_b32 v16, s1, 8 +; NOOPT-NEXT: v_writelane_b32 v33, s0, 7 +; NOOPT-NEXT: v_writelane_b32 v33, s1, 8 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, 
off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 -; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 -; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 
0 offset:16 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, 
s[16:19], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 10 +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 -; NOOPT-NEXT: v_movreld_b32_e32 v2, v17 -; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:164 ; 
4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 
offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: v_movreld_b32_e32 v1, v16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:8 ; 4-byte 
Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v0, s2, 9 -; NOOPT-NEXT: v_writelane_b32 v0, s3, 10 +; NOOPT-NEXT: v_writelane_b32 v33, s2, 9 +; NOOPT-NEXT: v_writelane_b32 v33, s3, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execnz .LBB27_1 ; NOOPT-NEXT: ; %bb.2: ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword 
v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 8 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 -; NOOPT-NEXT: 
buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 3 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 5 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 6 -; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload -; NOOPT-NEXT: 
s_waitcnt vmcnt(12) -; NOOPT-NEXT: v_mov_b32_e32 v7, v21 -; NOOPT-NEXT: v_mov_b32_e32 v8, v20 -; NOOPT-NEXT: v_mov_b32_e32 v9, v19 -; NOOPT-NEXT: v_mov_b32_e32 v1, v18 -; NOOPT-NEXT: s_waitcnt vmcnt(8) -; NOOPT-NEXT: v_mov_b32_e32 v2, v25 -; NOOPT-NEXT: v_mov_b32_e32 v3, v24 -; NOOPT-NEXT: v_mov_b32_e32 v4, v23 -; NOOPT-NEXT: v_mov_b32_e32 v10, v22 -; NOOPT-NEXT: s_waitcnt vmcnt(4) -; NOOPT-NEXT: v_mov_b32_e32 v15, v29 -; NOOPT-NEXT: v_mov_b32_e32 v16, v28 -; NOOPT-NEXT: v_mov_b32_e32 v17, v27 -; NOOPT-NEXT: v_mov_b32_e32 v11, v26 -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v12, v33 -; NOOPT-NEXT: v_mov_b32_e32 v13, v32 -; NOOPT-NEXT: v_mov_b32_e32 v14, v31 -; NOOPT-NEXT: v_mov_b32_e32 v18, v30 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v33, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v33, 6 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v8, v18 +; NOOPT-NEXT: v_mov_b32_e32 v0, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v9, v21 +; NOOPT-NEXT: v_mov_b32_e32 v14, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: v_mov_b32_e32 v16, v26 +; NOOPT-NEXT: v_mov_b32_e32 v10, v25 +; NOOPT-NEXT: v_mov_b32_e32 v11, v32 +; NOOPT-NEXT: v_mov_b32_e32 v12, v31 +; NOOPT-NEXT: v_mov_b32_e32 v13, v30 +; NOOPT-NEXT: v_mov_b32_e32 v17, v29 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v19, v14 -; NOOPT-NEXT: v_mov_b32_e32 v20, v13 -; NOOPT-NEXT: v_mov_b32_e32 v21, v12 -; NOOPT-NEXT: v_mov_b32_e32 v13, v6 +; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec +; NOOPT-NEXT: 
v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 ; NOOPT-NEXT: v_mov_b32_e32 v12, v5 -; NOOPT-NEXT: buffer_store_dwordx4 v[18:21], v[12:13], s[0:3], 0 addr64 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v11, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12_vgpr13_vgpr14 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v12, v17 -; NOOPT-NEXT: v_mov_b32_e32 v13, v16 -; NOOPT-NEXT: v_mov_b32_e32 v14, v15 -; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v11, v16 +; NOOPT-NEXT: v_mov_b32_e32 v12, v15 +; NOOPT-NEXT: v_mov_b32_e32 v13, v14 ; NOOPT-NEXT: v_mov_b32_e32 v15, v5 -; NOOPT-NEXT: buffer_store_dwordx4 v[11:14], v[15:16], s[0:3], 0 addr64 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v14, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[14:15], s[0:3], 0 addr64 offset:32 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v11, v4 -; NOOPT-NEXT: v_mov_b32_e32 v12, v3 -; NOOPT-NEXT: v_mov_b32_e32 v13, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: v_mov_b32_e32 v12, v1 +; NOOPT-NEXT: v_mov_b32_e32 v1, v4 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v6 -; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[2:3], s[0:3], 0 addr64 offset:16 +; NOOPT-NEXT: buffer_store_dwordx4 
v[9:12], v[1:2], s[0:3], 0 addr64 offset:16 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v2, v9 -; NOOPT-NEXT: v_mov_b32_e32 v3, v8 -; NOOPT-NEXT: v_mov_b32_e32 v4, v7 -; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 -; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v8 +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_or_disj_index: diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll index f20d720c3876b..dce4162c24624 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll @@ -41,7 +41,7 @@ entry: } ; FIXME: This should warn too -; ERR-NOT: warning +; ERR-NOT: warning: inline asm clobber list contains reserved registers define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) { entry: %exec = call i64 asm sideeffect "; def $0", "={exec}"() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 9d1368b2ec105..e7c77d3123e82 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -3,6 +3,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() +define <2 x i64> @f1() #0 { +; GFX11-LABEL: f1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_setpc_b64 
s[30:31] + ret <2 x i64> zeroinitializer +} + define void @f0() { ; GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb @@ -36,18 +48,6 @@ bb: ret void } -define <2 x i64> @f1() #0 { -; GFX11-LABEL: f1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - ret <2 x i64> zeroinitializer -} - ; FIXME: This generates "instid1(/* invalid instid value */)". define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index 9e336a714ca67..eef51acc4e12e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -7,13 +7,13 @@ define fastcc i32 @foo() { ; CHECK-LABEL: name: foo ; CHECK: bb.0 (%ir-block.0): ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $vgpr40, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_WAITCNT 0 ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 ; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: 
$exec_lo = S_MOV_B32 killed $sgpr17 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 @@ -26,24 +26,22 @@ define fastcc i32 @foo() { ; CHECK-NEXT: BUFFER_GL1_INV implicit $exec ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec ; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr40 - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40 ; CHECK-NEXT: S_WAITCNT 49279 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.1): ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; CHECK-NEXT: liveins: $vcc_lo, $vgpr40 + ; CHECK-NEXT: liveins: $vcc_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.DummyReturnBlock: - ; CHECK-NEXT: liveins: $vgpr40 - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 killed $vgpr40, 2 + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index f771536463778..ea18e0d9eeefb 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1010,73 +1010,73 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s9, s4, 0xf0001 ; GCN-NEXT: s_lshr_b32 s42, s5, 16 -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: v_writelane_b32 v6, s0, 0 +; GCN-NEXT: v_writelane_b32 v6, s1, 1 ; GCN-NEXT: s_lshr_b32 s0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-NEXT: v_writelane_b32 v6, s0, 2 ; GCN-NEXT: s_lshr_b32 s0, s4, 17 -; GCN-NEXT: v_writelane_b32 v0, s0, 3 +; GCN-NEXT: v_writelane_b32 v6, s0, 3 ; GCN-NEXT: s_lshr_b32 s0, s4, 18 -; GCN-NEXT: v_writelane_b32 v0, s0, 4 +; GCN-NEXT: v_writelane_b32 v6, s0, 4 ; GCN-NEXT: s_lshr_b32 s0, s4, 19 -; GCN-NEXT: v_writelane_b32 v0, s0, 5 +; GCN-NEXT: v_writelane_b32 v6, s0, 5 ; GCN-NEXT: s_lshr_b32 s0, s4, 20 -; GCN-NEXT: v_writelane_b32 v0, s0, 6 +; GCN-NEXT: v_writelane_b32 v6, s0, 6 ; GCN-NEXT: s_lshr_b32 s0, s4, 21 -; GCN-NEXT: v_writelane_b32 v0, s0, 7 +; GCN-NEXT: v_writelane_b32 v6, s0, 7 ; GCN-NEXT: s_lshr_b32 s0, s4, 22 -; GCN-NEXT: v_writelane_b32 v0, s0, 8 +; GCN-NEXT: v_writelane_b32 v6, s0, 8 ; GCN-NEXT: s_lshr_b32 s0, s4, 23 -; GCN-NEXT: v_writelane_b32 v0, s0, 9 +; GCN-NEXT: v_writelane_b32 v6, s0, 9 ; GCN-NEXT: s_lshr_b32 s0, s4, 24 -; GCN-NEXT: v_writelane_b32 v0, s0, 10 +; 
GCN-NEXT: v_writelane_b32 v6, s0, 10 ; GCN-NEXT: s_lshr_b32 s0, s4, 25 -; GCN-NEXT: v_writelane_b32 v0, s0, 11 +; GCN-NEXT: v_writelane_b32 v6, s0, 11 ; GCN-NEXT: s_lshr_b32 s0, s4, 26 -; GCN-NEXT: v_writelane_b32 v0, s0, 12 +; GCN-NEXT: v_writelane_b32 v6, s0, 12 ; GCN-NEXT: s_lshr_b32 s0, s4, 27 -; GCN-NEXT: v_writelane_b32 v0, s0, 13 +; GCN-NEXT: v_writelane_b32 v6, s0, 13 ; GCN-NEXT: s_lshr_b32 s0, s4, 28 -; GCN-NEXT: v_writelane_b32 v0, s0, 14 +; GCN-NEXT: v_writelane_b32 v6, s0, 14 ; GCN-NEXT: s_lshr_b32 s0, s4, 29 -; GCN-NEXT: v_writelane_b32 v0, s0, 15 +; GCN-NEXT: v_writelane_b32 v6, s0, 15 ; GCN-NEXT: s_lshr_b32 s0, s4, 30 -; GCN-NEXT: v_writelane_b32 v0, s0, 16 +; GCN-NEXT: v_writelane_b32 v6, s0, 16 ; GCN-NEXT: s_lshr_b32 s0, s4, 31 -; GCN-NEXT: v_writelane_b32 v0, s0, 17 -; GCN-NEXT: v_writelane_b32 v0, s9, 18 +; GCN-NEXT: v_writelane_b32 v6, s0, 17 +; GCN-NEXT: v_writelane_b32 v6, s9, 18 ; GCN-NEXT: s_bfe_u32 s9, s4, 0xe0002 -; GCN-NEXT: v_writelane_b32 v0, s9, 19 +; GCN-NEXT: v_writelane_b32 v6, s9, 19 ; GCN-NEXT: s_bfe_u32 s9, s4, 0xd0003 -; GCN-NEXT: v_writelane_b32 v0, s9, 20 +; GCN-NEXT: v_writelane_b32 v6, s9, 20 ; GCN-NEXT: s_bfe_u32 s9, s4, 0xc0004 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v6, s9, 21 ; GCN-NEXT: s_bfe_u32 s9, s4, 0xb0005 -; GCN-NEXT: v_writelane_b32 v0, s9, 22 +; GCN-NEXT: v_writelane_b32 v6, s9, 22 ; GCN-NEXT: s_bfe_u32 s9, s4, 0xa0006 -; GCN-NEXT: v_writelane_b32 v0, s9, 23 +; GCN-NEXT: v_writelane_b32 v6, s9, 23 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x90007 -; GCN-NEXT: v_writelane_b32 v0, s9, 24 +; GCN-NEXT: v_writelane_b32 v6, s9, 24 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GCN-NEXT: v_writelane_b32 v0, s9, 25 +; GCN-NEXT: v_writelane_b32 v6, s9, 25 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x70009 -; GCN-NEXT: v_writelane_b32 v0, s9, 26 +; GCN-NEXT: v_writelane_b32 v6, s9, 26 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x6000a -; GCN-NEXT: v_writelane_b32 v0, s9, 27 +; GCN-NEXT: v_writelane_b32 v6, s9, 27 ; GCN-NEXT: s_bfe_u32 
s9, s4, 0x5000b -; GCN-NEXT: v_writelane_b32 v0, s9, 28 +; GCN-NEXT: v_writelane_b32 v6, s9, 28 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x4000c -; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v6, s9, 29 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x3000d -; GCN-NEXT: v_writelane_b32 v0, s9, 30 +; GCN-NEXT: v_writelane_b32 v6, s9, 30 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x2000e -; GCN-NEXT: v_writelane_b32 v0, s9, 31 +; GCN-NEXT: v_writelane_b32 v6, s9, 31 ; GCN-NEXT: s_bfe_u32 s9, s4, 0x1000f -; GCN-NEXT: v_writelane_b32 v0, s9, 32 +; GCN-NEXT: v_writelane_b32 v6, s9, 32 ; GCN-NEXT: s_bfe_u32 s9, s5, 0xf0001 ; GCN-NEXT: s_lshr_b32 s43, s5, 17 ; GCN-NEXT: s_lshr_b32 s45, s5, 18 @@ -1125,7 +1125,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s2, s7, 29 ; GCN-NEXT: s_lshr_b32 s1, s7, 30 ; GCN-NEXT: s_lshr_b32 s0, s7, 31 -; GCN-NEXT: v_writelane_b32 v0, s9, 33 +; GCN-NEXT: v_writelane_b32 v6, s9, 33 ; GCN-NEXT: s_bfe_u32 s40, s5, 0xe0002 ; GCN-NEXT: s_bfe_u32 s41, s5, 0xd0003 ; GCN-NEXT: s_bfe_u32 s44, s5, 0xc0004 @@ -1630,7 +1630,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 33 -; GCN-NEXT: v_readlane_b32 s9, v0, 33 +; GCN-NEXT: v_readlane_b32 s9, v6, 33 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 1 ; GCN-NEXT: s_or_b32 s5, s5, s9 @@ -1643,21 +1643,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: s_cmp_lg_u32 s8, 31 -; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s1, v6, 17 ; GCN-NEXT: s_cselect_b32 s1, s1, 1 ; GCN-NEXT: s_lshl_b32 s1, s1, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 30 -; GCN-NEXT: v_readlane_b32 s2, v0, 16 +; GCN-NEXT: v_readlane_b32 s2, v6, 16 ; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; 
GCN-NEXT: s_lshl_b32 s2, s2, 2 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_cmp_lg_u32 s8, 29 -; GCN-NEXT: v_readlane_b32 s2, v0, 15 +; GCN-NEXT: v_readlane_b32 s2, v6, 15 ; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 28 -; GCN-NEXT: v_readlane_b32 s3, v0, 14 +; GCN-NEXT: v_readlane_b32 s3, v6, 14 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_and_b32 s3, s3, 1 ; GCN-NEXT: s_or_b32 s2, s3, s2 @@ -1665,21 +1665,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s1, s2, s1 ; GCN-NEXT: s_lshl_b32 s1, s1, 12 ; GCN-NEXT: s_cmp_lg_u32 s8, 27 -; GCN-NEXT: v_readlane_b32 s2, v0, 13 +; GCN-NEXT: v_readlane_b32 s2, v6, 13 ; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 26 -; GCN-NEXT: v_readlane_b32 s3, v0, 12 +; GCN-NEXT: v_readlane_b32 s3, v6, 12 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_and_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 2 ; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_cmp_lg_u32 s8, 25 -; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: v_readlane_b32 s3, v6, 11 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 24 -; GCN-NEXT: v_readlane_b32 s5, v0, 10 +; GCN-NEXT: v_readlane_b32 s5, v6, 10 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -1689,21 +1689,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_cmp_lg_u32 s8, 23 -; GCN-NEXT: v_readlane_b32 s2, v0, 9 +; GCN-NEXT: v_readlane_b32 s2, v6, 9 ; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 22 -; GCN-NEXT: v_readlane_b32 s3, v0, 8 +; GCN-NEXT: v_readlane_b32 s3, v6, 8 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_and_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 2 ; 
GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_cmp_lg_u32 s8, 21 -; GCN-NEXT: v_readlane_b32 s3, v0, 7 +; GCN-NEXT: v_readlane_b32 s3, v6, 7 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 20 -; GCN-NEXT: v_readlane_b32 s5, v0, 6 +; GCN-NEXT: v_readlane_b32 s5, v6, 6 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -1711,21 +1711,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: s_lshl_b32 s2, s2, 4 ; GCN-NEXT: s_cmp_lg_u32 s8, 19 -; GCN-NEXT: v_readlane_b32 s3, v0, 5 +; GCN-NEXT: v_readlane_b32 s3, v6, 5 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 18 -; GCN-NEXT: v_readlane_b32 s5, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v6, 4 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 2 ; GCN-NEXT: s_or_b32 s3, s3, s5 ; GCN-NEXT: s_cmp_lg_u32 s8, 17 -; GCN-NEXT: v_readlane_b32 s5, v0, 3 +; GCN-NEXT: v_readlane_b32 s5, v6, 3 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 16 -; GCN-NEXT: v_readlane_b32 s9, v0, 2 +; GCN-NEXT: v_readlane_b32 s9, v6, 2 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_or_b32 s5, s9, s5 @@ -1737,21 +1737,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s1, s2, s1 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_readlane_b32 s2, v0, 32 +; GCN-NEXT: v_readlane_b32 s2, v6, 32 ; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 14 -; GCN-NEXT: v_readlane_b32 s3, v0, 31 +; GCN-NEXT: v_readlane_b32 s3, v6, 31 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_and_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 2 ; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: 
s_cmp_lg_u32 s8, 13 -; GCN-NEXT: v_readlane_b32 s3, v0, 30 +; GCN-NEXT: v_readlane_b32 s3, v6, 30 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 12 -; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: v_readlane_b32 s5, v6, 29 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -1759,21 +1759,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: s_lshl_b32 s2, s2, 12 ; GCN-NEXT: s_cmp_lg_u32 s8, 11 -; GCN-NEXT: v_readlane_b32 s3, v0, 28 +; GCN-NEXT: v_readlane_b32 s3, v6, 28 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 10 -; GCN-NEXT: v_readlane_b32 s5, v0, 27 +; GCN-NEXT: v_readlane_b32 s5, v6, 27 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 2 ; GCN-NEXT: s_or_b32 s3, s3, s5 ; GCN-NEXT: s_cmp_lg_u32 s8, 9 -; GCN-NEXT: v_readlane_b32 s5, v0, 26 +; GCN-NEXT: v_readlane_b32 s5, v6, 26 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 8 -; GCN-NEXT: v_readlane_b32 s9, v0, 25 +; GCN-NEXT: v_readlane_b32 s9, v6, 25 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_or_b32 s5, s9, s5 @@ -1783,21 +1783,21 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshl_b32 s3, s3, 8 ; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_readlane_b32 s3, v0, 24 +; GCN-NEXT: v_readlane_b32 s3, v6, 24 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_lshl_b32 s3, s3, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 6 -; GCN-NEXT: v_readlane_b32 s5, v0, 23 +; GCN-NEXT: v_readlane_b32 s5, v6, 23 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 2 ; GCN-NEXT: s_or_b32 s3, s3, s5 ; GCN-NEXT: s_cmp_lg_u32 s8, 5 -; GCN-NEXT: 
v_readlane_b32 s5, v0, 22 +; GCN-NEXT: v_readlane_b32 s5, v6, 22 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 4 -; GCN-NEXT: v_readlane_b32 s9, v0, 21 +; GCN-NEXT: v_readlane_b32 s9, v6, 21 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_or_b32 s5, s9, s5 @@ -1805,11 +1805,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s3, s5, s3 ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: v_readlane_b32 s5, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v6, 20 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 3 ; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: v_readlane_b32 s9, v0, 19 +; GCN-NEXT: v_readlane_b32 s9, v6, 19 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 2 @@ -1818,7 +1818,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_cselect_b32 s4, s4, 1 ; GCN-NEXT: s_and_b32 s4, s4, 1 ; GCN-NEXT: s_cmp_lg_u32 s8, 1 -; GCN-NEXT: v_readlane_b32 s8, v0, 18 +; GCN-NEXT: v_readlane_b32 s8, v6, 18 ; GCN-NEXT: s_cselect_b32 s8, s8, 1 ; GCN-NEXT: s_lshl_b32 s8, s8, 1 ; GCN-NEXT: s_or_b32 s4, s4, s8 @@ -1830,16 +1830,15 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_mov_b32_e32 v6, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_mov_b32_e32 v4, s7 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: flat_store_dwordx4 v[5:6], v[1:4] -; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_readlane_b32 s0, v6, 0 +; GCN-NEXT: v_readlane_b32 s1, v6, 1 +; 
GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <128 x i1> %vec, i1 1, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index b49931379b84a..957f404c8cdbe 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 37 +; GCN: ; TotalNumSgprs: 37 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -48,7 +48,7 @@ define amdgpu_kernel void @kernel_call() #0 { ; GCN-NOT: readlane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 10 define void @func_regular_call() #1 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -64,7 +64,7 @@ define void @func_regular_call() #1 { ; GCN-NEXT: s_addc_u32 s17, ; GCN-NEXT: s_setpc_b64 s[16:17] -; GCN: ; NumSgprs: 32 +; GCN: ; TotalNumSgprs: 32 ; GCN: ; NumVgprs: 8 define void @func_tail_call() #1 { tail call void @func() @@ -77,7 +77,7 @@ define void @func_tail_call() #1 { ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: s_setpc_b64 -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 10 define void @func_call_tail_call() #1 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -105,13 +105,6 @@ define void @test_funcx2() #0 { ret void } -; GCN-LABEL: {{^}}wombat: -define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) { -bb: - call void @hoge() #0 - ret void -} - ; Make sure we save/restore the return address around the call. 
; Function Attrs: norecurse define internal void @hoge() #2 { @@ -128,6 +121,13 @@ bb: ret void } +; GCN-LABEL: {{^}}wombat: +define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) { +bb: + call void @hoge() #0 + ret void +} + declare dso_local void @eggs() diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index ec446f1f3bf27..7b195f8e86220 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,22 +13,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v0, s8, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s8, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND @@ -62,14 +54,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; 
CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -77,24 +64,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b32 v1, v2 -; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: ds_write_b32 v0, v1 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm %arr = alloca < 1339 x i32>, align 8192, addrspace(5) %cmp = icmp ne i32 %val, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 7bf1b8746fd87..b192fdec15739 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ 
-123,6 +123,8 @@ ; GCN-O0-NEXT: SI Pre-allocate WWM Registers ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI Lower WWM Copies +; GCN-O0-NEXT: AMDGPU Reserve WWM Registers +; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis ; GCN-O0-NEXT: Fixup Statepoint Caller Saved @@ -149,12 +151,9 @@ ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter ; GCN-O0-NEXT: Stack Frame Layout Analysis -; GCN-O0-NEXT: Function register usage analysis -; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O0-NEXT: Machine Optimization Remark Emitter -; GCN-O0-NEXT: AMDGPU Assembly Printer -; GCN-O0-NEXT: Free MachineFunction +; GCN-O0-NEXT: Function register usage analysis +; GCN-O0-NEXT: AMDGPU Assembly Printer +; GCN-O0-NEXT: Free MachineFunction ; GCN-O1:Target Library Information ; GCN-O1-NEXT:Target Pass Configuration @@ -373,6 +372,11 @@ ; GCN-O1-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: SI Lower WWM Copies +; GCN-O1-NEXT: Virtual Register Rewriter +; GCN-O1-NEXT: AMDGPU Reserve WWM Registers +; GCN-O1-NEXT: Virtual Register Map +; GCN-O1-NEXT: Live Register Matrix +; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load @@ -427,12 +431,9 @@ ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Stack Frame Layout Analysis -; GCN-O1-NEXT: Function register usage analysis -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-NEXT: Machine Optimization Remark Emitter -; GCN-O1-NEXT: AMDGPU Assembly Printer -; GCN-O1-NEXT: Free MachineFunction +; GCN-O1-NEXT: Function register usage analysis +; GCN-O1-NEXT: AMDGPU Assembly Printer +; GCN-O1-NEXT: 
Free MachineFunction ; GCN-O1-OPTS:Target Library Information ; GCN-O1-OPTS-NEXT:Target Pass Configuration @@ -679,6 +680,11 @@ ; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers ; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: SI Lower WWM Copies +; GCN-O1-OPTS-NEXT: Virtual Register Rewriter +; GCN-O1-OPTS-NEXT: AMDGPU Reserve WWM Registers +; GCN-O1-OPTS-NEXT: Virtual Register Map +; GCN-O1-OPTS-NEXT: Live Register Matrix +; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load @@ -733,12 +739,9 @@ ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis -; GCN-O1-OPTS-NEXT: Function register usage analysis -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter -; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer -; GCN-O1-OPTS-NEXT: Free MachineFunction +; GCN-O1-OPTS-NEXT: Function register usage analysis +; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer +; GCN-O1-OPTS-NEXT: Free MachineFunction ; GCN-O2:Target Library Information ; GCN-O2-NEXT:Target Pass Configuration @@ -991,6 +994,11 @@ ; GCN-O2-NEXT: SI Pre-allocate WWM Registers ; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: SI Lower WWM Copies +; GCN-O2-NEXT: Virtual Register Rewriter +; GCN-O2-NEXT: AMDGPU Reserve WWM Registers +; GCN-O2-NEXT: Virtual Register Map +; GCN-O2-NEXT: Live Register Matrix +; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load @@ -1045,12 +1053,9 @@ ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Stack Frame Layout Analysis -; GCN-O2-NEXT: Function register usage 
analysis -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O2-NEXT: Machine Optimization Remark Emitter -; GCN-O2-NEXT: AMDGPU Assembly Printer -; GCN-O2-NEXT: Free MachineFunction +; GCN-O2-NEXT: Function register usage analysis +; GCN-O2-NEXT: AMDGPU Assembly Printer +; GCN-O2-NEXT: Free MachineFunction ; GCN-O3:Target Library Information ; GCN-O3-NEXT:Target Pass Configuration @@ -1315,6 +1320,11 @@ ; GCN-O3-NEXT: SI Pre-allocate WWM Registers ; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: SI Lower WWM Copies +; GCN-O3-NEXT: Virtual Register Rewriter +; GCN-O3-NEXT: AMDGPU Reserve WWM Registers +; GCN-O3-NEXT: Virtual Register Map +; GCN-O3-NEXT: Live Register Matrix +; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load @@ -1369,12 +1379,9 @@ ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Stack Frame Layout Analysis -; GCN-O3-NEXT: Function register usage analysis -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O3-NEXT: Machine Optimization Remark Emitter -; GCN-O3-NEXT: AMDGPU Assembly Printer -; GCN-O3-NEXT: Free MachineFunction +; GCN-O3-NEXT: Function register usage analysis +; GCN-O3-NEXT: AMDGPU Assembly Printer +; GCN-O3-NEXT: Free MachineFunction define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index db88ddf1807f3..32abe50ff04d8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -8759,11 +8759,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX8-NEXT: s_add_u32 s88, s88, s9 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 -; GFX8-NEXT: ; implicit-def: 
$vgpr44 : SGPR spill to VGPR lane +; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 -; GFX8-NEXT: v_writelane_b32 v44, s0, 0 -; GFX8-NEXT: v_writelane_b32 v44, s1, 1 +; GFX8-NEXT: v_writelane_b32 v62, s0, 0 +; GFX8-NEXT: v_writelane_b32 v62, s1, 1 ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s36, s3, 21 ; GFX8-NEXT: s_lshr_b32 s30, s3, 19 @@ -8789,7 +8789,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX8-NEXT: v_writelane_b32 v44, s0, 2 +; GFX8-NEXT: v_writelane_b32 v62, s0, 2 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 ; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 @@ -8814,7 +8814,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX8-NEXT: v_writelane_b32 v44, s1, 3 +; GFX8-NEXT: v_writelane_b32 v62, s1, 3 ; GFX8-NEXT: s_lshr_b32 s6, s3, 9 ; GFX8-NEXT: s_lshr_b32 s8, s3, 6 ; GFX8-NEXT: s_lshr_b32 s10, s3, 7 @@ -8830,7 +8830,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v4, s74 ; GFX8-NEXT: v_mov_b32_e32 v8, s72 ; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v55, s68 +; GFX8-NEXT: v_mov_b32_e32 v54, s68 ; GFX8-NEXT: v_mov_b32_e32 v20, s66 ; GFX8-NEXT: v_mov_b32_e32 v16, s64 ; GFX8-NEXT: v_mov_b32_e32 v24, s62 @@ -8851,7 +8851,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: s_lshr_b32 s70, s2, 21 ; GFX8-NEXT: s_lshr_b32 s68, s2, 18 -; GFX8-NEXT: v_mov_b32_e32 v57, s42 +; GFX8-NEXT: 
v_mov_b32_e32 v56, s42 ; GFX8-NEXT: s_lshr_b32 s66, s2, 19 ; GFX8-NEXT: s_lshr_b32 s64, s2, 16 ; GFX8-NEXT: v_mov_b32_e32 v22, s40 @@ -8876,16 +8876,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s36, s2, 2 ; GFX8-NEXT: s_lshr_b32 s30, s2, 3 ; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 -; GFX8-NEXT: v_readlane_b32 s2, v44, 0 -; GFX8-NEXT: v_readlane_b32 s3, v44, 1 +; GFX8-NEXT: v_readlane_b32 s2, v62, 0 +; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v7, s51 ; GFX8-NEXT: v_mov_b32_e32 v9, s73 ; GFX8-NEXT: v_mov_b32_e32 v11, s49 ; GFX8-NEXT: v_mov_b32_e32 v1, s71 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: v_mov_b32_e32 v56, s69 -; GFX8-NEXT: v_mov_b32_e32 v58, s43 +; GFX8-NEXT: v_mov_b32_e32 v55, s69 +; GFX8-NEXT: v_mov_b32_e32 v57, s43 ; GFX8-NEXT: v_mov_b32_e32 v21, s67 ; GFX8-NEXT: v_mov_b32_e32 v23, s41 ; GFX8-NEXT: v_mov_b32_e32 v17, s65 @@ -8942,24 +8942,24 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v42, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v46, s3 -; GFX8-NEXT: v_mov_b32_e32 v45, s2 +; GFX8-NEXT: v_mov_b32_e32 v45, s3 +; GFX8-NEXT: v_mov_b32_e32 v44, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v48, s3 -; GFX8-NEXT: v_mov_b32_e32 v47, s2 +; GFX8-NEXT: v_mov_b32_e32 v47, s3 +; GFX8-NEXT: v_mov_b32_e32 v46, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v50, s3 -; GFX8-NEXT: v_mov_b32_e32 v49, s2 +; GFX8-NEXT: v_mov_b32_e32 v49, s3 +; GFX8-NEXT: v_mov_b32_e32 v48, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v52, s3 -; GFX8-NEXT: v_mov_b32_e32 v51, s2 +; GFX8-NEXT: v_mov_b32_e32 v51, s3 +; GFX8-NEXT: v_mov_b32_e32 v50, s2 ; GFX8-NEXT: 
s_add_u32 s2, s4, 0x1a0 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v54, s3 -; GFX8-NEXT: v_mov_b32_e32 v53, s2 +; GFX8-NEXT: v_mov_b32_e32 v53, s3 +; GFX8-NEXT: v_mov_b32_e32 v52, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x190 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 @@ -8971,26 +8971,26 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[47:48], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[49:50], v[55:58] -; GFX8-NEXT: flat_store_dwordx4 v[51:52], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[53:54], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[54:57] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[16:19] ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27] ; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload ; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GFX8-NEXT: s_add_u32 s2, s4, 0x170 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v60, s3 -; GFX8-NEXT: v_mov_b32_e32 v59, s2 +; GFX8-NEXT: v_mov_b32_e32 v59, s3 +; GFX8-NEXT: v_mov_b32_e32 v58, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x160 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v62, s3 -; GFX8-NEXT: v_mov_b32_e32 v61, s2 +; GFX8-NEXT: v_mov_b32_e32 v61, s3 +; GFX8-NEXT: v_mov_b32_e32 v60, s2 ; GFX8-NEXT: s_add_u32 s2, s4, 0x150 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v46, s3 -; GFX8-NEXT: v_mov_b32_e32 v45, s2 +; GFX8-NEXT: v_mov_b32_e32 v45, s3 +; GFX8-NEXT: v_mov_b32_e32 v44, s2 ; 
GFX8-NEXT: s_add_u32 s2, s4, 0x140 ; GFX8-NEXT: s_addc_u32 s3, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s0 @@ -9021,9 +9021,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v11, s15 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[28:31] -; GFX8-NEXT: flat_store_dwordx4 v[59:60], v[32:35] -; GFX8-NEXT: flat_store_dwordx4 v[61:62], v[36:39] -; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[40:43] +; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35] +; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39] +; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[40:43] ; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] @@ -9177,9 +9177,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s30 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v44, 2 +; GFX8-NEXT: v_readlane_b32 s0, v62, 2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v44, 3 +; GFX8-NEXT: v_readlane_b32 s1, v62, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 @@ -9187,7 +9187,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: ; kill: killed $vgpr44 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v64i1_to_v64i64: diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll index 34dcdaf29677e..b508ffff8050a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -9,6 +9,19 @@ @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] 
undef, align 1 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 +; GCN-LABEL: {{^}}f0: +; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3 +; GCN: ds_write_b8 [[NULL]], [[TREE]] +define void @f0() { +; OPT-LABEL: @f0() { +; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 +; OPT-NEXT: ret void +; + store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1 + ret void +} + ; GCN-LABEL: {{^}}k0: ; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 @@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() { call void @f0() ret void } - -; GCN-LABEL: {{^}}f0: -; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3 -; GCN: ds_write_b8 [[NULL]], [[TREE]] -define void @f0() { -; OPT-LABEL: @f0() { -; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 -; OPT-NEXT: ret void -; - store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 9829b7e787d47..e9cd94620a6b9 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1520,9 +1520,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX7-NEXT: s_add_i32 s6, s32, 0x202000 ; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX7-NEXT: s_add_i32 s6, s32, 0x201200 +; GFX7-NEXT: s_add_i32 s6, s32, 0x202100 ; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v23, s28, 28 @@ -1562,36 +1562,57 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: ; implicit-def: $vgpr22 -; GFX7-NEXT: v_writelane_b32 v23, s59, 27 +; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040 +; GFX7-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; GFX7-NEXT: v_lshr_b32_e64 
v0, s32, 6 ; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX7-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044 -; GFX7-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[28:29] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: v_lshr_b32_e64 v22, s32, 6 ; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, vcc_lo, v22 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0x200, v22 -; GFX7-NEXT: v_readfirstlane_b32 s59, v22 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0 +; GFX7-NEXT: v_writelane_b32 v23, s59, 27 +; GFX7-NEXT: v_readfirstlane_b32 s59, v0 ; GFX7-NEXT: s_and_b64 vcc, 0, exec -; GFX7-NEXT: s_mov_b64 s[28:29], exec -; GFX7-NEXT: s_mov_b64 exec, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044 -; GFX7-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[28:29] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_readlane_b32 vcc_lo, v22, 0 ; GFX7-NEXT: v_readlane_b32 vcc_hi, v22, 1 -; GFX7-NEXT: s_mov_b64 s[28:29], exec -; GFX7-NEXT: s_mov_b64 exec, -1 -; GFX7-NEXT: s_mov_b64 exec, s[28:29] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040 +; GFX7-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen 
offset:16 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc @@ -1624,13 +1645,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_readlane_b32 s33, v23, 2 ; GFX7-NEXT: v_readlane_b32 s31, v23, 1 ; GFX7-NEXT: v_readlane_b32 s30, v23, 0 -; GFX7-NEXT: ; kill: killed $vgpr22 ; GFX7-NEXT: v_readlane_b32 s28, v23, 28 ; GFX7-NEXT: v_readlane_b32 s29, v23, 29 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX7-NEXT: s_add_i32 s6, s32, 0x202000 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX7-NEXT: s_add_i32 s6, s32, 0x201200 +; GFX7-NEXT: s_add_i32 s6, s32, 0x202100 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, 
s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1640,9 +1660,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX8-NEXT: s_add_i32 s6, s32, 0x202000 ; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_add_i32 s6, s32, 0x201200 +; GFX8-NEXT: s_add_i32 s6, s32, 0x202100 ; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v23, s58, 28 @@ -1682,36 +1702,60 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: ; implicit-def: $vgpr22 -; GFX8-NEXT: v_writelane_b32 v23, s59, 27 +; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040 +; GFX8-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded 
Spill +; GFX8-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX8-NEXT: s_or_saveexec_b64 s[58:59], -1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044 -; GFX8-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[58:59] -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX8-NEXT: v_lshrrev_b32_e64 v22, 6, s32 ; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, vcc_lo, v22 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0x200, v22 -; GFX8-NEXT: v_readfirstlane_b32 s59, v22 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0 +; GFX8-NEXT: v_writelane_b32 v23, s59, 27 +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: s_and_b64 vcc, 0, exec -; GFX8-NEXT: s_mov_b64 s[58:59], exec -; GFX8-NEXT: s_mov_b64 exec, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044 -; GFX8-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[58:59] -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_readlane_b32 
vcc_lo, v22, 0 ; GFX8-NEXT: v_readlane_b32 vcc_hi, v22, 1 -; GFX8-NEXT: s_mov_b64 s[58:59], exec -; GFX8-NEXT: s_mov_b64 exec, -1 -; GFX8-NEXT: s_mov_b64 exec, s[58:59] +; GFX8-NEXT: v_readlane_b32 s58, v23, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040 +; GFX8-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], 
s[56:57], v[0:15], v[16:21], vcc, s59, scc @@ -1744,13 +1788,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: v_readlane_b32 s33, v23, 2 ; GFX8-NEXT: v_readlane_b32 s31, v23, 1 ; GFX8-NEXT: v_readlane_b32 s30, v23, 0 -; GFX8-NEXT: ; kill: killed $vgpr22 -; GFX8-NEXT: v_readlane_b32 s58, v23, 28 ; GFX8-NEXT: v_readlane_b32 s59, v23, 29 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX8-NEXT: s_add_i32 s6, s32, 0x202000 ; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_add_i32 s6, s32, 0x201200 +; GFX8-NEXT: s_add_i32 s6, s32, 0x202100 ; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 506f40516c9e6..25a6c80b91794 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -1734,3 +1734,1268 @@ define double @v_maximumnum_f64_fneg(double %x, double %y) { %result = call double @llvm.maximumnum.f64(double %fneg.x, double %fneg.y) ret double %result } + +define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) { +; GFX8-LABEL: v_maximumnum_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %x, <2 x half> %y) + ret <2 x half> %result +} + +define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { +; GFX8-LABEL: v_maximumnum_v2f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x half> @llvm.maximumnum.v2f16(<2 x half> %x, <2 x half> %y) + ret <2 x half> %result +} + +define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { +; GFX8-LABEL: v_maximumnum_v3f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> %x, <3 x half> %y) + ret <3 x half> %result +} + +define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { +; GFX8-LABEL: v_maximumnum_v3f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f16_nnan: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x half> @llvm.maximumnum.v3f16(<3 x half> %x, <3 x half> %y) + ret <3 x half> %result +} + +define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { +; GFX8-LABEL: v_maximumnum_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y) + ret <4 x half> %result +} + +define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { +; GFX8-LABEL: v_maximumnum_v4f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v4f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y) + ret <4 x half> %result +} + 
+define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { +; GFX8-LABEL: v_maximumnum_v6f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX9-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: 
v_pk_max_f16 v3, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v6f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v6f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <6 x half> 
@llvm.maximumnum.v6f16(<6 x half> %x, <6 x half> %y) + ret <6 x half> %result +} + +define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { +; GFX8-LABEL: v_maximumnum_v8f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v8f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v8f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v5 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %result +} + +define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) { +; GFX8-LABEL: v_maximumnum_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v2 :: v_dual_max_num_f32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %x, <2 x float> %y) + ret <2 x float> %result +} + +define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) { +; GFX8-LABEL: v_maximumnum_v2f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v2 :: v_dual_max_num_f32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x float> @llvm.maximumnum.v2f32(<2 x float> %x, <2 x float> %y) + ret <2 x float> %result +} + +define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { +; GFX8-LABEL: v_maximumnum_v3f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v3 :: v_dual_max_num_f32 v1, v1, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> %x, <3 x float> %y) + ret <3 x float> %result +} + +define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) { +; GFX8-LABEL: v_maximumnum_v3f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, 
v3 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v3 :: v_dual_max_num_f32 v1, v1, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x float> @llvm.maximumnum.v3f32(<3 x float> %x, <3 x float> %y) + ret <3 x float> %result +} + +define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { +; GFX8-LABEL: v_maximumnum_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: 
v_mul_f32_e32 v4, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, 
v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v4 :: v_dual_max_num_f32 v1, v1, v5 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v6 :: v_dual_max_num_f32 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %result +} + +define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) { +; GFX8-LABEL: v_maximumnum_v4f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_maximumnum_v4f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v4 :: v_dual_max_num_f32 v1, v1, v5 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v6 :: v_dual_max_num_f32 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %result +} + +define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) { +; GFX8-LABEL: v_maximumnum_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[6:7], 
v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %result +} + +define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) { +; GFX8-LABEL: v_maximumnum_v2f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v2f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], 
v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v2f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v2f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v2f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x double> @llvm.maximumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %result +} + +define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) { +; GFX8-LABEL: v_maximumnum_v3f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], 
v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[8:9], v[8:9] +; 
GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[10:11], v[10:11], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[10:11] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> %x, <3 x double> %y) + ret <3 x double> %result +} + +define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) { +; GFX8-LABEL: v_maximumnum_v3f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v3f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v3f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v3f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v3f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[10:11] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x double> @llvm.maximumnum.v3f64(<3 x double> %x, <3 x double> %y) + ret <3 x double> %result +} + +define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) { +; GFX8-LABEL: v_maximumnum_v4f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f64: +; 
GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[10:11], v[10:11], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 
v[12:13], v[12:13], v[12:13] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[14:15], v[14:15], v[14:15] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[12:13] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[14:15] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %x, <4 x double> %y) + ret <4 x double> %result +} + +define <4 x double> @v_maximumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) { +; GFX8-LABEL: v_maximumnum_v4f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_v4f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_v4f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_v4f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 
v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_v4f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[12:13] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[14:15] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x double> @llvm.maximumnum.v4f64(<4 x double> %x, <4 x double> %y) + ret <4 x double> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index a2ba770067d16..466505c0bcbea 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -1688,3 +1688,1268 @@ define double @v_minimumnum_f64_fneg(double %x, double %y) { %result = call double @llvm.minimumnum.f64(double %fneg.x, double %fneg.y) ret double %result } + +define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) { +; GFX8-LABEL: v_minimumnum_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: 
v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %x, <2 x half> %y) + ret <2 x half> %result +} + +define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { +; GFX8-LABEL: v_minimumnum_v2f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX10-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x half> @llvm.minimumnum.v2f16(<2 x half> %x, <2 x half> %y) + ret <2 x half> %result +} + +define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { +; GFX8-LABEL: v_minimumnum_v3f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: 
v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> %x, <3 x half> %y) + ret <3 x half> %result +} + +define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { +; GFX8-LABEL: v_minimumnum_v3f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 
v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x half> @llvm.minimumnum.v3f16(<3 x half> %x, <3 x half> %y) + ret <3 x half> %result +} + +define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { +; GFX8-LABEL: v_minimumnum_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: 
v_min_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, 
v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %x, <4 x half> %y) + ret <4 x half> %result +} + +define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { +; GFX8-LABEL: v_minimumnum_v4f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x half> @llvm.minimumnum.v4f16(<4 x half> %x, <4 x half> %y) + ret <4 x half> %result +} + +define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { +; GFX8-LABEL: 
v_minimumnum_v6f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX9-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: 
v_pk_min_f16 v2, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v6f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v6f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v4 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <6 x half> @llvm.minimumnum.v6f16(<6 x half> %x, <6 x half> %y) + ret <6 x half> %result +} + 
+define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { +; GFX8-LABEL: v_minimumnum_v8f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v7 +; GFX8-NEXT: 
v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX9-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v8f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v8f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v5 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %result +} + +define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) { +; GFX8-LABEL: v_minimumnum_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; 
GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v2 :: v_dual_min_num_f32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %x, <2 x float> %y) + ret <2 x float> %result +} + +define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) { +; GFX8-LABEL: v_minimumnum_v2f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v2 :: v_dual_min_num_f32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x float> @llvm.minimumnum.v2f32(<2 x float> %x, <2 x float> %y) + ret <2 x float> %result +} + +define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { +; GFX8-LABEL: v_minimumnum_v3f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; 
GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v3 :: v_dual_min_num_f32 v1, v1, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> %x, <3 x float> %y) + ret <3 x float> %result +} + +define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) { +; GFX8-LABEL: v_minimumnum_v3f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, 
v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v3 :: v_dual_min_num_f32 v1, v1, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x float> @llvm.minimumnum.v3f32(<3 x float> %x, <3 x float> %y) + ret <3 x float> %result +} + +define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { +; GFX8-LABEL: v_minimumnum_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: 
v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX9-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f32: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7 +; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v4 :: v_dual_min_num_f32 v1, v1, v5 +; GFX12-NEXT: v_dual_min_num_f32 v2, v2, v6 :: v_dual_min_num_f32 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %result +} + +define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) { +; GFX8-LABEL: v_minimumnum_v4f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_min_num_f32 v0, v0, v4 :: v_dual_min_num_f32 v1, v1, v5 +; GFX12-NEXT: v_dual_min_num_f32 v2, v2, v6 :: v_dual_min_num_f32 v3, v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x float> @llvm.minimumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %result +} + +define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) { +; GFX8-LABEL: v_minimumnum_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: 
v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %result +} + +define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) { +; GFX8-LABEL: v_minimumnum_v2f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v2f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v2f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v2f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v2f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %result +} + +define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) { +; GFX8-LABEL: v_minimumnum_v3f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: 
v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: 
v_max_num_f64_e32 v[10:11], v[10:11], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[10:11] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> %x, <3 x double> %y) + ret <3 x double> %result +} + +define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) { +; GFX8-LABEL: v_minimumnum_v3f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v3f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v3f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v3f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v3f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[10:11] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <3 x double> @llvm.minimumnum.v3f64(<3 x double> %x, <3 x double> %y) + ret <3 x double> %result +} + +define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) { +; GFX8-LABEL: v_minimumnum_v4f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX10-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[10:11], v[10:11], v[10:11] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[12:13], v[12:13], v[12:13] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], 
v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[14:15], v[14:15], v[14:15] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[10:11] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[12:13] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[14:15] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %x, <4 x double> %y) + ret <4 x double> %result +} + +define <4 x double> @v_minimumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) { +; GFX8-LABEL: v_minimumnum_v4f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_v4f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_v4f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_v4f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] 
+; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_v4f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[10:11] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[12:13] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[14:15] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan <4 x double> @llvm.minimumnum.v4f64(<4 x double> %x, <4 x double> %y) + ret <4 x double> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 27b71dd471a83..aa16937d7d897 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel } ; CHECK: ; LDSByteSize: 4 bytes +define void @nonkernel() { +; GFX9-LABEL: nonkernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX9-NEXT: ds_write_b64 v0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: nonkernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX10-NEXT: ds_write_b64 v0, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; G_GFX9-LABEL: nonkernel: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; G_GFX9-NEXT: v_mov_b32_e32 v2, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v3, 8 +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 0 +; G_GFX9-NEXT: ds_write_b32 
v3, v2 +; G_GFX9-NEXT: ds_write_b64 v2, v[0:1] +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_setpc_b64 s[30:31] +; +; G_GFX10-LABEL: nonkernel: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; G_GFX10-NEXT: v_mov_b32_e32 v2, 0 +; G_GFX10-NEXT: v_mov_b32_e32 v3, 8 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 +; G_GFX10-NEXT: ds_write_b32 v3, v2 +; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_setpc_b64 s[30:31] + store i32 0, ptr addrspace(3) @used_by_both + store double 0.0, ptr addrspace(3) @used_by_function + ret void +} + ; Needs to allocate both variables, store to used_by_both is at sizeof(double) define amdgpu_kernel void @withcall() { ; GFX9-LABEL: withcall: @@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() { } ; CHECK: ; LDSByteSize: 4 bytes - -define void @nonkernel() { -; GFX9-LABEL: nonkernel: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX9-NEXT: ds_write_b64 v0, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: nonkernel: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX10-NEXT: ds_write_b64 v0, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; G_GFX9-LABEL: nonkernel: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; G_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v3, 8 -; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX9-NEXT: ds_write_b32 v3, v2 -; G_GFX9-NEXT: ds_write_b64 v2, v[0:1] -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_setpc_b64 s[30:31] -; -; G_GFX10-LABEL: 
nonkernel: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v3, 8 -; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX10-NEXT: ds_write_b32 v3, v2 -; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_setpc_b64 s[30:31] - store i32 0, ptr addrspace(3) @used_by_both - store double 0.0, ptr addrspace(3) @used_by_function - ret void -} - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index c302233e748fd..76a31a7fac8c1 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -141,112 +141,103 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v4, v3 +; W64-O0-NEXT: v_mov_b32_e32 v5, v2 +; W64-O0-NEXT: 
v_mov_b32_e32 v6, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: s_waitcnt vmcnt(4) -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v7, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v7, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v7, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: 
s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; 
W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v7, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v7, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v7, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v7, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v7, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v7, s5, 8 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v7, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v7, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v7, 3 +; 
W64-O0-NEXT: v_readlane_b32 s9, v7, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v7, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v7, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v7, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v7, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v7, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -498,34 +489,32 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 
4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v7, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v9, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v8, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: 
$sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v14, v5 -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_mov_b32_e32 v16, v4 +; W64-O0-NEXT: v_mov_b32_e32 v15, v4 +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v16, v3 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -535,195 +524,192 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; 
W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: s_waitcnt vmcnt(10) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: s_waitcnt vmcnt(9) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: s_waitcnt vmcnt(11) -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: s_waitcnt vmcnt(10) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: s_waitcnt vmcnt(12) -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v17, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: 
v_writelane_b32 v17, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: 
v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v17, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v17, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v17, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v17, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 8 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; 
W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v17, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v17, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v17, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v17, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; 
W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v17, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v17, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v17, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v17, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 16 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v17, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v17, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v17, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v17, 14 +; W64-O0-NEXT: 
v_readlane_b32 s6, v17, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) 
-; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1031,262 +1017,253 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v8, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v10, v3 -; W64-O0-NEXT: 
v_mov_b32_e32 v11, v2 -; W64-O0-NEXT: v_mov_b32_e32 v13, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v9, v3 +; W64-O0-NEXT: v_mov_b32_e32 v10, v2 +; W64-O0-NEXT: v_mov_b32_e32 v11, v1 +; W64-O0-NEXT: v_mov_b32_e32 v5, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v9 -; W64-O0-NEXT: v_mov_b32_e32 v3, v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v8 +; W64-O0-NEXT: v_mov_b32_e32 v2, v6 +; W64-O0-NEXT: v_mov_b32_e32 v3, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v7, v13 -; W64-O0-NEXT: v_mov_b32_e32 v8, v11 -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v11 +; W64-O0-NEXT: v_mov_b32_e32 v7, v10 +; W64-O0-NEXT: v_mov_b32_e32 v8, v9 +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; 
W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v12 -; W64-O0-NEXT: s_waitcnt vmcnt(7) -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v12 +; W64-O0-NEXT: s_waitcnt vmcnt(6) +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(7) -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(6) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; 
W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: s_waitcnt vmcnt(10) -; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v13, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v13, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 3 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v13, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 8 +; W64-O0-NEXT: 
v_writelane_b32 v13, s5, 9 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 1 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 11 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 
s4, v0, 0 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v13, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 14 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: 
v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v13, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 20 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 
Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 12 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 14 ; W64-O0-NEXT: s_mov_b64 exec, 
s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll 
b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index dd6fd5aa384f6..59ceecbf43b78 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -140,127 +140,115 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v2 +; W64-O0-NEXT: v_mov_b32_e32 v2, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v5, v7 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v6 +; W64-O0-NEXT: 
; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v3 -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v2 +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: 
s_mov_b32 s4, 0 -; W64-O0-NEXT: s_waitcnt vmcnt(4) -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v7, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v7, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v7, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; 
W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v7, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v7, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v7, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v7, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v7, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v7, s5, 8 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: 
s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v7, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v7, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v7, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v7, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v7, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v7, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v7, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v7, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v7, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload -; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -512,45 +500,42 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v14, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v5, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v4, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 -; W64-O0-NEXT: v_mov_b32_e32 v6, v15 -; W64-O0-NEXT: v_mov_b32_e32 v7, v14 +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v15, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v15 +; W64-O0-NEXT: v_mov_b32_e32 v6, v14 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v9, v14 +; W64-O0-NEXT: v_mov_b32_e32 v14, v8 +; W64-O0-NEXT: v_mov_b32_e32 v8, v14 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 -; W64-O0-NEXT: v_mov_b32_e32 v16, v6 +; W64-O0-NEXT: v_mov_b32_e32 v14, v8 +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: v_mov_b32_e32 v16, v5 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -558,45 +543,45 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: v_mov_b32_e32 v7, v9 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v8 +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Spill -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: s_waitcnt vmcnt(10) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: s_waitcnt vmcnt(9) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: s_waitcnt vmcnt(11) -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: s_waitcnt vmcnt(10) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: 
$sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -604,165 +589,162 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: s_waitcnt vmcnt(12) -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v17, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt 
vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v17, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v17, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v17, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v17, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 8 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: 
Header=BB1_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v17, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v17, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v17, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v17, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 2 ; W64-O0-NEXT: s_mov_b64 exec, 
s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 
; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v17, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v17, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v17, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v17, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v17, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v17, s5, 16 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: 
v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v17, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v17, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v17, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v17, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v17, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 +; W64-O0-NEXT: 
v_readlane_b32 s5, v17, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1070,48 +1052,42 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v6, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v13, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v10, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v9, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v8, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, 
v14 -; W64-O0-NEXT: v_mov_b32_e32 v6, v13 +; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v10, v3 +; W64-O0-NEXT: v_mov_b32_e32 v3, v10 +; W64-O0-NEXT: v_mov_b32_e32 v5, v9 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 -; W64-O0-NEXT: v_mov_b32_e32 v13, v9 +; W64-O0-NEXT: v_mov_b32_e32 v9, v6 +; W64-O0-NEXT: v_mov_b32_e32 v6, v9 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v13 -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v11, v4 +; W64-O0-NEXT: v_mov_b32_e32 v9, v6 +; W64-O0-NEXT: v_mov_b32_e32 v10, v5 +; W64-O0-NEXT: v_mov_b32_e32 v11, v3 ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -1119,251 +1095,246 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: s_waitcnt vmcnt(6) -; W64-O0-NEXT: 
v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: s_waitcnt vmcnt(5) +; W64-O0-NEXT: v_mov_b32_e32 v3, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v12 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; W64-O0-NEXT: s_waitcnt vmcnt(9) -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(8) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: s_waitcnt vmcnt(10) -; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr13 : 
SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v13, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v13, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 3 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt 
vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v13, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 8 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 9 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 1 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 ; W64-O0-NEXT: s_mov_b64 
exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 11 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 -; W64-O0-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_mov_b32_e32 v7, v5 -; W64-O0-NEXT: v_mov_b32_e32 v1, v4 -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 0 +; W64-O0-NEXT: v_mov_b32_e32 v6, v4 +; W64-O0-NEXT: v_mov_b32_e32 v0, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: v_mov_b32_e32 v5, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; 
W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v13, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 14 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: 
s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 ; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_writelane_b32 v13, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 20 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: 
s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 12 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 14 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; W64-O0-NEXT: 
s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 6d18f354e6542..a2baa56ea0c98 100644 --- 
a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s ; CHECK-LABEL: {{^}}_amdgpu_cs_main: -; CHECK: ; NumSgprs: 4 +; CHECK: ; TotalNumSgprs: 4 ; CHECK: ; NumVgprs: 2 ; CHECK: .amdgpu_pal_metadata ; CHECK-NEXT: --- diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 72aafcaca3ff8..37d0309caac0a 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,21 +11,17 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %26 - ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23 - ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7 + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile 
store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile 
store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy @@ -60,18 +56,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %25 - ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23 - ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7 + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 5b0354e63c236..078b133a93d6f 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -17,13 +17,11 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b32 s95, 0xe8f000 ; GCN-NEXT: s_add_u32 s92, s92, s9 ; GCN-NEXT: s_addc_u32 s93, s93, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def 
s[4:11] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v2, s4, 0 ; GCN-NEXT: v_writelane_b32 v2, s5, 1 ; GCN-NEXT: v_writelane_b32 v2, s6, 2 @@ -115,107 +113,109 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 8 -; GCN-NEXT: v_writelane_b32 v1, s5, 9 -; GCN-NEXT: v_writelane_b32 v1, s6, 10 -; GCN-NEXT: v_writelane_b32 v1, s7, 11 -; GCN-NEXT: v_writelane_b32 v1, s8, 12 -; GCN-NEXT: v_writelane_b32 v1, s9, 13 -; GCN-NEXT: v_writelane_b32 v1, s10, 14 -; GCN-NEXT: v_writelane_b32 v1, s11, 15 +; GCN-NEXT: v_writelane_b32 v2, s4, 8 +; GCN-NEXT: v_writelane_b32 v2, s5, 9 +; GCN-NEXT: v_writelane_b32 v2, s6, 10 +; GCN-NEXT: v_writelane_b32 v2, s7, 11 +; GCN-NEXT: v_writelane_b32 v2, s8, 12 +; GCN-NEXT: v_writelane_b32 v2, s9, 13 +; GCN-NEXT: v_writelane_b32 v2, s10, 14 +; GCN-NEXT: v_writelane_b32 v2, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: 
v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v2, s4, 16 +; GCN-NEXT: v_writelane_b32 v2, s5, 17 +; GCN-NEXT: v_writelane_b32 v2, s6, 18 +; GCN-NEXT: v_writelane_b32 v2, s7, 19 +; GCN-NEXT: v_writelane_b32 v2, s8, 20 +; GCN-NEXT: v_writelane_b32 v2, s9, 21 +; GCN-NEXT: v_writelane_b32 v2, s10, 22 +; GCN-NEXT: v_writelane_b32 v2, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 24 -; GCN-NEXT: v_writelane_b32 v1, s5, 25 -; GCN-NEXT: v_writelane_b32 v1, s6, 26 -; GCN-NEXT: v_writelane_b32 v1, s7, 27 -; GCN-NEXT: v_writelane_b32 v1, s8, 28 -; GCN-NEXT: v_writelane_b32 v1, s9, 29 -; GCN-NEXT: v_writelane_b32 v1, s10, 30 -; GCN-NEXT: v_writelane_b32 v1, s11, 31 +; GCN-NEXT: v_writelane_b32 v2, s4, 24 +; GCN-NEXT: v_writelane_b32 v2, s5, 25 +; GCN-NEXT: v_writelane_b32 v2, s6, 26 +; GCN-NEXT: v_writelane_b32 v2, s7, 27 +; GCN-NEXT: v_writelane_b32 v2, s8, 28 +; GCN-NEXT: v_writelane_b32 v2, s9, 29 +; GCN-NEXT: v_writelane_b32 v2, s10, 30 +; GCN-NEXT: v_writelane_b32 v2, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v2, s4, 32 +; GCN-NEXT: v_writelane_b32 v2, s5, 33 +; GCN-NEXT: v_writelane_b32 v2, s6, 34 +; GCN-NEXT: v_writelane_b32 v2, s7, 35 +; GCN-NEXT: v_writelane_b32 v2, s8, 36 +; GCN-NEXT: v_writelane_b32 v2, s9, 37 +; GCN-NEXT: v_writelane_b32 v2, s10, 38 +; GCN-NEXT: v_writelane_b32 v2, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; 
GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 40 -; GCN-NEXT: v_writelane_b32 v1, s5, 41 -; GCN-NEXT: v_writelane_b32 v1, s6, 42 -; GCN-NEXT: v_writelane_b32 v1, s7, 43 -; GCN-NEXT: v_writelane_b32 v1, s8, 44 -; GCN-NEXT: v_writelane_b32 v1, s9, 45 -; GCN-NEXT: v_writelane_b32 v1, s10, 46 -; GCN-NEXT: v_writelane_b32 v1, s11, 47 +; GCN-NEXT: v_writelane_b32 v2, s4, 40 +; GCN-NEXT: v_writelane_b32 v2, s5, 41 +; GCN-NEXT: v_writelane_b32 v2, s6, 42 +; GCN-NEXT: v_writelane_b32 v2, s7, 43 +; GCN-NEXT: v_writelane_b32 v2, s8, 44 +; GCN-NEXT: v_writelane_b32 v2, s9, 45 +; GCN-NEXT: v_writelane_b32 v2, s10, 46 +; GCN-NEXT: v_writelane_b32 v2, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v2, s4, 48 +; GCN-NEXT: v_writelane_b32 v2, s5, 49 +; GCN-NEXT: v_writelane_b32 v2, s6, 50 +; GCN-NEXT: v_writelane_b32 v2, s7, 51 +; GCN-NEXT: v_writelane_b32 v2, s8, 52 +; GCN-NEXT: v_writelane_b32 v2, s9, 53 +; GCN-NEXT: v_writelane_b32 v2, s10, 54 +; GCN-NEXT: v_writelane_b32 v2, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 56 -; GCN-NEXT: v_writelane_b32 v1, s5, 57 -; GCN-NEXT: v_writelane_b32 v1, s6, 58 -; GCN-NEXT: v_writelane_b32 v1, s7, 59 -; GCN-NEXT: v_writelane_b32 v1, s8, 60 -; GCN-NEXT: v_writelane_b32 v1, s9, 61 -; GCN-NEXT: v_writelane_b32 v1, s10, 62 -; GCN-NEXT: v_writelane_b32 v1, s11, 63 +; GCN-NEXT: v_writelane_b32 v2, s4, 56 +; GCN-NEXT: v_writelane_b32 v2, s5, 57 +; GCN-NEXT: v_writelane_b32 v2, s6, 58 +; GCN-NEXT: v_writelane_b32 v2, s7, 59 +; GCN-NEXT: v_writelane_b32 v2, s8, 60 +; GCN-NEXT: 
v_writelane_b32 v2, s9, 61 +; GCN-NEXT: v_writelane_b32 v2, s10, 62 +; GCN-NEXT: v_writelane_b32 v2, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -223,76 +223,76 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: 
s_waitcnt vmcnt(1) -; GCN-NEXT: v_readlane_b32 s8, v2, 56 -; GCN-NEXT: v_readlane_b32 s9, v2, 57 -; GCN-NEXT: v_readlane_b32 s10, v2, 58 -; GCN-NEXT: v_readlane_b32 s11, v2, 59 -; GCN-NEXT: v_readlane_b32 s12, v2, 60 -; GCN-NEXT: v_readlane_b32 s13, v2, 61 -; GCN-NEXT: v_readlane_b32 s14, v2, 62 -; GCN-NEXT: v_readlane_b32 s15, v2, 63 -; GCN-NEXT: v_readlane_b32 s16, v2, 48 -; GCN-NEXT: v_readlane_b32 s17, v2, 49 -; GCN-NEXT: v_readlane_b32 s18, v2, 50 -; GCN-NEXT: v_readlane_b32 s19, v2, 51 -; GCN-NEXT: v_readlane_b32 s20, v2, 52 -; GCN-NEXT: v_readlane_b32 s21, v2, 53 -; GCN-NEXT: v_readlane_b32 s22, v2, 54 -; GCN-NEXT: v_readlane_b32 s23, v2, 55 -; GCN-NEXT: v_readlane_b32 s24, v2, 40 -; GCN-NEXT: v_readlane_b32 s25, v2, 41 -; GCN-NEXT: v_readlane_b32 s26, v2, 42 -; GCN-NEXT: v_readlane_b32 s27, v2, 43 -; GCN-NEXT: v_readlane_b32 s28, v2, 44 -; GCN-NEXT: v_readlane_b32 s29, v2, 45 -; GCN-NEXT: v_readlane_b32 s30, v2, 46 -; GCN-NEXT: v_readlane_b32 s31, v2, 47 -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: v_readlane_b32 s44, v2, 24 -; GCN-NEXT: v_readlane_b32 s45, v2, 25 -; GCN-NEXT: v_readlane_b32 s46, v2, 26 -; GCN-NEXT: v_readlane_b32 s47, v2, 27 -; GCN-NEXT: v_readlane_b32 s48, v2, 28 -; GCN-NEXT: v_readlane_b32 s49, v2, 29 -; GCN-NEXT: v_readlane_b32 s50, v2, 30 -; GCN-NEXT: v_readlane_b32 s51, v2, 31 -; GCN-NEXT: v_readlane_b32 s52, v2, 16 -; GCN-NEXT: v_readlane_b32 s53, v2, 17 -; GCN-NEXT: v_readlane_b32 s54, v2, 18 -; GCN-NEXT: v_readlane_b32 s55, v2, 19 -; GCN-NEXT: v_readlane_b32 s56, v2, 20 -; GCN-NEXT: v_readlane_b32 s57, v2, 21 -; GCN-NEXT: v_readlane_b32 s58, v2, 22 -; GCN-NEXT: v_readlane_b32 s59, v2, 23 -; GCN-NEXT: v_readlane_b32 s60, v2, 8 -; GCN-NEXT: 
v_readlane_b32 s61, v2, 9 -; GCN-NEXT: v_readlane_b32 s62, v2, 10 -; GCN-NEXT: v_readlane_b32 s63, v2, 11 -; GCN-NEXT: v_readlane_b32 s64, v2, 12 -; GCN-NEXT: v_readlane_b32 s65, v2, 13 -; GCN-NEXT: v_readlane_b32 s66, v2, 14 -; GCN-NEXT: v_readlane_b32 s67, v2, 15 -; GCN-NEXT: v_readlane_b32 s68, v2, 0 -; GCN-NEXT: v_readlane_b32 s69, v2, 1 -; GCN-NEXT: v_readlane_b32 s70, v2, 2 -; GCN-NEXT: v_readlane_b32 s71, v2, 3 -; GCN-NEXT: v_readlane_b32 s72, v2, 4 -; GCN-NEXT: v_readlane_b32 s73, v2, 5 -; GCN-NEXT: v_readlane_b32 s74, v2, 6 -; GCN-NEXT: v_readlane_b32 s75, v2, 7 +; GCN-NEXT: v_readlane_b32 s8, v0, 56 +; GCN-NEXT: v_readlane_b32 s9, v0, 57 +; GCN-NEXT: v_readlane_b32 s10, v0, 58 +; GCN-NEXT: v_readlane_b32 s11, v0, 59 +; GCN-NEXT: v_readlane_b32 s12, v0, 60 +; GCN-NEXT: v_readlane_b32 s13, v0, 61 +; GCN-NEXT: v_readlane_b32 s14, v0, 62 +; GCN-NEXT: v_readlane_b32 s15, v0, 63 +; GCN-NEXT: v_readlane_b32 s16, v0, 48 +; GCN-NEXT: v_readlane_b32 s17, v0, 49 +; GCN-NEXT: v_readlane_b32 s18, v0, 50 +; GCN-NEXT: v_readlane_b32 s19, v0, 51 +; GCN-NEXT: v_readlane_b32 s20, v0, 52 +; GCN-NEXT: v_readlane_b32 s21, v0, 53 +; GCN-NEXT: v_readlane_b32 s22, v0, 54 +; GCN-NEXT: v_readlane_b32 s23, v0, 55 +; GCN-NEXT: v_readlane_b32 s24, v0, 40 +; GCN-NEXT: v_readlane_b32 s25, v0, 41 +; GCN-NEXT: v_readlane_b32 s26, v0, 42 +; GCN-NEXT: v_readlane_b32 s27, v0, 43 +; GCN-NEXT: v_readlane_b32 s28, v0, 44 +; GCN-NEXT: v_readlane_b32 s29, v0, 45 +; GCN-NEXT: v_readlane_b32 s30, v0, 46 +; GCN-NEXT: v_readlane_b32 s31, v0, 47 +; GCN-NEXT: v_readlane_b32 s36, v0, 32 +; GCN-NEXT: v_readlane_b32 s37, v0, 33 +; GCN-NEXT: v_readlane_b32 s38, v0, 34 +; GCN-NEXT: v_readlane_b32 s39, v0, 35 +; GCN-NEXT: v_readlane_b32 s40, v0, 36 +; GCN-NEXT: v_readlane_b32 s41, v0, 37 +; GCN-NEXT: v_readlane_b32 s42, v0, 38 +; GCN-NEXT: v_readlane_b32 s43, v0, 39 +; GCN-NEXT: v_readlane_b32 s44, v0, 24 +; GCN-NEXT: v_readlane_b32 s45, v0, 25 +; GCN-NEXT: v_readlane_b32 s46, v0, 26 +; GCN-NEXT: 
v_readlane_b32 s47, v0, 27 +; GCN-NEXT: v_readlane_b32 s48, v0, 28 +; GCN-NEXT: v_readlane_b32 s49, v0, 29 +; GCN-NEXT: v_readlane_b32 s50, v0, 30 +; GCN-NEXT: v_readlane_b32 s51, v0, 31 +; GCN-NEXT: v_readlane_b32 s52, v0, 16 +; GCN-NEXT: v_readlane_b32 s53, v0, 17 +; GCN-NEXT: v_readlane_b32 s54, v0, 18 +; GCN-NEXT: v_readlane_b32 s55, v0, 19 +; GCN-NEXT: v_readlane_b32 s56, v0, 20 +; GCN-NEXT: v_readlane_b32 s57, v0, 21 +; GCN-NEXT: v_readlane_b32 s58, v0, 22 +; GCN-NEXT: v_readlane_b32 s59, v0, 23 +; GCN-NEXT: v_readlane_b32 s60, v0, 8 +; GCN-NEXT: v_readlane_b32 s61, v0, 9 +; GCN-NEXT: v_readlane_b32 s62, v0, 10 +; GCN-NEXT: v_readlane_b32 s63, v0, 11 +; GCN-NEXT: v_readlane_b32 s64, v0, 12 +; GCN-NEXT: v_readlane_b32 s65, v0, 13 +; GCN-NEXT: v_readlane_b32 s66, v0, 14 +; GCN-NEXT: v_readlane_b32 s67, v0, 15 +; GCN-NEXT: v_readlane_b32 s68, v0, 0 +; GCN-NEXT: v_readlane_b32 s69, v0, 1 +; GCN-NEXT: v_readlane_b32 s70, v0, 2 +; GCN-NEXT: v_readlane_b32 s71, v0, 3 +; GCN-NEXT: v_readlane_b32 s72, v0, 4 +; GCN-NEXT: v_readlane_b32 s73, v0, 5 +; GCN-NEXT: v_readlane_b32 s74, v0, 6 +; GCN-NEXT: v_readlane_b32 s75, v0, 7 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s76, v1, 56 ; GCN-NEXT: v_readlane_b32 s77, v1, 57 @@ -319,7 +319,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_readlane_b32 s6, v1, 6 ; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] @@ -380,14 +380,14 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; 
GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: v_readlane_b32 s0, v2, 0 +; GCN-NEXT: v_readlane_b32 s1, v2, 1 +; GCN-NEXT: v_readlane_b32 s2, v2, 2 +; GCN-NEXT: v_readlane_b32 s3, v2, 3 +; GCN-NEXT: v_readlane_b32 s4, v2, 4 +; GCN-NEXT: v_readlane_b32 s5, v2, 5 +; GCN-NEXT: v_readlane_b32 s6, v2, 6 +; GCN-NEXT: v_readlane_b32 s7, v2, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -422,18 +422,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -490,12 +478,11 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; 
GCN-NEXT: v_writelane_b32 v1, s4, 0 ; GCN-NEXT: v_writelane_b32 v1, s5, 1 ; GCN-NEXT: v_writelane_b32 v1, s6, 2 @@ -575,21 +562,22 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s2, 8 -; GCN-NEXT: v_writelane_b32 v0, s3, 9 +; GCN-NEXT: v_writelane_b32 v1, s2, 8 +; GCN-NEXT: v_writelane_b32 v1, s3, 9 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -597,93 +585,93 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_readlane_b32 s16, v1, 8 -; GCN-NEXT: v_readlane_b32 s17, v1, 9 -; GCN-NEXT: v_readlane_b32 s20, v1, 0 -; GCN-NEXT: v_readlane_b32 s21, v1, 1 -; GCN-NEXT: v_readlane_b32 s22, v1, 2 -; GCN-NEXT: v_readlane_b32 s23, v1, 3 -; GCN-NEXT: v_readlane_b32 s24, v1, 4 -; GCN-NEXT: v_readlane_b32 s25, v1, 5 -; GCN-NEXT: v_readlane_b32 s26, v1, 6 -; GCN-NEXT: v_readlane_b32 s27, v1, 7 +; GCN-NEXT: v_readlane_b32 s16, v0, 8 +; GCN-NEXT: v_readlane_b32 s17, v0, 9 +; GCN-NEXT: v_readlane_b32 s20, v0, 0 +; GCN-NEXT: v_readlane_b32 s21, v0, 1 +; GCN-NEXT: v_readlane_b32 s22, v0, 2 +; GCN-NEXT: v_readlane_b32 s23, v0, 3 +; GCN-NEXT: v_readlane_b32 s24, v0, 4 +; GCN-NEXT: v_readlane_b32 s25, v0, 5 +; GCN-NEXT: v_readlane_b32 s26, v0, 6 +; GCN-NEXT: v_readlane_b32 s27, v0, 7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v0, 32 -; GCN-NEXT: v_readlane_b32 s37, v0, 33 -; GCN-NEXT: v_readlane_b32 s38, v0, 34 -; GCN-NEXT: v_readlane_b32 s39, v0, 35 -; GCN-NEXT: v_readlane_b32 s40, v0, 36 -; GCN-NEXT: v_readlane_b32 s41, v0, 37 -; GCN-NEXT: v_readlane_b32 s42, v0, 38 -; GCN-NEXT: v_readlane_b32 s43, v0, 39 -; GCN-NEXT: v_readlane_b32 s44, v0, 40 -; GCN-NEXT: v_readlane_b32 s45, v0, 41 -; GCN-NEXT: v_readlane_b32 s46, v0, 42 -; GCN-NEXT: v_readlane_b32 s47, v0, 43 -; GCN-NEXT: v_readlane_b32 s48, v0, 44 -; GCN-NEXT: v_readlane_b32 s49, v0, 45 -; GCN-NEXT: v_readlane_b32 s50, v0, 46 -; GCN-NEXT: v_readlane_b32 s51, v0, 47 -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 -; GCN-NEXT: v_readlane_b32 s8, v0, 8 -; GCN-NEXT: v_readlane_b32 s9, 
v0, 9 -; GCN-NEXT: v_readlane_b32 s10, v0, 10 -; GCN-NEXT: v_readlane_b32 s11, v0, 11 -; GCN-NEXT: v_readlane_b32 s12, v0, 12 -; GCN-NEXT: v_readlane_b32 s13, v0, 13 -; GCN-NEXT: v_readlane_b32 s14, v0, 14 -; GCN-NEXT: v_readlane_b32 s15, v0, 15 +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 40 +; GCN-NEXT: v_readlane_b32 s45, v1, 41 +; GCN-NEXT: v_readlane_b32 s46, v1, 42 +; GCN-NEXT: v_readlane_b32 s47, v1, 43 +; GCN-NEXT: v_readlane_b32 s48, v1, 44 +; GCN-NEXT: v_readlane_b32 s49, v1, 45 +; GCN-NEXT: v_readlane_b32 s50, v1, 46 +; GCN-NEXT: v_readlane_b32 s51, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 +; GCN-NEXT: v_readlane_b32 s8, v1, 8 +; GCN-NEXT: v_readlane_b32 s9, v1, 9 +; GCN-NEXT: v_readlane_b32 s10, v1, 10 +; GCN-NEXT: v_readlane_b32 s11, v1, 11 +; GCN-NEXT: v_readlane_b32 s12, v1, 12 +; GCN-NEXT: v_readlane_b32 s13, v1, 13 +; GCN-NEXT: v_readlane_b32 s14, v1, 14 +; GCN-NEXT: v_readlane_b32 s15, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 16 -; GCN-NEXT: v_readlane_b32 s1, v0, 17 -; GCN-NEXT: v_readlane_b32 s2, v0, 18 -; GCN-NEXT: v_readlane_b32 s3, v0, 19 -; GCN-NEXT: v_readlane_b32 s4, v0, 20 -; GCN-NEXT: v_readlane_b32 s5, v0, 21 -; GCN-NEXT: v_readlane_b32 s6, v0, 22 -; GCN-NEXT: v_readlane_b32 s7, v0, 23 -; GCN-NEXT: v_readlane_b32 s8, v0, 24 -; GCN-NEXT: v_readlane_b32 s9, v0, 25 -; GCN-NEXT: v_readlane_b32 
s10, v0, 26 -; GCN-NEXT: v_readlane_b32 s11, v0, 27 -; GCN-NEXT: v_readlane_b32 s12, v0, 28 -; GCN-NEXT: v_readlane_b32 s13, v0, 29 -; GCN-NEXT: v_readlane_b32 s14, v0, 30 -; GCN-NEXT: v_readlane_b32 s15, v0, 31 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s8, v1, 24 +; GCN-NEXT: v_readlane_b32 s9, v1, 25 +; GCN-NEXT: v_readlane_b32 s10, v1, 26 +; GCN-NEXT: v_readlane_b32 s11, v1, 27 +; GCN-NEXT: v_readlane_b32 s12, v1, 28 +; GCN-NEXT: v_readlane_b32 s13, v1, 29 +; GCN-NEXT: v_readlane_b32 s14, v1, 30 +; GCN-NEXT: v_readlane_b32 s15, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 48 -; GCN-NEXT: v_readlane_b32 s1, v0, 49 -; GCN-NEXT: v_readlane_b32 s2, v0, 50 -; GCN-NEXT: v_readlane_b32 s3, v0, 51 -; GCN-NEXT: v_readlane_b32 s4, v0, 52 -; GCN-NEXT: v_readlane_b32 s5, v0, 53 -; GCN-NEXT: v_readlane_b32 s6, v0, 54 -; GCN-NEXT: v_readlane_b32 s7, v0, 55 -; GCN-NEXT: v_readlane_b32 s8, v0, 56 -; GCN-NEXT: v_readlane_b32 s9, v0, 57 -; GCN-NEXT: v_readlane_b32 s10, v0, 58 -; GCN-NEXT: v_readlane_b32 s11, v0, 59 -; GCN-NEXT: v_readlane_b32 s12, v0, 60 -; GCN-NEXT: v_readlane_b32 s13, v0, 61 -; GCN-NEXT: v_readlane_b32 s14, v0, 62 -; GCN-NEXT: v_readlane_b32 s15, v0, 63 +; GCN-NEXT: v_readlane_b32 s0, v1, 48 +; GCN-NEXT: v_readlane_b32 s1, v1, 49 +; GCN-NEXT: v_readlane_b32 s2, v1, 50 +; GCN-NEXT: v_readlane_b32 s3, v1, 51 +; GCN-NEXT: v_readlane_b32 s4, v1, 52 +; GCN-NEXT: v_readlane_b32 s5, v1, 53 +; GCN-NEXT: v_readlane_b32 s6, v1, 54 +; GCN-NEXT: v_readlane_b32 s7, v1, 55 +; GCN-NEXT: v_readlane_b32 s8, v1, 56 +; GCN-NEXT: v_readlane_b32 s9, v1, 57 +; GCN-NEXT: v_readlane_b32 s10, v1, 58 +; GCN-NEXT: 
v_readlane_b32 s11, v1, 59 +; GCN-NEXT: v_readlane_b32 s12, v1, 60 +; GCN-NEXT: v_readlane_b32 s13, v1, 61 +; GCN-NEXT: v_readlane_b32 s14, v1, 62 +; GCN-NEXT: v_readlane_b32 s15, v1, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -697,14 +685,6 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -741,17 +721,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -765,91 +737,91 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt 
vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v32, s4, 0 +; GCN-NEXT: v_writelane_b32 v32, s5, 1 +; GCN-NEXT: v_writelane_b32 v32, s6, 2 +; GCN-NEXT: v_writelane_b32 v32, s7, 3 +; GCN-NEXT: v_writelane_b32 v32, s8, 4 +; GCN-NEXT: v_writelane_b32 v32, s9, 5 +; GCN-NEXT: v_writelane_b32 v32, s10, 6 +; GCN-NEXT: v_writelane_b32 v32, s11, 7 +; GCN-NEXT: v_writelane_b32 v32, s12, 8 +; GCN-NEXT: v_writelane_b32 v32, s13, 9 +; GCN-NEXT: v_writelane_b32 v32, s14, 10 +; GCN-NEXT: v_writelane_b32 v32, s15, 11 +; GCN-NEXT: v_writelane_b32 v32, s16, 12 +; GCN-NEXT: v_writelane_b32 v32, s17, 13 +; GCN-NEXT: v_writelane_b32 v32, s18, 14 +; GCN-NEXT: v_writelane_b32 v32, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, 
s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v32, s4, 16 +; GCN-NEXT: v_writelane_b32 v32, s5, 17 +; GCN-NEXT: v_writelane_b32 v32, s6, 18 +; GCN-NEXT: v_writelane_b32 v32, s7, 19 +; GCN-NEXT: v_writelane_b32 v32, s8, 20 +; GCN-NEXT: v_writelane_b32 v32, s9, 21 +; GCN-NEXT: v_writelane_b32 v32, s10, 22 +; GCN-NEXT: v_writelane_b32 v32, s11, 23 +; GCN-NEXT: v_writelane_b32 v32, s12, 24 +; GCN-NEXT: v_writelane_b32 v32, s13, 25 +; GCN-NEXT: v_writelane_b32 v32, s14, 26 +; GCN-NEXT: v_writelane_b32 v32, s15, 27 +; GCN-NEXT: v_writelane_b32 v32, s16, 28 +; GCN-NEXT: v_writelane_b32 v32, s17, 29 +; GCN-NEXT: v_writelane_b32 v32, s18, 30 +; GCN-NEXT: v_writelane_b32 v32, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v32, s4, 32 +; GCN-NEXT: v_writelane_b32 v32, s5, 33 +; GCN-NEXT: v_writelane_b32 v32, s6, 34 +; GCN-NEXT: v_writelane_b32 v32, s7, 35 +; GCN-NEXT: v_writelane_b32 v32, s8, 36 +; GCN-NEXT: v_writelane_b32 v32, s9, 37 +; GCN-NEXT: v_writelane_b32 v32, s10, 38 +; GCN-NEXT: v_writelane_b32 v32, s11, 39 +; GCN-NEXT: v_writelane_b32 v32, s12, 40 +; GCN-NEXT: v_writelane_b32 v32, s13, 41 +; GCN-NEXT: v_writelane_b32 v32, s14, 42 +; GCN-NEXT: v_writelane_b32 v32, s15, 43 +; GCN-NEXT: 
v_writelane_b32 v32, s16, 44 +; GCN-NEXT: v_writelane_b32 v32, s17, 45 +; GCN-NEXT: v_writelane_b32 v32, s18, 46 +; GCN-NEXT: v_writelane_b32 v32, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: v_writelane_b32 v32, s4, 48 +; GCN-NEXT: v_writelane_b32 v32, s5, 49 +; GCN-NEXT: v_writelane_b32 v32, s6, 50 +; GCN-NEXT: v_writelane_b32 v32, s7, 51 +; GCN-NEXT: v_writelane_b32 v32, s8, 52 +; GCN-NEXT: v_writelane_b32 v32, s9, 53 +; GCN-NEXT: v_writelane_b32 v32, s10, 54 +; GCN-NEXT: v_writelane_b32 v32, s11, 55 +; GCN-NEXT: v_writelane_b32 v32, s12, 56 +; GCN-NEXT: v_writelane_b32 v32, s13, 57 +; GCN-NEXT: v_writelane_b32 v32, s14, 58 +; GCN-NEXT: v_writelane_b32 v32, s15, 59 +; GCN-NEXT: v_writelane_b32 v32, s16, 60 +; GCN-NEXT: v_writelane_b32 v32, s17, 61 +; GCN-NEXT: v_writelane_b32 v32, s18, 62 +; GCN-NEXT: v_writelane_b32 v32, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: ; 
implicit-def: $vgpr32 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v32, s2, 0 +; GCN-NEXT: v_writelane_b32 v32, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -857,59 +829,59 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 40 -; GCN-NEXT: v_readlane_b32 s45, v1, 41 -; GCN-NEXT: v_readlane_b32 s46, v1, 42 -; GCN-NEXT: v_readlane_b32 s47, v1, 43 -; GCN-NEXT: v_readlane_b32 s48, v1, 44 -; GCN-NEXT: v_readlane_b32 s49, v1, 45 -; GCN-NEXT: v_readlane_b32 s50, v1, 46 -; GCN-NEXT: v_readlane_b32 s51, v1, 47 -; GCN-NEXT: v_readlane_b32 s0, v1, 16 -; GCN-NEXT: v_readlane_b32 s1, v1, 17 -; GCN-NEXT: v_readlane_b32 s2, v1, 18 -; GCN-NEXT: v_readlane_b32 s3, v1, 19 -; GCN-NEXT: v_readlane_b32 s4, v1, 20 -; GCN-NEXT: v_readlane_b32 s5, v1, 21 -; GCN-NEXT: v_readlane_b32 s6, v1, 22 -; GCN-NEXT: v_readlane_b32 s7, v1, 23 -; GCN-NEXT: v_readlane_b32 s8, v1, 24 -; GCN-NEXT: v_readlane_b32 s9, v1, 25 -; GCN-NEXT: v_readlane_b32 s10, v1, 26 -; GCN-NEXT: v_readlane_b32 s11, v1, 27 -; GCN-NEXT: 
v_readlane_b32 s12, v1, 28 -; GCN-NEXT: v_readlane_b32 s13, v1, 29 -; GCN-NEXT: v_readlane_b32 s14, v1, 30 -; GCN-NEXT: v_readlane_b32 s15, v1, 31 -; GCN-NEXT: v_readlane_b32 s16, v1, 0 -; GCN-NEXT: v_readlane_b32 s17, v1, 1 -; GCN-NEXT: v_readlane_b32 s18, v1, 2 -; GCN-NEXT: v_readlane_b32 s19, v1, 3 -; GCN-NEXT: v_readlane_b32 s20, v1, 4 -; GCN-NEXT: v_readlane_b32 s21, v1, 5 -; GCN-NEXT: v_readlane_b32 s22, v1, 6 -; GCN-NEXT: v_readlane_b32 s23, v1, 7 -; GCN-NEXT: v_readlane_b32 s24, v1, 8 -; GCN-NEXT: v_readlane_b32 s25, v1, 9 -; GCN-NEXT: v_readlane_b32 s26, v1, 10 -; GCN-NEXT: v_readlane_b32 s27, v1, 11 -; GCN-NEXT: v_readlane_b32 s28, v1, 12 -; GCN-NEXT: v_readlane_b32 s29, v1, 13 -; GCN-NEXT: v_readlane_b32 s30, v1, 14 -; GCN-NEXT: v_readlane_b32 s31, v1, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; 
GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] @@ -917,25 +889,25 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 48 -; GCN-NEXT: v_readlane_b32 s5, v1, 49 -; GCN-NEXT: v_readlane_b32 s6, v1, 50 -; GCN-NEXT: v_readlane_b32 s7, v1, 51 -; GCN-NEXT: v_readlane_b32 s8, v1, 52 -; GCN-NEXT: v_readlane_b32 s9, v1, 53 -; GCN-NEXT: v_readlane_b32 s10, v1, 54 -; GCN-NEXT: v_readlane_b32 s11, v1, 55 -; GCN-NEXT: v_readlane_b32 s12, v1, 56 -; GCN-NEXT: v_readlane_b32 s13, v1, 57 -; GCN-NEXT: v_readlane_b32 s14, v1, 58 -; GCN-NEXT: v_readlane_b32 s15, v1, 59 -; GCN-NEXT: v_readlane_b32 s16, v1, 60 -; GCN-NEXT: v_readlane_b32 s17, v1, 61 -; GCN-NEXT: v_readlane_b32 s18, v1, 62 -; GCN-NEXT: v_readlane_b32 s19, v1, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: 
v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s0, v32, 0 +; GCN-NEXT: v_readlane_b32 s1, v32, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -946,14 +918,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ; use s[0:1] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 @@ -993,17 +957,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] 
-; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -1017,91 +973,91 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v32, s4, 0 +; GCN-NEXT: v_writelane_b32 v32, s5, 1 +; GCN-NEXT: v_writelane_b32 v32, s6, 2 +; GCN-NEXT: v_writelane_b32 v32, s7, 3 +; GCN-NEXT: v_writelane_b32 v32, s8, 4 +; GCN-NEXT: v_writelane_b32 v32, s9, 5 +; GCN-NEXT: v_writelane_b32 v32, s10, 6 +; GCN-NEXT: v_writelane_b32 v32, s11, 7 +; GCN-NEXT: v_writelane_b32 v32, s12, 8 +; GCN-NEXT: v_writelane_b32 v32, s13, 9 +; GCN-NEXT: v_writelane_b32 v32, s14, 10 +; GCN-NEXT: v_writelane_b32 v32, s15, 11 +; GCN-NEXT: v_writelane_b32 v32, s16, 12 +; GCN-NEXT: v_writelane_b32 v32, s17, 13 +; GCN-NEXT: v_writelane_b32 v32, s18, 14 +; GCN-NEXT: v_writelane_b32 v32, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: 
v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v32, s4, 16 +; GCN-NEXT: v_writelane_b32 v32, s5, 17 +; GCN-NEXT: v_writelane_b32 v32, s6, 18 +; GCN-NEXT: v_writelane_b32 v32, s7, 19 +; GCN-NEXT: v_writelane_b32 v32, s8, 20 +; GCN-NEXT: v_writelane_b32 v32, s9, 21 +; GCN-NEXT: v_writelane_b32 v32, s10, 22 +; GCN-NEXT: v_writelane_b32 v32, s11, 23 +; GCN-NEXT: v_writelane_b32 v32, s12, 24 +; GCN-NEXT: v_writelane_b32 v32, s13, 25 +; GCN-NEXT: v_writelane_b32 v32, s14, 26 +; GCN-NEXT: v_writelane_b32 v32, s15, 27 +; GCN-NEXT: v_writelane_b32 v32, s16, 28 +; GCN-NEXT: v_writelane_b32 v32, s17, 29 +; GCN-NEXT: v_writelane_b32 v32, s18, 30 +; GCN-NEXT: v_writelane_b32 v32, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v32, s4, 32 +; GCN-NEXT: v_writelane_b32 v32, s5, 33 +; GCN-NEXT: 
v_writelane_b32 v32, s6, 34 +; GCN-NEXT: v_writelane_b32 v32, s7, 35 +; GCN-NEXT: v_writelane_b32 v32, s8, 36 +; GCN-NEXT: v_writelane_b32 v32, s9, 37 +; GCN-NEXT: v_writelane_b32 v32, s10, 38 +; GCN-NEXT: v_writelane_b32 v32, s11, 39 +; GCN-NEXT: v_writelane_b32 v32, s12, 40 +; GCN-NEXT: v_writelane_b32 v32, s13, 41 +; GCN-NEXT: v_writelane_b32 v32, s14, 42 +; GCN-NEXT: v_writelane_b32 v32, s15, 43 +; GCN-NEXT: v_writelane_b32 v32, s16, 44 +; GCN-NEXT: v_writelane_b32 v32, s17, 45 +; GCN-NEXT: v_writelane_b32 v32, s18, 46 +; GCN-NEXT: v_writelane_b32 v32, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: v_writelane_b32 v32, s4, 48 +; GCN-NEXT: v_writelane_b32 v32, s5, 49 +; GCN-NEXT: v_writelane_b32 v32, s6, 50 +; GCN-NEXT: v_writelane_b32 v32, s7, 51 +; GCN-NEXT: v_writelane_b32 v32, s8, 52 +; GCN-NEXT: v_writelane_b32 v32, s9, 53 +; GCN-NEXT: v_writelane_b32 v32, s10, 54 +; GCN-NEXT: v_writelane_b32 v32, s11, 55 +; GCN-NEXT: v_writelane_b32 v32, s12, 56 +; GCN-NEXT: v_writelane_b32 v32, s13, 57 +; GCN-NEXT: v_writelane_b32 v32, s14, 58 +; GCN-NEXT: v_writelane_b32 v32, s15, 59 +; GCN-NEXT: v_writelane_b32 v32, s16, 60 +; GCN-NEXT: v_writelane_b32 v32, s17, 61 +; GCN-NEXT: v_writelane_b32 v32, s18, 62 +; GCN-NEXT: v_writelane_b32 v32, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], 
-1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v32, s2, 0 +; GCN-NEXT: v_writelane_b32 v32, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1109,59 +1065,59 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: v_readlane_b32 s44, v2, 40 -; GCN-NEXT: v_readlane_b32 s45, v2, 41 -; GCN-NEXT: v_readlane_b32 s46, v2, 42 -; GCN-NEXT: v_readlane_b32 s47, v2, 43 -; GCN-NEXT: v_readlane_b32 s48, v2, 44 -; GCN-NEXT: v_readlane_b32 s49, v2, 45 -; GCN-NEXT: v_readlane_b32 s50, v2, 46 -; GCN-NEXT: v_readlane_b32 s51, v2, 47 -; GCN-NEXT: v_readlane_b32 s0, v2, 16 -; GCN-NEXT: v_readlane_b32 s1, v2, 17 -; GCN-NEXT: 
v_readlane_b32 s2, v2, 18 -; GCN-NEXT: v_readlane_b32 s3, v2, 19 -; GCN-NEXT: v_readlane_b32 s4, v2, 20 -; GCN-NEXT: v_readlane_b32 s5, v2, 21 -; GCN-NEXT: v_readlane_b32 s6, v2, 22 -; GCN-NEXT: v_readlane_b32 s7, v2, 23 -; GCN-NEXT: v_readlane_b32 s8, v2, 24 -; GCN-NEXT: v_readlane_b32 s9, v2, 25 -; GCN-NEXT: v_readlane_b32 s10, v2, 26 -; GCN-NEXT: v_readlane_b32 s11, v2, 27 -; GCN-NEXT: v_readlane_b32 s12, v2, 28 -; GCN-NEXT: v_readlane_b32 s13, v2, 29 -; GCN-NEXT: v_readlane_b32 s14, v2, 30 -; GCN-NEXT: v_readlane_b32 s15, v2, 31 -; GCN-NEXT: v_readlane_b32 s16, v2, 0 -; GCN-NEXT: v_readlane_b32 s17, v2, 1 -; GCN-NEXT: v_readlane_b32 s18, v2, 2 -; GCN-NEXT: v_readlane_b32 s19, v2, 3 -; GCN-NEXT: v_readlane_b32 s20, v2, 4 -; GCN-NEXT: v_readlane_b32 s21, v2, 5 -; GCN-NEXT: v_readlane_b32 s22, v2, 6 -; GCN-NEXT: v_readlane_b32 s23, v2, 7 -; GCN-NEXT: v_readlane_b32 s24, v2, 8 -; GCN-NEXT: v_readlane_b32 s25, v2, 9 -; GCN-NEXT: v_readlane_b32 s26, v2, 10 -; GCN-NEXT: v_readlane_b32 s27, v2, 11 -; GCN-NEXT: v_readlane_b32 s28, v2, 12 -; GCN-NEXT: v_readlane_b32 s29, v2, 13 -; GCN-NEXT: v_readlane_b32 s30, v2, 14 -; GCN-NEXT: v_readlane_b32 s31, v2, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: 
v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 @@ -1172,25 +1128,25 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v2, 48 -; GCN-NEXT: v_readlane_b32 s5, v2, 49 -; GCN-NEXT: v_readlane_b32 s6, v2, 50 -; GCN-NEXT: v_readlane_b32 s7, v2, 51 -; GCN-NEXT: v_readlane_b32 s8, v2, 52 -; GCN-NEXT: v_readlane_b32 s9, v2, 53 -; GCN-NEXT: v_readlane_b32 s10, v2, 54 -; GCN-NEXT: v_readlane_b32 s11, v2, 55 -; GCN-NEXT: v_readlane_b32 s12, v2, 56 -; GCN-NEXT: v_readlane_b32 s13, v2, 57 -; GCN-NEXT: v_readlane_b32 s14, v2, 58 
-; GCN-NEXT: v_readlane_b32 s15, v2, 59 -; GCN-NEXT: v_readlane_b32 s16, v2, 60 -; GCN-NEXT: v_readlane_b32 s17, v2, 61 -; GCN-NEXT: v_readlane_b32 s18, v2, 62 -; GCN-NEXT: v_readlane_b32 s19, v2, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s0, v32, 0 +; GCN-NEXT: v_readlane_b32 s1, v32, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -1204,14 +1160,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ; use v0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 @@ -1243,7 +1191,7 @@ ret: } attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-waves-per-eu"="8,8" } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="7,7" } !llvm.module.flags = !{!0} !0 = !{i32 1, 
!"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir index 8e2a56b463c40..fa62048fd31ad 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -61,35 +61,27 @@ machineFunctionInfo: isChainFunction: true returnsVoid: true wwmReservedRegs: - - '$vgpr11' + - '$vgpr10' body: | bb.0: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-LABEL: name: preserve_all_lanes_wwm_above_args - ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 - ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0 - ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr8 = COPY killed $vgpr0 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; 
GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 - renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 + $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 $sgpr35 = S_MOV_B32 5 $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0 - renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec - $vgpr8 = COPY renamable killed $vgpr10 + $vgpr10 = V_MOV_B32_e32 10, implicit $exec + $vgpr8 = COPY killed $vgpr10 renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 @@ -139,23 +131,15 @@ body: | liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-LABEL: name: preserve_inactive_lanes_wwm_args - ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr10 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, 
addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr0 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 $sgpr35 = S_MOV_B32 5 $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 @@ -184,7 +168,7 @@ body: | ; GCN-LABEL: name: dont_preserve_if_no_chain_calls ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = 
SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec @@ -218,7 +202,7 @@ body: | ; GCN-LABEL: name: dont_preserve_v0_v7 ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir index 4b8b71a740085..49001a2cfd7a6 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -36,19 +36,11 @@ body: | liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-LABEL: name: preserve_inactive_wwm - ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN: liveins: $sgpr0, $sgpr35 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: 
(load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1 renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 @@ -72,24 +64,16 @@ body: | ; GCN-LABEL: name: preserve_inactive_detected_wwm ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9 + ; GCN-NEXT: $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = 
SI_RESTORE_S32_FROM_VGPR $vgpr9, 0 ; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 $sgpr35 = S_MOV_B32 5 @@ -122,7 +106,7 @@ body: | ; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls ; GCN: liveins: $sgpr35, $vgpr8 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec @@ -151,11 +135,11 @@ body: | liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave - ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN: liveins: $sgpr0, $sgpr35 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = 
S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1 renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 @@ -209,7 +193,7 @@ body: | ; GCN-LABEL: name: dont_preserve_v0_v7 ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index b21285e83dc21..4be102f7860ea 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s # Check that %3 was not rematerialized before the last store since 
its operand %1 # is killed by that store. diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir index 4571e792c7cb5..168d63d3a95b9 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir +++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir @@ -20,16 +20,10 @@ body: | ; GCN-LABEL: name: preserve_scratch_vgpr_inactive_lanes ; GCN: liveins: $sgpr35, $vgpr0, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0 renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 $sgpr35 = S_MOV_B32 5 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index bbeb2e1884a9f..924340ec8a2a6 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -13,333 +13,333 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) 
#0 { ; GFX906-NEXT: s_mov_b32 s16, s33 ; GFX906-NEXT: s_mov_b32 s33, s32 ; GFX906-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX906-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, -1 -; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[18:19] -; GFX906-NEXT: ; implicit-def: $vgpr2 +; GFX906-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX906-NEXT: s_mov_b32 s21, s15 -; GFX906-NEXT: v_writelane_b32 v2, s6, 0 -; GFX906-NEXT: v_writelane_b32 v2, s7, 1 -; GFX906-NEXT: v_writelane_b32 v2, s21, 2 +; GFX906-NEXT: v_writelane_b32 v39, s6, 0 +; GFX906-NEXT: v_writelane_b32 v39, s7, 1 +; GFX906-NEXT: v_writelane_b32 v39, s21, 2 ; GFX906-NEXT: s_mov_b32 s22, s14 -; GFX906-NEXT: v_writelane_b32 v2, s22, 3 +; GFX906-NEXT: v_writelane_b32 v39, s22, 3 ; GFX906-NEXT: s_mov_b32 s23, s13 -; GFX906-NEXT: v_writelane_b32 v2, s23, 4 +; GFX906-NEXT: v_writelane_b32 v39, s23, 4 ; GFX906-NEXT: s_mov_b32 s24, s12 -; GFX906-NEXT: v_writelane_b32 v2, s24, 5 +; GFX906-NEXT: v_writelane_b32 v39, s24, 5 ; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX906-NEXT: v_writelane_b32 v2, s26, 6 +; GFX906-NEXT: v_writelane_b32 v39, s26, 6 ; GFX906-NEXT: v_writelane_b32 v41, s16, 4 -; GFX906-NEXT: v_writelane_b32 v2, s27, 7 +; GFX906-NEXT: v_writelane_b32 v39, s27, 7 ; GFX906-NEXT: v_writelane_b32 v41, s34, 2 -; GFX906-NEXT: v_writelane_b32 v2, s8, 8 +; GFX906-NEXT: v_writelane_b32 v39, s8, 8 ; GFX906-NEXT: v_writelane_b32 v41, s35, 3 -; GFX906-NEXT: v_writelane_b32 v2, s9, 9 +; GFX906-NEXT: v_writelane_b32 v39, s9, 9 ; 
GFX906-NEXT: v_writelane_b32 v41, s30, 0 -; GFX906-NEXT: v_writelane_b32 v2, s4, 10 +; GFX906-NEXT: v_writelane_b32 v39, s4, 10 ; GFX906-NEXT: s_addk_i32 s32, 0x2800 +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX906-NEXT: v_writelane_b32 v41, s31, 1 ; GFX906-NEXT: v_mov_b32_e32 v32, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: v_writelane_b32 v2, s5, 11 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: v_writelane_b32 v39, s5, 11 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v33, v2 ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def v[0:31] ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: 
buffer_store_dword v10, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, 
s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill 
+; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def v40 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s11 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v40, v33 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_writelane_b32 v40, s11, 12 +; GFX906-NEXT: v_writelane_b32 v39, s11, 12 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s12 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s12, 13 +; GFX906-NEXT: v_writelane_b32 v39, s12, 13 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s13 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s13, 14 +; GFX906-NEXT: v_writelane_b32 v39, s13, 14 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s14 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s14, 15 +; GFX906-NEXT: v_writelane_b32 v39, s14, 15 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s15 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s15, 16 +; GFX906-NEXT: v_writelane_b32 v39, s15, 16 ; GFX906-NEXT: s_getpc_b64 s[10:11] ; GFX906-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 ; GFX906-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s16 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: 
v_writelane_b32 v40, s16, 17 +; GFX906-NEXT: v_writelane_b32 v39, s16, 17 ; GFX906-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s17 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s17, 18 +; GFX906-NEXT: v_writelane_b32 v39, s17, 18 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s18 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s18, 19 +; GFX906-NEXT: v_writelane_b32 v39, s18, 19 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s19 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s19, 20 +; GFX906-NEXT: v_writelane_b32 v39, s19, 20 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s20 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s20, 21 +; GFX906-NEXT: v_writelane_b32 v39, s20, 21 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_writelane_b32 v40, s10, 22 -; GFX906-NEXT: v_writelane_b32 v40, s11, 23 +; GFX906-NEXT: v_writelane_b32 v39, s10, 22 +; GFX906-NEXT: v_writelane_b32 v39, s11, 23 +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 ; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: v_readlane_b32 s17, v39, 23 +; GFX906-NEXT: v_mov_b32_e32 v40, v32 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX906-NEXT: 
s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s11, v40, 12 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_readlane_b32 s11, v39, 12 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s11 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s12, v40, 13 +; GFX906-NEXT: v_readlane_b32 s12, v39, 13 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s12 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s13, v40, 14 +; GFX906-NEXT: v_readlane_b32 s13, v39, 14 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s13 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s14, v40, 15 +; GFX906-NEXT: v_readlane_b32 s14, v39, 15 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s14 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s15, v40, 16 +; GFX906-NEXT: v_readlane_b32 s15, v39, 16 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s15 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s16, v40, 17 +; GFX906-NEXT: v_readlane_b32 s16, v39, 17 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s16 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s17, v40, 18 +; GFX906-NEXT: v_readlane_b32 s17, v39, 18 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s17 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s18, v40, 19 +; GFX906-NEXT: v_readlane_b32 s18, v39, 19 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s18 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s19, v40, 20 +; GFX906-NEXT: v_readlane_b32 s19, v39, 20 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s19 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s20, v40, 21 +; GFX906-NEXT: v_readlane_b32 s20, v39, 21 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s20 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s21 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s21, 24 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s22 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s22, 25 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; 
def s23 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s23, 26 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s24 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s24, 27 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s25 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s25, 28 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s26 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s26, 29 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s27 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s27, 30 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s28 ; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v39, s28, 31 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; def s29 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: v_writelane_b32 v40, s21, 24 -; GFX906-NEXT: v_writelane_b32 v40, s22, 25 -; GFX906-NEXT: v_writelane_b32 v40, s23, 26 -; GFX906-NEXT: v_writelane_b32 v40, s24, 27 -; GFX906-NEXT: v_writelane_b32 v40, s25, 28 -; GFX906-NEXT: v_writelane_b32 v40, s26, 29 -; GFX906-NEXT: v_writelane_b32 v40, s27, 30 -; GFX906-NEXT: v_writelane_b32 v40, s28, 31 -; GFX906-NEXT: v_writelane_b32 v40, s29, 32 -; GFX906-NEXT: v_readlane_b32 s4, v40, 10 -; GFX906-NEXT: v_readlane_b32 s6, v40, 0 -; GFX906-NEXT: v_readlane_b32 s8, v40, 8 -; GFX906-NEXT: v_readlane_b32 s10, v40, 6 -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 -; GFX906-NEXT: v_readlane_b32 s12, v40, 5 -; GFX906-NEXT: v_readlane_b32 s13, v40, 4 -; GFX906-NEXT: v_readlane_b32 s14, v40, 3 -; GFX906-NEXT: v_readlane_b32 s15, v40, 2 -; GFX906-NEXT: v_readlane_b32 s5, v40, 11 -; GFX906-NEXT: v_readlane_b32 s7, v40, 1 -; GFX906-NEXT: v_readlane_b32 s9, v40, 9 -; GFX906-NEXT: v_readlane_b32 s11, v40, 7 -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 +; GFX906-NEXT: v_writelane_b32 v39, s29, 32 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: buffer_store_dword v39, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s4, v39, 10 +; GFX906-NEXT: v_readlane_b32 s6, v39, 0 +; GFX906-NEXT: v_readlane_b32 s8, v39, 8 +; GFX906-NEXT: v_readlane_b32 s10, v39, 6 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 +; GFX906-NEXT: v_readlane_b32 s12, v39, 5 +; GFX906-NEXT: v_mov_b32_e32 v31, v40 +; GFX906-NEXT: v_readlane_b32 s13, v39, 4 +; GFX906-NEXT: v_readlane_b32 s14, v39, 3 +; GFX906-NEXT: v_readlane_b32 s15, v39, 2 +; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s7, v39, 1 +; GFX906-NEXT: v_readlane_b32 s9, v39, 9 +; GFX906-NEXT: v_readlane_b32 s11, v39, 7 +; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s21, v40, 24 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_readlane_b32 s4, v39, 10 +; GFX906-NEXT: v_readlane_b32 s6, v39, 0 +; GFX906-NEXT: v_readlane_b32 s8, v39, 8 +; GFX906-NEXT: v_readlane_b32 s10, v39, 6 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 +; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s7, v39, 1 +; GFX906-NEXT: v_readlane_b32 s9, v39, 9 +; GFX906-NEXT: v_readlane_b32 s11, v39, 7 +; GFX906-NEXT: v_readlane_b32 s12, v39, 5 +; GFX906-NEXT: v_readlane_b32 s13, v39, 4 +; GFX906-NEXT: v_readlane_b32 s14, v39, 3 +; GFX906-NEXT: v_readlane_b32 s15, v39, 2 +; GFX906-NEXT: v_mov_b32_e32 v31, v40 +; GFX906-NEXT: v_readlane_b32 s17, v39, 23 +; GFX906-NEXT: v_readlane_b32 s21, v39, 24 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s21 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s22, v40, 25 +; GFX906-NEXT: v_readlane_b32 s22, v39, 25 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s22 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s23, v40, 26 +; 
GFX906-NEXT: v_readlane_b32 s23, v39, 26 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s23 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s24, v40, 27 +; GFX906-NEXT: v_readlane_b32 s24, v39, 27 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s24 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s25, v40, 28 +; GFX906-NEXT: v_readlane_b32 s25, v39, 28 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s25 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s26, v40, 29 +; GFX906-NEXT: v_readlane_b32 s26, v39, 29 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s26 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s27, v40, 30 +; GFX906-NEXT: v_readlane_b32 s27, v39, 30 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s27 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s28, v40, 31 +; GFX906-NEXT: v_readlane_b32 s28, v39, 31 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s28 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s29, v40, 32 +; GFX906-NEXT: v_readlane_b32 s29, v39, 32 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s29 ; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: v_readlane_b32 s4, v40, 10 -; GFX906-NEXT: v_readlane_b32 s6, v40, 0 -; GFX906-NEXT: v_readlane_b32 s8, v40, 8 -; GFX906-NEXT: v_readlane_b32 s10, v40, 6 -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 -; GFX906-NEXT: v_readlane_b32 s5, v40, 11 -; GFX906-NEXT: v_readlane_b32 s7, v40, 1 -; GFX906-NEXT: v_readlane_b32 s9, v40, 9 -; GFX906-NEXT: v_readlane_b32 s11, v40, 7 -; GFX906-NEXT: v_readlane_b32 s12, v40, 5 -; GFX906-NEXT: v_readlane_b32 s13, v40, 4 -; GFX906-NEXT: v_readlane_b32 s14, v40, 3 -; GFX906-NEXT: v_readlane_b32 s15, v40, 2 -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 
4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v23, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 
offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded 
Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 -; GFX906-NEXT: ; kill: killed $vgpr40 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 @@ -360,12 +360,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, -1 -; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[6:7] ; GFX906-NEXT: s_addk_i32 s32, 0xd800 ; GFX906-NEXT: s_mov_b32 s33, s4 @@ -378,346 +377,346 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b32 s16, s33 ; GFX908-NEXT: s_mov_b32 s33, s32 ; GFX908-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX908-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill -; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[18:19] -; GFX908-NEXT: v_mov_b32_e32 v3, s16 -; GFX908-NEXT: buffer_store_dword v3, off, 
s[0:3], s33 offset:168 ; 4-byte Folded Spill -; GFX908-NEXT: v_mov_b32_e32 v3, s34 -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill -; GFX908-NEXT: v_mov_b32_e32 v3, s35 -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; GFX908-NEXT: v_mov_b32_e32 v2, s16 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; GFX908-NEXT: v_mov_b32_e32 v2, s34 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX908-NEXT: v_mov_b32_e32 v2, s35 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill ; GFX908-NEXT: s_addk_i32 s32, 0x2c00 +; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 s[16:17], exec ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168 ; GFX908-NEXT: v_writelane_b32 v2, s30, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] ; GFX908-NEXT: s_mov_b64 s[16:17], exec ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168 ; GFX908-NEXT: v_writelane_b32 v2, s31, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] -; GFX908-NEXT: ; implicit-def: $vgpr2 +; GFX908-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX908-NEXT: s_mov_b32 s21, s15 -; GFX908-NEXT: v_writelane_b32 v2, s6, 0 -; GFX908-NEXT: v_writelane_b32 v2, s7, 1 -; GFX908-NEXT: v_writelane_b32 v2, s21, 2 +; GFX908-NEXT: v_writelane_b32 v39, s6, 0 +; GFX908-NEXT: v_writelane_b32 v39, s7, 1 +; GFX908-NEXT: v_writelane_b32 v39, s21, 2 ; GFX908-NEXT: s_mov_b32 s22, s14 -; GFX908-NEXT: v_writelane_b32 v2, s22, 3 +; GFX908-NEXT: v_writelane_b32 v39, s22, 3 ; GFX908-NEXT: s_mov_b32 s23, s13 -; GFX908-NEXT: v_writelane_b32 v2, s23, 4 +; GFX908-NEXT: v_writelane_b32 v39, s23, 4 ; GFX908-NEXT: s_mov_b32 s24, s12 -; GFX908-NEXT: v_writelane_b32 v2, s24, 5 +; GFX908-NEXT: v_writelane_b32 v39, s24, 5 ; GFX908-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX908-NEXT: v_writelane_b32 v2, s26, 6 -; GFX908-NEXT: v_writelane_b32 v2, s27, 7 -; GFX908-NEXT: v_writelane_b32 v2, s8, 8 -; GFX908-NEXT: v_writelane_b32 v2, s9, 9 -; GFX908-NEXT: v_writelane_b32 v2, s4, 10 +; GFX908-NEXT: v_writelane_b32 v39, s26, 6 +; GFX908-NEXT: v_writelane_b32 v39, s27, 7 +; GFX908-NEXT: v_writelane_b32 v39, s8, 8 +; GFX908-NEXT: v_writelane_b32 v39, s9, 9 +; GFX908-NEXT: v_writelane_b32 v39, s4, 10 ; GFX908-NEXT: v_mov_b32_e32 v32, v31 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX908-NEXT: v_writelane_b32 v2, s5, 11 +; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX908-NEXT: v_writelane_b32 v39, s5, 11 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v33, v2 ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v[0:31] ; GFX908-NEXT: ;;#ASMEND -; 
GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX908-NEXT: 
buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], 
s33 offset:76 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v40 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s11 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v40, v33 -; GFX908-NEXT: 
s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_writelane_b32 v40, s11, 12 +; GFX908-NEXT: v_writelane_b32 v39, s11, 12 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s12 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s12, 13 +; GFX908-NEXT: v_writelane_b32 v39, s12, 13 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s13 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s13, 14 +; GFX908-NEXT: v_writelane_b32 v39, s13, 14 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s14 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s14, 15 +; GFX908-NEXT: v_writelane_b32 v39, s14, 15 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s15 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s15, 16 +; GFX908-NEXT: v_writelane_b32 v39, s15, 16 ; GFX908-NEXT: s_getpc_b64 s[10:11] ; GFX908-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s16 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s16, 17 +; GFX908-NEXT: v_writelane_b32 v39, s16, 17 ; GFX908-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s17 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s17, 18 +; GFX908-NEXT: v_writelane_b32 v39, s17, 18 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s18 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s18, 19 +; GFX908-NEXT: v_writelane_b32 v39, s18, 19 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s19 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s19, 20 +; GFX908-NEXT: v_writelane_b32 v39, s19, 20 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s20 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s20, 21 +; GFX908-NEXT: v_writelane_b32 v39, s20, 21 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_writelane_b32 v40, s10, 22 -; GFX908-NEXT: v_writelane_b32 v40, s11, 23 +; GFX908-NEXT: v_writelane_b32 v39, s10, 22 +; 
GFX908-NEXT: v_writelane_b32 v39, s11, 23 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: v_readlane_b32 s17, v39, 23 +; GFX908-NEXT: v_mov_b32_e32 v40, v32 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s11, v40, 12 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readlane_b32 s11, v39, 12 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s11 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s12, v40, 13 +; GFX908-NEXT: v_readlane_b32 s12, v39, 13 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s12 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s13, v40, 14 +; GFX908-NEXT: v_readlane_b32 s13, v39, 14 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s13 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s14, v40, 15 +; GFX908-NEXT: v_readlane_b32 s14, v39, 15 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s14 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s15, v40, 16 +; GFX908-NEXT: v_readlane_b32 s15, v39, 16 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s15 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s16, v40, 17 +; GFX908-NEXT: v_readlane_b32 s16, v39, 17 ; 
GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s16 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s17, v40, 18 +; GFX908-NEXT: v_readlane_b32 s17, v39, 18 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s17 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s18, v40, 19 +; GFX908-NEXT: v_readlane_b32 s18, v39, 19 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s18 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s19, v40, 20 +; GFX908-NEXT: v_readlane_b32 s19, v39, 20 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s19 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s20, v40, 21 +; GFX908-NEXT: v_readlane_b32 s20, v39, 21 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s20 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s21 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s21, 24 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s22 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s22, 25 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s23 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s23, 26 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s24 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s24, 27 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s25 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s25, 28 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s26 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s26, 29 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s27 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s27, 30 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s28 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v39, s28, 31 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def s29 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX908-NEXT: v_writelane_b32 v40, s21, 24 -; GFX908-NEXT: v_writelane_b32 v40, s22, 25 -; GFX908-NEXT: v_writelane_b32 
v40, s23, 26 -; GFX908-NEXT: v_writelane_b32 v40, s24, 27 -; GFX908-NEXT: v_writelane_b32 v40, s25, 28 -; GFX908-NEXT: v_writelane_b32 v40, s26, 29 -; GFX908-NEXT: v_writelane_b32 v40, s27, 30 -; GFX908-NEXT: v_writelane_b32 v40, s28, 31 -; GFX908-NEXT: v_writelane_b32 v40, s29, 32 -; GFX908-NEXT: v_readlane_b32 s4, v40, 10 -; GFX908-NEXT: v_readlane_b32 s6, v40, 0 -; GFX908-NEXT: v_readlane_b32 s8, v40, 8 -; GFX908-NEXT: v_readlane_b32 s10, v40, 6 -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 -; GFX908-NEXT: v_readlane_b32 s12, v40, 5 -; GFX908-NEXT: v_readlane_b32 s13, v40, 4 -; GFX908-NEXT: v_readlane_b32 s14, v40, 3 -; GFX908-NEXT: v_readlane_b32 s15, v40, 2 -; GFX908-NEXT: v_readlane_b32 s5, v40, 11 -; GFX908-NEXT: v_readlane_b32 s7, v40, 1 -; GFX908-NEXT: v_readlane_b32 s9, v40, 9 -; GFX908-NEXT: v_readlane_b32 s11, v40, 7 -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 +; GFX908-NEXT: v_writelane_b32 v39, s29, 32 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s4, v39, 10 +; GFX908-NEXT: v_readlane_b32 s6, v39, 0 +; GFX908-NEXT: v_readlane_b32 s8, v39, 8 +; GFX908-NEXT: v_readlane_b32 s10, v39, 6 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 +; GFX908-NEXT: v_readlane_b32 s12, v39, 5 +; GFX908-NEXT: v_mov_b32_e32 v31, v40 +; GFX908-NEXT: v_readlane_b32 s13, v39, 4 +; GFX908-NEXT: v_readlane_b32 s14, v39, 3 +; GFX908-NEXT: v_readlane_b32 s15, v39, 2 +; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s7, v39, 1 +; GFX908-NEXT: v_readlane_b32 s9, v39, 9 +; GFX908-NEXT: v_readlane_b32 s11, v39, 7 +; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s21, 
v40, 24 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readlane_b32 s4, v39, 10 +; GFX908-NEXT: v_readlane_b32 s6, v39, 0 +; GFX908-NEXT: v_readlane_b32 s8, v39, 8 +; GFX908-NEXT: v_readlane_b32 s10, v39, 6 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 +; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s7, v39, 1 +; GFX908-NEXT: v_readlane_b32 s9, v39, 9 +; GFX908-NEXT: v_readlane_b32 s11, v39, 7 +; GFX908-NEXT: v_readlane_b32 s12, v39, 5 +; GFX908-NEXT: v_readlane_b32 s13, v39, 4 +; GFX908-NEXT: v_readlane_b32 s14, v39, 3 +; GFX908-NEXT: v_readlane_b32 s15, v39, 2 +; GFX908-NEXT: v_mov_b32_e32 v31, v40 +; GFX908-NEXT: v_readlane_b32 s17, v39, 23 +; GFX908-NEXT: v_readlane_b32 s21, v39, 24 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s21 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s22, v40, 25 +; GFX908-NEXT: v_readlane_b32 s22, v39, 25 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s22 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s23, v40, 26 +; GFX908-NEXT: v_readlane_b32 s23, v39, 26 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s23 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s24, v40, 27 +; GFX908-NEXT: v_readlane_b32 s24, v39, 27 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s24 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s25, v40, 28 +; GFX908-NEXT: v_readlane_b32 s25, v39, 28 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s25 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s26, v40, 29 +; GFX908-NEXT: v_readlane_b32 s26, v39, 29 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s26 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s27, v40, 30 +; GFX908-NEXT: v_readlane_b32 s27, v39, 30 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s27 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s28, v40, 31 +; GFX908-NEXT: v_readlane_b32 s28, v39, 31 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s28 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s29, v40, 
32 +; GFX908-NEXT: v_readlane_b32 s29, v39, 32 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use s29 ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX908-NEXT: v_readlane_b32 s4, v40, 10 -; GFX908-NEXT: v_readlane_b32 s6, v40, 0 -; GFX908-NEXT: v_readlane_b32 s8, v40, 8 -; GFX908-NEXT: v_readlane_b32 s10, v40, 6 -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 -; GFX908-NEXT: v_readlane_b32 s5, v40, 11 -; GFX908-NEXT: v_readlane_b32 s7, v40, 1 -; GFX908-NEXT: v_readlane_b32 s9, v40, 9 -; GFX908-NEXT: v_readlane_b32 s11, v40, 7 -; GFX908-NEXT: v_readlane_b32 s12, v40, 5 -; GFX908-NEXT: v_readlane_b32 s13, v40, 4 -; GFX908-NEXT: v_readlane_b32 s14, v40, 3 -; GFX908-NEXT: v_readlane_b32 s15, v40, 2 -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded 
Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX908-NEXT: 
buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 
offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 s[4:5], exec ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 @@ -737,37 +736,34 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s31, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], exec 
; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s30, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload -; GFX908-NEXT: ; kill: killed $vgpr40 +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s4, v0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s34, v0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s35, v0 ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload -; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_addk_i32 s32, 
0xd400 ; GFX908-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll index d58477c194ea6..c0d228e1254e6 100644 --- a/llvm/test/CodeGen/AMDGPU/recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/recursion.ll @@ -3,7 +3,11 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s ; CHECK-LABEL: {{^}}recursive: +; CHECK: .set recursive.private_seg_size, 16+(max(16384)) ; CHECK: ScratchSize: 16 + +; V5-LABEL: {{^}}recursive: +; V5: .set recursive.has_recursion, 1 define void @recursive() { call void @recursive() store volatile i32 0, ptr addrspace(1) undef @@ -11,18 +15,22 @@ define void @recursive() { } ; CHECK-LABEL: {{^}}tail_recursive: +; CHECK: .set tail_recursive.private_seg_size, 0 ; CHECK: ScratchSize: 0 define void @tail_recursive() { tail call void @tail_recursive() ret void } +; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size)) define void @calls_tail_recursive() norecurse { tail call void @tail_recursive() ret void } ; CHECK-LABEL: {{^}}tail_recursive_with_stack: +; CHECK: .set tail_recursive_with_stack.private_seg_size, 8 +; CHECK: .set tail_recursive_with_stack.has_recursion, 1 define void @tail_recursive_with_stack() { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() { ; For an arbitrary recursive call, report a large number for unknown stack ; usage for code object v4 and older ; CHECK-LABEL: {{^}}calls_recursive: -; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}} +; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size)) ; ; V5-LABEL: {{^}}calls_recursive: -; V5: .amdhsa_private_segment_fixed_size 0{{$}} -; V5: .amdhsa_uses_dynamic_stack 1 +; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size)) +; V5: .set 
calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack) define amdgpu_kernel void @calls_recursive() { call void @recursive() ret void @@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() { ; Make sure we do not report a huge stack size for tail recursive ; functions ; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive: -; CHECK: .amdhsa_private_segment_fixed_size 0{{$}} +; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size)) define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() { call void @calls_tail_recursive() ret void @@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() { ; in the kernel. ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive: -; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size)) ; ; V5-LABEL: {{^}}kernel_calls_tail_recursive: -; V5: .amdhsa_private_segment_fixed_size 0{{$}} -; V5: .amdhsa_uses_dynamic_stack 1 +; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size)) +; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion) define amdgpu_kernel void @kernel_calls_tail_recursive() { call void @tail_recursive() ret void } ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack: -; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size)) ; ; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack: -; V5: .amdhsa_private_segment_fixed_size 8{{$}} -; V5: .amdhsa_uses_dynamic_stack 1 +; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size)) +; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, 
tail_recursive_with_stack.has_dyn_sized_stack) define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() { call void @tail_recursive_with_stack() ret void diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index 447a8bf9956f3..fe01728c00563 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -1,5 +1,5 @@ -# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s -# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s +# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s # FIXME: We should not produce a verifier error after erroring diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll index 8d12b3fe626da..35e11ad6a648b 100644 --- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; SI-LABEL: {{^}}foo: ; SI: .section .AMDGPU.csdata ; SI: ; Kernel info: -; SI: ; NumSgprs: {{[0-9]+}} +; SI: ; TotalNumSgprs: {{[0-9]+}} ; SI: ; NumVgprs: {{[0-9]+}} define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %abase, ptr addrspace(1) %bbase) nounwind { %mbcnt.lo = call i32 
@llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 19cc60963e900..f7f5bd56fa6f1 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -243,350 +243,345 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 
killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 -; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s10, 2 +; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v2, vcc +; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def 
$vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 -; GFX9-O0-NEXT: 
v_sub_co_u32_e32 v14, vcc, s10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v11, v10, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v8, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v13, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v9, vcc +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, s10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v10, v9, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v8, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[18:19], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[17:18], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v13, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; 
GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 
v12, v4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v8, v8, v9 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 ; GFX9-O0-NEXT: s_mov_b32 s12, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 -; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-O0-NEXT: ; kill: def 
$vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 ; GFX9-O0-NEXT: s_mov_b32 s18, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s18 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], 
v[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v5, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 ; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[12:13], v11, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v5, v5, v8, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 
0, 1, s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec 
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill 
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -594,67 +589,66 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -687,123 +681,117 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], 
s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -823,22 +811,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, 
v6, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 @@ -854,149 +842,149 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 -; GFX9-O0-NEXT: v_or_b32_e64 
v17, v17, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], 
s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], v12, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 ; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], v5, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 ; 
GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s7 @@ -1015,12 +1003,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 @@ -1032,429 +1020,428 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v4, 
s10, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX9-O0-NEXT: ; 
kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: 
s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[17:18] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_add3_u32 v2, v0, v2, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], s4, v[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-O0-NEXT: v_add3_u32 v8, v0, v5, v8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 +; 
GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 +; GFX9-O0-NEXT: v_mul_lo_u32 v9, v8, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 -; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v0 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v2, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v20 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v14, v14, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v8, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, 
v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v11, v12 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 
v14, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v16, s[6:7], v14, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v8, v9, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v12 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; 
GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v20, v9, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v0, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, 
v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v6, v19, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v19, v19, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 ; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed 
$sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v20, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v0, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v1, v19, s[6:7] 
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s4, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v19 -; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[0:1] -; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v19, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v6, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v5, v6 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[18:19], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], s4, v[8:9] +; 
GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v6 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v12 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 @@ -1462,53 +1449,48 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 
killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1725,266 +1707,258 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def 
$vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 ; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, 
s9 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-O0-NEXT: 
v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, 
v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 -; GFX9-O0-NEXT: 
v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1992,11 +1966,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -2025,20 +1999,19 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2085,13 +2058,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 
4-byte Folded Reload @@ -2100,9 +2066,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2116,92 +2088,87 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -2221,22 +2188,22 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 -; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, 
v22 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 @@ -2252,66 +2219,66 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, 
off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], 
-1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2349,52 +2316,52 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], v12, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 ; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], v5, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s6, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, 
v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s7 @@ -2413,12 +2380,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 @@ -2430,7 +2397,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2443,10 +2410,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: 
v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2474,403 +2442,396 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 
; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: 
; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 
killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, 
s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_mul_lo_u32 v5, v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mul_lo_u32 v4, v5, v2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 -; GFX9-O0-NEXT: v_mad_u64_u32 v[13:14], s[6:7], v7, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v6, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v6, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[3:4], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v8 -; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 -; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mul_lo_u32 v10, v10, v4 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v10, v11 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v5, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 +; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def 
$vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[6:7], v7, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v8, v15, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v7, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v17, v16, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v5, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; 
kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v19, v6, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v18, v5, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, 
v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v6, v15, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 -; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v8, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v5, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], s4, v[4:5] +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v14, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v6, v7 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v8 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v3, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, 
v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 
v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/remat-vop.mir b/llvm/test/CodeGen/AMDGPU/remat-vop.mir index 248a9e2ddb636..4f6ea44ccf68b 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-vop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-vop.mir @@ -7,12 +7,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mov_b32_e32 - ; GCN: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_2]] ; GCN-NEXT: 
S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 2, implicit $exec @@ -31,16 +31,12 @@ body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_mov_b32_e32_impuse ; GCN: $m0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec, implicit $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec, implicit $m0 - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec, implicit $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit $m0 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec, implicit $m0 + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec, implicit $m0 + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 $m0 = IMPLICIT_DEF %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit $m0 @@ -59,12 +55,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_remat_v_mov_b32_e32_exec_def - ; GCN: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: 
S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_2]] ; GCN-NEXT: $exec = S_ANDN2_B64_term $exec, undef renamable $sgpr0_sgpr1, implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec @@ -82,12 +78,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mov_b32_e64 - ; GCN: renamable $vgpr0 = V_MOV_B32_e64 1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e64 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e64 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MOV_B32_e64_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e64 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e64 2, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e64 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e64 1, implicit $exec %1:vgpr_32 = V_MOV_B32_e64 2, implicit $exec @@ -105,16 +101,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_mov_b32_dpp - ; GCN: renamable $vgpr0 = V_MOV_B32_dpp undef $vgpr0, undef $vgpr0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 
(s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_dpp undef $vgpr1, undef $vgpr0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_dpp undef $vgpr0, undef $vgpr0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp undef [[V_MOV_B32_dpp]], undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp undef [[V_MOV_B32_dpp1]], undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp undef [[V_MOV_B32_dpp2]], undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_dpp]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_dpp1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_dpp2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec %2:vgpr_32 = V_MOV_B32_dpp undef %2:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec @@ -130,12 +122,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_accvgpr_read_b32 - ; GCN: renamable $vgpr0 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = 
V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ACCVGPR_READ_B32_e64_:%[0-9]+]]:vgpr_32 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_READ_B32_e64_1:%[0-9]+]]:vgpr_32 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_READ_B32_e64_2:%[0-9]+]]:vgpr_32 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_READ_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_READ_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_READ_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec %1:vgpr_32 = V_ACCVGPR_READ_B32_e64 undef $agpr0, implicit $exec @@ -151,12 +143,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_accvgpr_write_b32 - ; GCN: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec - ; GCN-NEXT: renamable $agpr1 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr1 - ; GCN-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_WRITE_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_WRITE_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_WRITE_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec %1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec @@ -172,12 +164,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: 
test_remat_v_mov_b64_pseudo - ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_MOV_B64_PSEUDO 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 2, implicit $exec + ; GCN-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1, implicit $exec %1:vreg_64_align2 = V_MOV_B64_PSEUDO 2, implicit $exec @@ -193,12 +185,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_i32_f64_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, 
implicit [[V_CVT_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode @@ -216,16 +208,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cvt_i32_f64_e32_fp_except - ; GCN: renamable $vgpr0 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr0 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode @@ -245,16 +233,12 @@ body: | bb.0: ; GCN-LABEL: name: 
test_no_remat_v_cvt_i32_f64_e32_mode_def ; GCN: $mode = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 $mode = IMPLICIT_DEF %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode @@ -271,12 +255,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_i32_f64_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e64 0, 1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_I32_F64_e64 0, 2, 0, 0, implicit $exec, implicit 
$mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e64 0, 3, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, 1, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, 2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, 3, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, 1, 0, 0, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, 2, 0, 0, implicit $exec, implicit $mode @@ -294,12 +278,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_i32_f64_e64_undef - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e64 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_I32_F64_e64 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F64_e64 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_e64_2:%[0-9]+]]:vgpr_32 = 
nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64, 0, 0, implicit $exec, implicit $mode @@ -317,16 +301,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cvt_i32_f64_dpp - ; GCN: renamable $vgpr0 = V_CVT_I32_F64_dpp undef $vgpr0, 0, undef $vgpr0_vgpr1, 336, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CVT_I32_F64_dpp undef $vgpr1, 0, undef $vgpr0_vgpr1, 336, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr0 = V_CVT_I32_F64_dpp undef $vgpr0, 0, undef $vgpr0_vgpr1, 336, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_dpp undef [[V_CVT_I32_F64_dpp]], 0, undef %1:vreg_64_align2, 336, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_dpp1:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_dpp undef [[V_CVT_I32_F64_dpp1]], 0, undef %1:vreg_64_align2, 336, 0, 0, 0, implicit 
$exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F64_dpp2:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_dpp undef [[V_CVT_I32_F64_dpp2]], 0, undef %1:vreg_64_align2, 336, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_dpp]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_dpp1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_dpp2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_I32_F64_dpp undef %1:vgpr_32, 0, undef %0:vreg_64_align2, 336, 0, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CVT_I32_F64_dpp undef %2:vgpr_32, 0, undef %0:vreg_64_align2, 336, 0, 0, 0, implicit $exec, implicit $mode @@ -344,16 +324,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cvt_i32_f64_e32_imp_def - ; GCN: renamable $vgpr0 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 + ; GCN-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + ; 
GCN-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 %1:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 @@ -371,16 +347,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cvt_i32_f64_e32_imp_use - ; GCN: renamable $vgpr0 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit $m0 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit $m0 - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit $m0 + ; GCN-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit $m0 + ; GCN-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit $m0 + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]] + ; 
GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit $m0 %1:vgpr_32 = V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit $m0 @@ -396,12 +368,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f64_i32_e32 - ; GCN: renamable $vgpr0_vgpr1 = V_CVT_F64_I32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_CVT_F64_I32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_CVT_F64_I32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_I32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_I32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_I32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = V_CVT_F64_I32_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = V_CVT_F64_I32_e32 2, implicit $exec, implicit $mode @@ -417,12 +389,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_f64_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_F32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_F32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_F32_F64_e32 3, 
implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_F32_F64_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_F32_F64_e32 2, implicit $exec, implicit $mode @@ -438,12 +410,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f64_f32_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_CVT_F64_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_CVT_F64_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_CVT_F64_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_CVT_F64_F32_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_CVT_F64_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_F32_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_CVT_F64_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_F32_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_CVT_F64_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_CVT_F64_F32_e32 1, implicit 
$exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_CVT_F64_F32_e32 2, implicit $exec, implicit $mode @@ -459,12 +431,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_u32_f64_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_U32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_U32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_U32_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_U32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_U32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_U32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_U32_F64_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_U32_F64_e32 2, implicit $exec, implicit $mode @@ -480,12 +452,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f64_u32_e32 - ; GCN: renamable $vgpr0_vgpr1 = V_CVT_F64_U32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_CVT_F64_U32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_CVT_F64_U32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_CVT_F64_U32_e32_:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_U32_e32 1, 
implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_U32_e32_1:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_U32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F64_U32_e32_2:%[0-9]+]]:vreg_64_align2 = V_CVT_F64_U32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F64_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = V_CVT_F64_U32_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = V_CVT_F64_U32_e32 2, implicit $exec, implicit $mode @@ -501,12 +473,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_i32_e32 - ; GCN: renamable $vgpr0 = V_CVT_F32_I32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_F32_I32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_F32_I32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_I32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_I32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_F32_I32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_F32_I32_e32 2, implicit $exec, implicit $mode @@ -522,12 +494,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_i32_sdwa - ; GCN: renamable $vgpr0 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: 
renamable $vgpr1 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_I32_sdwa:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_I32_sdwa1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_I32_sdwa2:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit $mode @@ -547,16 +519,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cvt_f32_i32_sdwa_dst_unused_preserve - ; GCN: renamable $vgpr0 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef $vgpr0(tied-def 0) - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef $vgpr1(tied-def 0) - ; GCN-NEXT: renamable $vgpr0 = V_CVT_F32_I32_sdwa 0, undef $vgpr0, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef $vgpr0(tied-def 0) - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into 
%stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_I32_sdwa:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef [[V_CVT_F32_I32_sdwa]](tied-def 0) + ; GCN-NEXT: [[V_CVT_F32_I32_sdwa1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef [[V_CVT_F32_I32_sdwa1]](tied-def 0) + ; GCN-NEXT: [[V_CVT_F32_I32_sdwa2:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %1:vgpr_32, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef [[V_CVT_F32_I32_sdwa2]](tied-def 0) + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_I32_sdwa2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %0:vgpr_32, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef %1:vgpr_32(tied-def 0) %2:vgpr_32 = V_CVT_F32_I32_sdwa 0, undef %0:vgpr_32, 0, 0, 0, 2, 0, implicit $exec, implicit $mode, implicit undef %2:vgpr_32(tied-def 0) @@ -572,12 +540,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_u32_e32 - ; GCN: renamable $vgpr0 = V_CVT_F32_U32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_F32_U32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_F32_U32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, 
implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_U32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_U32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_F32_U32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_F32_U32_e32 2, implicit $exec, implicit $mode @@ -593,12 +561,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_u32_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_U32_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_U32_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_U32_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_U32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_U32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_U32_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 2, implicit $exec, implicit $mode @@ -614,12 +582,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: 
name: test_remat_v_cvt_i32_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_I32_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode @@ -635,12 +603,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_f16_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_F32_F16_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_F32_F16_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_F32_F16_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: 
[[V_CVT_F32_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_F16_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CVT_F32_F16_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CVT_F32_F16_e32 2, implicit $exec, implicit $mode @@ -656,12 +624,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_rpi_i32_f32_e32 - ; GCN: renamable $vgpr0 = V_CVT_RPI_I32_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_RPI_I32_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_RPI_I32_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_RPI_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_RPI_I32_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_RPI_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_RPI_I32_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_RPI_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_RPI_I32_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_RPI_I32_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_RPI_I32_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_RPI_I32_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_RPI_I32_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_RPI_I32_F32_e32 2, implicit $exec, implicit $mode @@ -677,12 +645,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_flr_i32_f32_e32 - ; GCN: renamable $vgpr0 = V_CVT_FLR_I32_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_FLR_I32_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed 
renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_FLR_I32_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_FLR_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_FLR_I32_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_FLR_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_FLR_I32_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_FLR_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_FLR_I32_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_FLR_I32_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_FLR_I32_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_FLR_I32_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_FLR_I32_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_FLR_I32_F32_e32 2, implicit $exec, implicit $mode @@ -698,12 +666,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_off_f32_i4_e32 - ; GCN: renamable $vgpr0 = V_CVT_OFF_F32_I4_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_OFF_F32_I4_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_OFF_F32_I4_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_OFF_F32_I4_e32_:%[0-9]+]]:vgpr_32 = V_CVT_OFF_F32_I4_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_OFF_F32_I4_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_OFF_F32_I4_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_OFF_F32_I4_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_OFF_F32_I4_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_OFF_F32_I4_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_OFF_F32_I4_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_OFF_F32_I4_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = 
V_CVT_OFF_F32_I4_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_OFF_F32_I4_e32 2, implicit $exec, implicit $mode @@ -719,12 +687,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_f32_ubyte0_e32 - ; GCN: renamable $vgpr0 = V_CVT_F32_UBYTE0_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CVT_F32_UBYTE0_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_F32_UBYTE0_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_F32_UBYTE0_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_UBYTE0_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_F32_UBYTE0_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_UBYTE0_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_UBYTE0_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_F32_UBYTE0_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_CVT_F32_UBYTE0_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_CVT_F32_UBYTE0_e32 2, implicit $exec, implicit $mode @@ -740,12 +708,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fract_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_FRACT_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FRACT_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FRACT_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FRACT_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FRACT_F32_e32 1, implicit $exec, implicit $mode + ; 
GCN-NEXT: [[V_FRACT_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FRACT_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FRACT_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FRACT_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_FRACT_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_FRACT_F32_e32 2, implicit $exec, implicit $mode @@ -761,12 +729,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_trunc_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_TRUNC_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_TRUNC_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_TRUNC_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_TRUNC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_TRUNC_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_TRUNC_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_TRUNC_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_TRUNC_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_TRUNC_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_TRUNC_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_TRUNC_F32_e32 2, implicit $exec, implicit $mode @@ -782,12 +750,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ceil_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CEIL_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable 
$vgpr1 = nofpexcept V_CEIL_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CEIL_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CEIL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CEIL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CEIL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CEIL_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CEIL_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CEIL_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_CEIL_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_CEIL_F32_e32 2, implicit $exec, implicit $mode @@ -803,12 +771,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rndne_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_RNDNE_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RNDNE_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_RNDNE_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_RNDNE_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RNDNE_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RNDNE_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RNDNE_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RNDNE_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RNDNE_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_RNDNE_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RNDNE_F32_e32_1]] + ; 
GCN-NEXT: S_NOP 0, implicit [[V_RNDNE_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_RNDNE_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_RNDNE_F32_e32 2, implicit $exec, implicit $mode @@ -824,12 +792,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_floor_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_FLOOR_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FLOOR_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FLOOR_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FLOOR_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FLOOR_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FLOOR_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FLOOR_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FLOOR_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FLOOR_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_FLOOR_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_FLOOR_F32_e32 2, implicit $exec, implicit $mode @@ -845,12 +813,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_exp_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_EXP_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_EXP_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_EXP_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: 
[[V_EXP_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_EXP_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_EXP_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_EXP_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_EXP_F32_e32 2, implicit $exec, implicit $mode @@ -866,12 +834,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_log_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_LOG_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LOG_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_LOG_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LOG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LOG_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LOG_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_LOG_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_LOG_F32_e32 2, implicit $exec, implicit $mode @@ -887,12 +855,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rcp_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_RCP_F32_e32 1, 
implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_RCP_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_RCP_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_RCP_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_RCP_F32_e32 2, implicit $exec, implicit $mode @@ -908,12 +876,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rcp_iflag_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_RCP_IFLAG_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_IFLAG_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_RCP_IFLAG_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_RCP_IFLAG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_IFLAG_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_IFLAG_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit 
[[V_RCP_IFLAG_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_IFLAG_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_IFLAG_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 2, implicit $exec, implicit $mode @@ -929,12 +897,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rsq_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_RSQ_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RSQ_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_RSQ_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_RSQ_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RSQ_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RSQ_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_RSQ_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_RSQ_F32_e32 2, implicit $exec, implicit $mode @@ -950,12 +918,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sqrt_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_SQRT_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SQRT_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_SQRT_F32_e32 3, implicit $exec, implicit 
$mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SQRT_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SQRT_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SQRT_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_SQRT_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_SQRT_F32_e32 2, implicit $exec, implicit $mode @@ -971,12 +939,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rcp_f64_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_RCP_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_RCP_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_RCP_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_RCP_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RCP_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_F64_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RCP_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RCP_F64_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RCP_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RCP_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_RCP_F64_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_RCP_F64_e32 2, implicit $exec, implicit $mode @@ -992,12 
+960,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_rsq_f64_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_RSQ_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_RSQ_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_RSQ_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_RSQ_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RSQ_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RSQ_F64_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RSQ_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_RSQ_F64_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_RSQ_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_RSQ_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_RSQ_F64_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_RSQ_F64_e32 2, implicit $exec, implicit $mode @@ -1013,12 +981,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sqrt_f64_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_SQRT_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_SQRT_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_SQRT_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_SQRT_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_SQRT_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: 
[[V_SQRT_F64_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_SQRT_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SQRT_F64_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_SQRT_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SQRT_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_SQRT_F64_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_SQRT_F64_e32 2, implicit $exec, implicit $mode @@ -1034,12 +1002,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sin_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_SIN_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SIN_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_SIN_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SIN_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SIN_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SIN_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SIN_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SIN_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SIN_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_SIN_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_SIN_F32_e32 2, implicit $exec, implicit $mode @@ -1055,12 +1023,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cos_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_COS_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept 
V_COS_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_COS_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_COS_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_COS_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_COS_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_COS_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_COS_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_COS_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_COS_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_COS_F32_e32 2, implicit $exec, implicit $mode @@ -1076,12 +1044,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_not_b32_e32 - ; GCN: renamable $vgpr0 = V_NOT_B32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_NOT_B32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_NOT_B32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_NOT_B32_e32_2:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_NOT_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_NOT_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_NOT_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_NOT_B32_e32 1, implicit $exec, implicit 
$mode %1:vgpr_32 = V_NOT_B32_e32 2, implicit $exec, implicit $mode @@ -1097,12 +1065,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bfrev_b32_e32 - ; GCN: renamable $vgpr0 = V_BFREV_B32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_BFREV_B32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BFREV_B32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BFREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_BFREV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_BFREV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_BFREV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFREV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFREV_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_BFREV_B32_e32 2, implicit $exec, implicit $mode @@ -1118,12 +1086,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ffbh_u32_e32 - ; GCN: renamable $vgpr0 = V_FFBH_U32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_FFBH_U32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_FFBH_U32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FFBH_U32_e32_:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBH_U32_e32_1:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBH_U32_e32_2:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e32 3, implicit 
$exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_FFBH_U32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_FFBH_U32_e32 2, implicit $exec, implicit $mode @@ -1139,12 +1107,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ffbl_b32_e32 - ; GCN: renamable $vgpr0 = V_FFBL_B32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_FFBL_B32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_FFBL_B32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FFBL_B32_e32_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBL_B32_e32_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBL_B32_e32_2:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBL_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBL_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBL_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_FFBL_B32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_FFBL_B32_e32 2, implicit $exec, implicit $mode @@ -1160,12 +1128,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ffbh_i32_e32 - ; GCN: renamable $vgpr0 = V_FFBH_I32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_FFBH_I32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_FFBH_I32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: 
[[V_FFBH_I32_e32_:%[0-9]+]]:vgpr_32 = V_FFBH_I32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBH_I32_e32_1:%[0-9]+]]:vgpr_32 = V_FFBH_I32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FFBH_I32_e32_2:%[0-9]+]]:vgpr_32 = V_FFBH_I32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FFBH_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_FFBH_I32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_FFBH_I32_e32 2, implicit $exec, implicit $mode @@ -1181,12 +1149,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_frexp_exp_i32_f64_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_FREXP_EXP_I32_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FREXP_EXP_I32_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FREXP_EXP_I32_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FREXP_EXP_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_EXP_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_EXP_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F64_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F64_e32 2, implicit $exec, implicit $mode @@ -1202,12 +1170,12 @@ tracksRegLiveness: true body: | 
bb.0: ; GCN-LABEL: name: test_remat_v_frexp_mant_f64_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_FREXP_MANT_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_FREXP_MANT_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_FREXP_MANT_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_FREXP_MANT_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FREXP_MANT_F64_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_MANT_F64_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FREXP_MANT_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_MANT_F64_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FREXP_MANT_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_FREXP_MANT_F64_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_FREXP_MANT_F64_e32 2, implicit $exec, implicit $mode @@ -1223,12 +1191,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fract_f64_e32 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_FRACT_F64_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_FRACT_F64_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_FRACT_F64_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_FRACT_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FRACT_F64_e32 1, implicit 
$exec, implicit $mode + ; GCN-NEXT: [[V_FRACT_F64_e32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FRACT_F64_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FRACT_F64_e32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FRACT_F64_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F64_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F64_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FRACT_F64_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vreg_64_align2 = nofpexcept V_FRACT_F64_e32 1, implicit $exec, implicit $mode %1:vreg_64_align2 = nofpexcept V_FRACT_F64_e32 2, implicit $exec, implicit $mode @@ -1244,12 +1212,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_frexp_exp_i32_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_FREXP_EXP_I32_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FREXP_EXP_I32_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FREXP_EXP_I32_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FREXP_EXP_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_EXP_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_EXP_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_EXP_I32_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_FREXP_EXP_I32_F32_e32 2, implicit $exec, implicit $mode @@ -1265,12 +1233,12 @@ tracksRegLiveness: 
true body: | bb.0: ; GCN-LABEL: name: test_remat_v_frexp_mant_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_FREXP_MANT_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FREXP_MANT_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FREXP_MANT_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FREXP_MANT_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_MANT_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_MANT_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_MANT_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FREXP_MANT_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FREXP_MANT_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FREXP_MANT_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_FREXP_MANT_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_FREXP_MANT_F32_e32 2, implicit $exec, implicit $mode @@ -1286,12 +1254,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_exp_legacy_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_EXP_LEGACY_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_EXP_LEGACY_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_EXP_LEGACY_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_EXP_LEGACY_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_LEGACY_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: 
[[V_EXP_LEGACY_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_LEGACY_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_EXP_LEGACY_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_LEGACY_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_LEGACY_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_LEGACY_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_EXP_LEGACY_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_EXP_LEGACY_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_EXP_LEGACY_F32_e32 2, implicit $exec, implicit $mode @@ -1307,12 +1275,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_log_legacy_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_LOG_LEGACY_F32_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LOG_LEGACY_F32_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_LOG_LEGACY_F32_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LOG_LEGACY_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_LEGACY_F32_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LOG_LEGACY_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_LEGACY_F32_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LOG_LEGACY_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_LEGACY_F32_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_LEGACY_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_LEGACY_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LOG_LEGACY_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = nofpexcept V_LOG_LEGACY_F32_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = nofpexcept V_LOG_LEGACY_F32_e32 2, implicit $exec, implicit $mode @@ -1328,12 +1296,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: 
test_remat_v_sat_pk_u8_i16_e32 - ; GCN: renamable $vgpr0 = V_SAT_PK_U8_I16_e32 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_SAT_PK_U8_I16_e32 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SAT_PK_U8_I16_e32 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SAT_PK_U8_I16_e32_:%[0-9]+]]:vgpr_32 = V_SAT_PK_U8_I16_e32 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SAT_PK_U8_I16_e32_1:%[0-9]+]]:vgpr_32 = V_SAT_PK_U8_I16_e32 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SAT_PK_U8_I16_e32_2:%[0-9]+]]:vgpr_32 = V_SAT_PK_U8_I16_e32 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SAT_PK_U8_I16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAT_PK_U8_I16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAT_PK_U8_I16_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_SAT_PK_U8_I16_e32 1, implicit $exec, implicit $mode %1:vgpr_32 = V_SAT_PK_U8_I16_e32 2, implicit $exec, implicit $mode @@ -1349,12 +1317,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_accvgpr_mov_b32 - ; GCN: renamable $agpr0 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec - ; GCN-NEXT: renamable $agpr1 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr1 - ; GCN-NEXT: renamable $agpr0 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; GCN: [[V_ACCVGPR_MOV_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_MOV_B32_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec + ; GCN-NEXT: [[V_ACCVGPR_MOV_B32_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_MOV_B32_]] + ; GCN-NEXT: 
S_NOP 0, implicit [[V_ACCVGPR_MOV_B32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ACCVGPR_MOV_B32_2]] ; GCN-NEXT: S_ENDPGM 0 %0:agpr_32 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec %1:agpr_32 = V_ACCVGPR_MOV_B32 undef $agpr0, implicit $exec @@ -1372,16 +1340,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cndmask_b32_e32 - ; GCN: renamable $vgpr0 = V_CNDMASK_B32_e32 1, undef $vgpr0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CNDMASK_B32_e32 1, undef $vgpr0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e32 1, undef $vgpr0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 1, undef %1:vgpr_32, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_e32_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 1, undef %1:vgpr_32, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_e32_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 1, undef %1:vgpr_32, implicit $exec, implicit undef $vcc + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CNDMASK_B32_e32 1, undef %0:vgpr_32, implicit $exec, 
implicit undef $vcc %2:vgpr_32 = V_CNDMASK_B32_e32 1, undef %0:vgpr_32, implicit $exec, implicit undef $vcc @@ -1399,16 +1363,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cndmask_b32_sdwa - ; GCN: renamable $vgpr0 = V_CNDMASK_B32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CNDMASK_B32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_sdwa]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_sdwa1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_sdwa2]] ; GCN-NEXT: 
S_ENDPGM 0 %1:vgpr_32 = V_CNDMASK_B32_sdwa 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc %2:vgpr_32 = V_CNDMASK_B32_sdwa 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, implicit $exec, implicit undef $vcc @@ -1426,16 +1386,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_cndmask_b32_dpp - ; GCN: renamable $vgpr0 = V_CNDMASK_B32_dpp undef $vgpr0, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_CNDMASK_B32_dpp undef $vgpr1, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit undef $vcc - ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_dpp undef $vgpr0, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit undef $vcc - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CNDMASK_B32_dpp:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_dpp undef [[V_CNDMASK_B32_dpp]], 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_dpp1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_dpp undef [[V_CNDMASK_B32_dpp1]], 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit undef $vcc + ; GCN-NEXT: [[V_CNDMASK_B32_dpp2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_dpp undef [[V_CNDMASK_B32_dpp2]], 0, 
undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit undef $vcc + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_dpp]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_dpp1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_dpp2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CNDMASK_B32_dpp undef %1:vgpr_32, 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit undef $vcc %2:vgpr_32 = V_CNDMASK_B32_dpp undef %2:vgpr_32, 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit undef $vcc @@ -1451,12 +1407,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cndmask_b32_e64 - ; GCN: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef %0:sreg_64_xexec, implicit $exec %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 2, undef %0:sreg_64_xexec, implicit $exec @@ -1472,12 +1428,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_madmk_f32 - ; GCN: renamable $vgpr0 = nofpexcept V_MADMK_F32 1, 1, 
undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADMK_F32 2, 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MADMK_F32 3, 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MADMK_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F32 1, 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADMK_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F32 2, 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADMK_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F32 3, 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MADMK_F32 1, 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MADMK_F32 2, 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1493,12 +1449,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_ADD_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_ADD_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + 
; GCN-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_ADD_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_ADD_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1514,12 +1470,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_ADD_F32_e64 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F32_e64 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_ADD_F32_e64 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -1535,12 +1491,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: 
test_remat_v_add_f32_sdwa - ; GCN: renamable $vgpr0 = nofpexcept V_ADD_F32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_ADD_F32_sdwa 0, undef $vgpr0, 0, undef $vgpr0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_F32_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_sdwa 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_sdwa]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_sdwa1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_sdwa2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_ADD_F32_sdwa 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_ADD_F32_sdwa 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $mode @@ -1558,16 +1514,12 @@ machineFunctionInfo: body: | bb.0: ; GCN-LABEL: name: test_no_remat_v_add_f32_dpp - ; GCN: renamable $vgpr0 = nofpexcept V_ADD_F32_dpp undef $vgpr0, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F32_dpp undef 
$vgpr1, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_ADD_F32_dpp undef $vgpr0, 0, undef $vgpr0, 0, undef $vgpr0, 1, 15, 15, 10, implicit $exec, implicit $mode - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_F32_dpp:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_dpp undef [[V_ADD_F32_dpp]], 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_dpp1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_dpp undef [[V_ADD_F32_dpp1]], 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F32_dpp2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_dpp undef [[V_ADD_F32_dpp2]], 0, undef %1:vgpr_32, 0, undef %1:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_dpp]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_dpp1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F32_dpp2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_ADD_F32_dpp undef %1:vgpr_32, 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_ADD_F32_dpp undef %2:vgpr_32, 0, undef %0:vgpr_32, 0, undef %0:vgpr_32, 1, 15, 15, 10, implicit $exec, implicit $mode @@ -1583,12 +1535,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sub_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_SUB_F32_e32 1, undef 
$vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_SUB_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SUB_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SUB_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SUB_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_SUB_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_SUB_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1604,12 +1556,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_subrev_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_SUBREV_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_SUBREV_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SUBREV_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_SUBREV_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F32_e32 2, undef %1:vgpr_32, implicit 
$exec, implicit $mode + ; GCN-NEXT: [[V_SUBREV_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_SUBREV_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_SUBREV_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1625,12 +1577,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_legacy_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MUL_LEGACY_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_LEGACY_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MUL_LEGACY_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_LEGACY_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_LEGACY_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_LEGACY_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_LEGACY_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_LEGACY_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_LEGACY_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LEGACY_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LEGACY_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LEGACY_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MUL_LEGACY_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MUL_LEGACY_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1646,12 +1598,12 @@ 
tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MUL_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MUL_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MUL_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MUL_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1667,12 +1619,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_i32_i24_e32 - ; GCN: renamable $vgpr0 = V_MUL_I32_I24_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_I32_I24_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_I32_I24_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_I32_I24_e32_1:%[0-9]+]]:vgpr_32 = 
V_MUL_I32_I24_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_I32_I24_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_I32_I24_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_I32_I24_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_I32_I24_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_I32_I24_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_I32_I24_e32 2, undef %0:vgpr_32, implicit $exec @@ -1688,12 +1640,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_hi_i32_i24_e32 - ; GCN: renamable $vgpr0 = V_MUL_HI_I32_I24_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_HI_I32_I24_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_HI_I32_I24_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_HI_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_I24_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_I32_I24_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_I24_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_I32_I24_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_I24_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_I24_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_I24_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_I24_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_HI_I32_I24_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_HI_I32_I24_e32 2, undef %0:vgpr_32, implicit $exec @@ -1709,12 +1661,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_u32_u24_e32 - ; GCN: renamable $vgpr0 = V_MUL_U32_U24_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_U32_U24_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, 
implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_U32_U24_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_U32_U24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_U32_U24_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_U32_U24_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_U32_U24_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_U32_U24_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_U32_U24_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_U32_U24_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_U32_U24_e32 2, undef %0:vgpr_32, implicit $exec @@ -1730,12 +1682,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_hi_u32_u24_e32 - ; GCN: renamable $vgpr0 = V_MUL_HI_U32_U24_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_HI_U32_U24_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_HI_U32_U24_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_HI_U32_U24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_U24_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_U32_U24_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_U24_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_U32_U24_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_U24_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_U24_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_U24_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_U24_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_HI_U32_U24_e32 1, undef 
%0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_HI_U32_U24_e32 2, undef %0:vgpr_32, implicit $exec @@ -1751,12 +1703,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MIN_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MIN_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MIN_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MIN_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1772,12 +1724,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed 
renamable $vgpr0 + ; GCN: [[V_MAX_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MAX_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MAX_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -1793,12 +1745,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min_i32_e32 - ; GCN: renamable $vgpr0 = V_MIN_I32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MIN_I32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN_I32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN_I32_e32_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN_I32_e32_2:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MIN_I32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_I32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1814,12 +1766,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max_i32_e32 - ; GCN: renamable $vgpr0 = 
V_MAX_I32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAX_I32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAX_I32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX_I32_e32_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX_I32_e32_2:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAX_I32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX_I32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1835,12 +1787,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min_u32_e32 - ; GCN: renamable $vgpr0 = V_MIN_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MIN_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN_U32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN_U32_e32_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN_U32_e32_2:%[0-9]+]]:vgpr_32 = V_MIN_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MIN_U32_e32 1, undef 
%0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1856,12 +1808,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max_u32_e32 - ; GCN: renamable $vgpr0 = V_MAX_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAX_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAX_U32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX_U32_e32_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX_U32_e32_2:%[0-9]+]]:vgpr_32 = V_MAX_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAX_U32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1877,12 +1829,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshrrev_b32_e32 - ; GCN: renamable $vgpr0 = V_LSHRREV_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LSHRREV_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B32_e32_2:%[0-9]+]]:vgpr_32 = 
V_LSHRREV_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHRREV_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHRREV_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1898,12 +1850,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshlrev_b32_e32 - ; GCN: renamable $vgpr0 = V_LSHLREV_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHLREV_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHLREV_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1919,12 +1871,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ashrrev_i32_e32 - ; GCN: renamable $vgpr0 = V_ASHRREV_I32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ASHRREV_I32_e32 3, undef $vgpr0, 
implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ASHRREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I32_e32_1:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I32_e32_2:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ASHRREV_I32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ASHRREV_I32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1940,12 +1892,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_and_b32_e32 - ; GCN: renamable $vgpr0 = V_AND_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_AND_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_AND_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_AND_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1961,12 +1913,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_or_b32_e32 - ; GCN: renamable $vgpr0 = V_OR_B32_e32 1, undef $vgpr0, 
implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_OR_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_OR_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_OR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_OR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_OR_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_OR_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_OR_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_OR_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_OR_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -1982,12 +1934,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_xor_b32_e32 - ; GCN: renamable $vgpr0 = V_XOR_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_XOR_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_XOR_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XOR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XOR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_XOR_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XOR_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XOR_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_XOR_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = 
V_XOR_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2003,12 +1955,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_madak_f32 - ; GCN: renamable $vgpr0 = nofpexcept V_MADAK_F32 1, undef $vgpr0, 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADAK_F32 2, undef $vgpr0, 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MADAK_F32 3, undef $vgpr0, 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MADAK_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F32 1, undef %1:vgpr_32, 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADAK_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F32 2, undef %1:vgpr_32, 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADAK_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F32 3, undef %1:vgpr_32, 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MADAK_F32 1, undef %0:vgpr_32, 1, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MADAK_F32 2, undef %0:vgpr_32, 2, implicit $exec, implicit $mode @@ -2024,12 +1976,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_u32_e32 - ; GCN: renamable $vgpr0 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ADD_U32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ADD_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2045,12 +1997,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sub_u32_e32 - ; GCN: renamable $vgpr0 = V_SUB_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SUB_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_SUB_U32_e32_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SUB_U32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_SUB_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2066,12 +2018,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_subrev_u32_e32 - ; GCN: renamable $vgpr0 = V_SUBREV_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: 
renamable $vgpr0 = V_SUBREV_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SUBREV_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_SUBREV_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_SUBREV_U32_e32_2:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SUBREV_U32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_SUBREV_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2087,12 +2039,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bfm_b32_e32 - ; GCN: renamable $vgpr0 = V_BFM_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_BFM_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BFM_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BFM_B32_e32_:%[0-9]+]]:vgpr_32 = V_BFM_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFM_B32_e32_1:%[0-9]+]]:vgpr_32 = V_BFM_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFM_B32_e32_2:%[0-9]+]]:vgpr_32 = V_BFM_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_BFM_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFM_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFM_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_BFM_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_BFM_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2108,12 +2060,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bcnt_u32_b32_e32 - ; GCN: 
renamable $vgpr0 = V_BCNT_U32_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_BCNT_U32_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BCNT_U32_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BCNT_U32_B32_e32_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BCNT_U32_B32_e32_1:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BCNT_U32_B32_e32_2:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_BCNT_U32_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BCNT_U32_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BCNT_U32_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_BCNT_U32_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_BCNT_U32_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2129,12 +2081,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mbcnt_lo_u32_b32_e32 - ; GCN: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MBCNT_LO_U32_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MBCNT_LO_U32_B32_e32_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit 
[[V_MBCNT_LO_U32_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MBCNT_LO_U32_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MBCNT_LO_U32_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MBCNT_LO_U32_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MBCNT_LO_U32_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2150,12 +2102,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mbcnt_hi_u32_b32_e32 - ; GCN: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MBCNT_HI_U32_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MBCNT_HI_U32_B32_e32_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MBCNT_HI_U32_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MBCNT_HI_U32_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MBCNT_HI_U32_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MBCNT_HI_U32_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MBCNT_HI_U32_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2171,12 +2123,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ldexp_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_LDEXP_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; 
GCN-NEXT: renamable $vgpr0 = nofpexcept V_LDEXP_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LDEXP_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LDEXP_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LDEXP_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_LDEXP_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_LDEXP_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -2192,12 +2144,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pknorm_i16_f32_e32 - ; GCN: renamable $vgpr0 = V_CVT_PKNORM_I16_F32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_CVT_PKNORM_I16_F32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_PKNORM_I16_F32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PKNORM_I16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_I16_F32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PKNORM_I16_F32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_I16_F32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PKNORM_I16_F32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_I16_F32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKNORM_I16_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKNORM_I16_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit 
[[V_CVT_PKNORM_I16_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_PKNORM_I16_F32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_CVT_PKNORM_I16_F32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2213,12 +2165,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pknorm_u16_f32_e32 - ; GCN: renamable $vgpr0 = V_CVT_PKNORM_U16_F32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_CVT_PKNORM_U16_F32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_PKNORM_U16_F32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PKNORM_U16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_U16_F32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PKNORM_U16_F32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_U16_F32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PKNORM_U16_F32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_PKNORM_U16_F32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKNORM_U16_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKNORM_U16_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKNORM_U16_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_PKNORM_U16_F32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_CVT_PKNORM_U16_F32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2234,12 +2186,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pkrtz_f16_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 
3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PKRTZ_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_PKRTZ_F16_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_PKRTZ_F16_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKRTZ_F16_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKRTZ_F16_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PKRTZ_F16_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -2255,12 +2207,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pk_u16_u32_e32 - ; GCN: renamable $vgpr0 = V_CVT_PK_U16_U32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_CVT_PK_U16_U32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_PK_U16_U32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PK_U16_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_PK_U16_U32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PK_U16_U32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_PK_U16_U32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PK_U16_U32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_PK_U16_U32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_U16_U32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_U16_U32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit 
[[V_CVT_PK_U16_U32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_PK_U16_U32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_CVT_PK_U16_U32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2276,12 +2228,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pk_i16_i32_e32 - ; GCN: renamable $vgpr0 = V_CVT_PK_I16_I32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_CVT_PK_I16_I32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CVT_PK_I16_I32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PK_I16_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_PK_I16_I32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PK_I16_I32_e32_1:%[0-9]+]]:vgpr_32 = V_CVT_PK_I16_I32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_CVT_PK_I16_I32_e32_2:%[0-9]+]]:vgpr_32 = V_CVT_PK_I16_I32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_I16_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_I16_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_I16_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CVT_PK_I16_I32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_CVT_PK_I16_I32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2297,12 +2249,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min_legacy_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MIN_LEGACY_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_LEGACY_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MIN_LEGACY_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit 
killed renamable $vgpr0 + ; GCN: [[V_MIN_LEGACY_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_LEGACY_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_LEGACY_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_LEGACY_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_LEGACY_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_LEGACY_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_LEGACY_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_LEGACY_F32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_LEGACY_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MIN_LEGACY_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MIN_LEGACY_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -2318,12 +2270,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max_legacy_f32_e32 - ; GCN: renamable $vgpr0 = nofpexcept V_MAX_LEGACY_F32_e32 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_LEGACY_F32_e32 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MAX_LEGACY_F32_e32 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAX_LEGACY_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_LEGACY_F32_e32 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_LEGACY_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_LEGACY_F32_e32 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_LEGACY_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_LEGACY_F32_e32 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_LEGACY_F32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_LEGACY_F32_e32_1]] + ; 
GCN-NEXT: S_NOP 0, implicit [[V_MAX_LEGACY_F32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MAX_LEGACY_F32_e32 1, undef %0:vgpr_32, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MAX_LEGACY_F32_e32 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -2339,12 +2291,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshr_b32_e32 - ; GCN: renamable $vgpr0 = V_LSHR_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHR_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LSHR_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHR_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHR_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_LSHR_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_LSHR_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHR_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHR_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHR_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHR_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHR_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2360,12 +2312,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshl_b32_e32 - ; GCN: renamable $vgpr0 = V_LSHL_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHL_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LSHL_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHL_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHL_B32_e32 1, undef %1:vgpr_32, implicit 
$exec + ; GCN-NEXT: [[V_LSHL_B32_e32_1:%[0-9]+]]:vgpr_32 = V_LSHL_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHL_B32_e32_2:%[0-9]+]]:vgpr_32 = V_LSHL_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHL_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHL_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2381,12 +2333,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ashr_i32_e32 - ; GCN: renamable $vgpr0 = V_ASHR_I32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ASHR_I32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ASHR_I32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ASHR_I32_e32_:%[0-9]+]]:vgpr_32 = V_ASHR_I32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ASHR_I32_e32_1:%[0-9]+]]:vgpr_32 = V_ASHR_I32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ASHR_I32_e32_2:%[0-9]+]]:vgpr_32 = V_ASHR_I32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHR_I32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHR_I32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHR_I32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ASHR_I32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ASHR_I32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2402,12 +2354,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_xnor_b32_e32 - ; GCN: renamable $vgpr0 = V_XNOR_B32_e32 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_XNOR_B32_e32 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit 
killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_XNOR_B32_e32 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_XNOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XNOR_B32_e32 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XNOR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_XNOR_B32_e32 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XNOR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_XNOR_B32_e32 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_XNOR_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XNOR_B32_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XNOR_B32_e32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_XNOR_B32_e32 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_XNOR_B32_e32 2, undef %0:vgpr_32, implicit $exec @@ -2423,12 +2375,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fmamk_f32 - ; GCN: renamable $vgpr0 = nofpexcept V_FMAMK_F32 1, 1, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMAMK_F32 2, 2, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FMAMK_F32 3, 3, undef $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F32 1, 1, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAMK_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F32 2, 2, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAMK_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F32 3, 3, undef %1:vgpr_32, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_FMAMK_F32 1, 1, undef %0:vgpr_32, implicit $exec, implicit 
$mode %2:vgpr_32 = nofpexcept V_FMAMK_F32 2, 2, undef %0:vgpr_32, implicit $exec, implicit $mode @@ -2444,12 +2396,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fmaak_f32 - ; GCN: renamable $vgpr0 = nofpexcept V_FMAAK_F32 1, undef $vgpr0, 1, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMAAK_F32 2, undef $vgpr0, 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FMAAK_F32 3, undef $vgpr0, 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAAK_F32 1, undef %1:vgpr_32, 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAAK_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAAK_F32 2, undef %1:vgpr_32, 2, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAAK_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAAK_F32 3, undef %1:vgpr_32, 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAAK_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAAK_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAAK_F32_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_FMAAK_F32 1, undef %0:vgpr_32, 1, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_FMAAK_F32 2, undef %0:vgpr_32, 2, implicit $exec, implicit $mode @@ -2465,12 +2417,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mad_legacy_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_MAD_LEGACY_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAD_LEGACY_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MAD_LEGACY_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, 
implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAD_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_LEGACY_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAD_LEGACY_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_LEGACY_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAD_LEGACY_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_LEGACY_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_LEGACY_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_LEGACY_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_LEGACY_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MAD_LEGACY_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MAD_LEGACY_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2486,12 +2438,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mad_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_MAD_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAD_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_MAD_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 
0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_MAD_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MAD_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2507,12 +2459,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fma_legacy_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_FMA_LEGACY_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMA_LEGACY_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FMA_LEGACY_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FMA_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_LEGACY_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMA_LEGACY_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_LEGACY_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMA_LEGACY_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_LEGACY_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_LEGACY_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_LEGACY_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_LEGACY_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_FMA_LEGACY_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_FMA_LEGACY_F32_e64 0, 2, 0, 2, 0, undef 
%0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2528,12 +2480,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fma_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_FMA_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMA_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_FMA_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMA_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2549,12 +2501,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mad_i32_i24_e64 - ; GCN: renamable $vgpr0 = V_MAD_I32_I24_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAD_I32_I24_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 3, 3, undef $vgpr0, 0, implicit 
$exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAD_I32_I24_e64_:%[0-9]+]]:vgpr_32 = V_MAD_I32_I24_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MAD_I32_I24_e64_1:%[0-9]+]]:vgpr_32 = V_MAD_I32_I24_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MAD_I32_I24_e64_2:%[0-9]+]]:vgpr_32 = V_MAD_I32_I24_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_I32_I24_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_I32_I24_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_I32_I24_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAD_I32_I24_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_MAD_I32_I24_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -2570,12 +2522,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mad_u32_u24_e64 - ; GCN: renamable $vgpr0 = V_MAD_U32_U24_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAD_U32_U24_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAD_U32_U24_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAD_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MAD_U32_U24_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MAD_U32_U24_e64_1:%[0-9]+]]:vgpr_32 = V_MAD_U32_U24_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MAD_U32_U24_e64_2:%[0-9]+]]:vgpr_32 = V_MAD_U32_U24_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_U32_U24_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_U32_U24_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_U32_U24_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAD_U32_U24_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_MAD_U32_U24_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -2591,12 +2543,12 @@ 
tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lerp_u8_e64 - ; GCN: renamable $vgpr0 = V_LERP_U8_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LERP_U8_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LERP_U8_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LERP_U8_e64_:%[0-9]+]]:vgpr_32 = V_LERP_U8_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LERP_U8_e64_1:%[0-9]+]]:vgpr_32 = V_LERP_U8_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LERP_U8_e64_2:%[0-9]+]]:vgpr_32 = V_LERP_U8_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LERP_U8_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LERP_U8_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LERP_U8_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LERP_U8_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LERP_U8_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2612,12 +2564,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_fma_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, 1, 0, 1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_FMA_F64_e64 0, 2, 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, 3, 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_FMA_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMA_F64_e64 0, 1, 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: 
[[V_FMA_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMA_F64_e64 0, 2, 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMA_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMA_F64_e64 0, 3, 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_FMA_F64_e64 0, 1, 0, 1, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_FMA_F64_e64 0, 2, 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -2633,12 +2585,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_ADD_F64_e64 0, 1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_ADD_F64_e64 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_ADD_F64_e64 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, 
implicit [[V_ADD_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, 1, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -2654,12 +2606,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_MUL_F64_e64 0, 1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_MUL_F64_e64 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_MUL_F64_e64 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MUL_F64_e64 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MUL_F64_e64 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MUL_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MUL_F64_e64 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_MUL_F64_e64 0, 1, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_MUL_F64_e64 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -2675,12 +2627,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_MIN_F64_e64 0, 
1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_MIN_F64_e64 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_MIN_F64_e64 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_MIN_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MIN_F64_e64 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MIN_F64_e64 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MIN_F64_e64 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_MIN_F64_e64 0, 1, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_MIN_F64_e64 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -2696,12 +2648,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_MAX_F64_e64 0, 1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_MAX_F64_e64 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_MAX_F64_e64 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; 
GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MAX_F64_e64 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MAX_F64_e64 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_MAX_F64_e64 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_MAX_F64_e64 0, 1, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_MAX_F64_e64 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -2717,12 +2669,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_lo_u32_e64 - ; GCN: renamable $vgpr0 = V_MUL_LO_U32_e64 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U32_e64 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_LO_U32_e64 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_LO_U32_e64 1, undef 
%0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_LO_U32_e64 2, undef %0:vgpr_32, implicit $exec @@ -2738,12 +2690,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_hi_u32_e64 - ; GCN: renamable $vgpr0 = V_MUL_HI_U32_e64 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_HI_U32_e64 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_HI_U32_e64 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_HI_U32_e64 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_HI_U32_e64 2, undef %0:vgpr_32, implicit $exec @@ -2759,12 +2711,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_lo_i32_e64 - ; GCN: renamable $vgpr0 = V_MUL_LO_I32_e64 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_I32_e64 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_LO_I32_e64 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_LO_I32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_I32_e64 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_LO_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_I32_e64 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: 
[[V_MUL_LO_I32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_I32_e64 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_LO_I32_e64 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_LO_I32_e64 2, undef %0:vgpr_32, implicit $exec @@ -2780,12 +2732,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_mul_hi_i32_e64 - ; GCN: renamable $vgpr0 = V_MUL_HI_I32_e64 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MUL_HI_I32_e64 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MUL_HI_I32_e64 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MUL_HI_I32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_e64 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_e64 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_I32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_I32_e64 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_HI_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MUL_HI_I32_e64 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_HI_I32_e64 2, undef %0:vgpr_32, implicit $exec @@ -2801,12 +2753,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cubeid_f32_e64 - ; GCN: renamable $vgpr0 = V_CUBEID_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CUBEID_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed 
renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CUBEID_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CUBEID_F32_e64_:%[0-9]+]]:vgpr_32 = V_CUBEID_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBEID_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CUBEID_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBEID_F32_e64_2:%[0-9]+]]:vgpr_32 = V_CUBEID_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEID_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEID_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEID_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CUBEID_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CUBEID_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2822,12 +2774,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cubesc_f32_e64 - ; GCN: renamable $vgpr0 = V_CUBESC_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CUBESC_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CUBESC_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CUBESC_F32_e64_:%[0-9]+]]:vgpr_32 = V_CUBESC_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBESC_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CUBESC_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBESC_F32_e64_2:%[0-9]+]]:vgpr_32 = V_CUBESC_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 
0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBESC_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBESC_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBESC_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CUBESC_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CUBESC_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2843,12 +2795,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cubetc_f32_e64 - ; GCN: renamable $vgpr0 = V_CUBETC_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CUBETC_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CUBETC_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CUBETC_F32_e64_:%[0-9]+]]:vgpr_32 = V_CUBETC_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBETC_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CUBETC_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBETC_F32_e64_2:%[0-9]+]]:vgpr_32 = V_CUBETC_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBETC_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBETC_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBETC_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CUBETC_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CUBETC_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2864,12 +2816,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cubema_f32_e64 - ; GCN: renamable $vgpr0 = 
V_CUBEMA_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_CUBEMA_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_CUBEMA_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CUBEMA_F32_e64_:%[0-9]+]]:vgpr_32 = V_CUBEMA_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBEMA_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CUBEMA_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CUBEMA_F32_e64_2:%[0-9]+]]:vgpr_32 = V_CUBEMA_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEMA_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEMA_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CUBEMA_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_CUBEMA_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_CUBEMA_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -2885,12 +2837,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bfe_u32_e64 - ; GCN: renamable $vgpr0 = V_BFE_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_BFE_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BFE_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFE_U32_e64_1:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 2, 2, undef %1:vgpr_32, 
implicit $exec + ; GCN-NEXT: [[V_BFE_U32_e64_2:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_BFE_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_BFE_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2906,12 +2858,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bfe_i32_e64 - ; GCN: renamable $vgpr0 = V_BFE_I32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_BFE_I32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BFE_I32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFE_I32_e64_1:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFE_I32_e64_2:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFE_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_BFE_I32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_BFE_I32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2927,12 +2879,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_bfi_b32_e64 - ; GCN: renamable $vgpr0 = V_BFI_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_BFI_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_BFI_B32_e64 3, 3, undef 
$vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFI_B32_e64_1:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_BFI_B32_e64_2:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_BFI_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFI_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_BFI_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_BFI_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_BFI_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2948,12 +2900,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_alignbit_b32_e64 - ; GCN: renamable $vgpr0 = V_ALIGNBIT_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ALIGNBIT_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ALIGNBIT_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_1:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_2:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBIT_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBIT_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBIT_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ALIGNBIT_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ALIGNBIT_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2969,12 +2921,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: 
test_remat_v_alignbyte_b32_e64 - ; GCN: renamable $vgpr0 = V_ALIGNBYTE_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ALIGNBYTE_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ALIGNBYTE_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ALIGNBYTE_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBYTE_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ALIGNBYTE_B32_e64_1:%[0-9]+]]:vgpr_32 = V_ALIGNBYTE_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ALIGNBYTE_B32_e64_2:%[0-9]+]]:vgpr_32 = V_ALIGNBYTE_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBYTE_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBYTE_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ALIGNBYTE_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ALIGNBYTE_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ALIGNBYTE_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -2990,12 +2942,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min3_i32_e64 - ; GCN: renamable $vgpr0 = V_MIN3_I32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MIN3_I32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MIN3_I32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN3_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN3_I32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN3_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN3_I32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN3_I32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN3_I32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, 
implicit [[V_MIN3_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MIN3_I32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN3_I32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3011,12 +2963,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min3_u32_e64 - ; GCN: renamable $vgpr0 = V_MIN3_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MIN3_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MIN3_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN3_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN3_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN3_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MIN3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN3_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MIN3_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN3_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3032,12 +2984,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max3_i32_e64 - ; GCN: renamable $vgpr0 = V_MAX3_I32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAX3_I32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAX3_I32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: 
[[V_MAX3_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX3_I32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX3_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX3_I32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX3_I32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX3_I32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAX3_I32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX3_I32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3053,12 +3005,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max3_u32_e64 - ; GCN: renamable $vgpr0 = V_MAX3_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MAX3_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAX3_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAX3_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX3_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX3_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MAX3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX3_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAX3_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX3_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3074,12 +3026,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_med3_i32_e64 - ; GCN: renamable $vgpr0 = V_MED3_I32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = 
V_MED3_I32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MED3_I32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MED3_I32_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MED3_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MED3_I32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MED3_I32_e64_2:%[0-9]+]]:vgpr_32 = V_MED3_I32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MED3_I32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MED3_I32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3095,12 +3047,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_med3_u32_e64 - ; GCN: renamable $vgpr0 = V_MED3_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MED3_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MED3_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MED3_U32_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MED3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MED3_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_MED3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MED3_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MED3_U32_e64 1, 1, undef %0:vgpr_32, 
implicit $exec %2:vgpr_32 = V_MED3_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3116,12 +3068,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_min3_f32_e64 - ; GCN: renamable $vgpr0 = V_MIN3_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_MIN3_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MIN3_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MIN3_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN3_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN3_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN3_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MIN3_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN3_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN3_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MIN3_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_MIN3_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -3137,12 +3089,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_max3_f32_e64 - ; GCN: renamable $vgpr0 = V_MAX3_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_MAX3_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MAX3_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 
0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MAX3_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX3_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX3_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX3_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MAX3_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX3_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX3_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MAX3_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_MAX3_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -3158,12 +3110,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_med3_f32_e64 - ; GCN: renamable $vgpr0 = V_MED3_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = V_MED3_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MED3_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MED3_F32_e64_:%[0-9]+]]:vgpr_32 = V_MED3_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MED3_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MED3_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MED3_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MED3_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit 
[[V_MED3_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MED3_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MED3_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vgpr_32 = V_MED3_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -3179,12 +3131,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sad_u8_e64 - ; GCN: renamable $vgpr0 = V_SAD_U8_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SAD_U8_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SAD_U8_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SAD_U8_e64_:%[0-9]+]]:vgpr_32 = V_SAD_U8_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U8_e64_1:%[0-9]+]]:vgpr_32 = V_SAD_U8_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U8_e64_2:%[0-9]+]]:vgpr_32 = V_SAD_U8_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U8_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U8_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U8_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SAD_U8_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SAD_U8_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -3200,12 +3152,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sad_hi_u8_e64 - ; GCN: renamable $vgpr0 = V_SAD_HI_U8_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SAD_HI_U8_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SAD_HI_U8_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: 
[[V_SAD_HI_U8_e64_:%[0-9]+]]:vgpr_32 = V_SAD_HI_U8_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_HI_U8_e64_1:%[0-9]+]]:vgpr_32 = V_SAD_HI_U8_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_HI_U8_e64_2:%[0-9]+]]:vgpr_32 = V_SAD_HI_U8_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_HI_U8_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_HI_U8_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_HI_U8_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SAD_HI_U8_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SAD_HI_U8_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -3221,12 +3173,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sad_u16_e64 - ; GCN: renamable $vgpr0 = V_SAD_U16_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SAD_U16_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SAD_U16_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SAD_U16_e64_:%[0-9]+]]:vgpr_32 = V_SAD_U16_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U16_e64_1:%[0-9]+]]:vgpr_32 = V_SAD_U16_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U16_e64_2:%[0-9]+]]:vgpr_32 = V_SAD_U16_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U16_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SAD_U16_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SAD_U16_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -3242,12 +3194,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sad_u32_e64 - ; GCN: renamable $vgpr0 = V_SAD_U32_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; 
GCN-NEXT: renamable $vgpr1 = V_SAD_U32_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SAD_U32_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_SAD_U32_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SAD_U32_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SAD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SAD_U32_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SAD_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SAD_U32_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SAD_U32_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -3263,12 +3215,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_cvt_pk_u8_f32_e64 - ; GCN: renamable $vgpr0 = nofpexcept V_CVT_PK_U8_F32_e64 0, 1, 0, 1, 0, undef $vgpr0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_CVT_PK_U8_F32_e64 0, 2, 0, 2, 0, undef $vgpr0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = nofpexcept V_CVT_PK_U8_F32_e64 0, 3, 0, 3, 0, undef $vgpr0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_CVT_PK_U8_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PK_U8_F32_e64 0, 1, 0, 1, 0, undef %1:vgpr_32, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_PK_U8_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PK_U8_F32_e64 0, 2, 0, 2, 0, undef %1:vgpr_32, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_CVT_PK_U8_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CVT_PK_U8_F32_e64 0, 3, 0, 3, 0, undef %1:vgpr_32, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_U8_F32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_U8_F32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_CVT_PK_U8_F32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = nofpexcept V_CVT_PK_U8_F32_e64 0, 1, 0, 1, 0, undef %0:vgpr_32, 0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_CVT_PK_U8_F32_e64 0, 2, 0, 2, 0, undef %0:vgpr_32, 0, implicit $exec, implicit $mode @@ -3284,12 +3236,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_div_fixup_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_DIV_FIXUP_F64_e64 0, 1, 0, 1, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_DIV_FIXUP_F64_e64 0, 2, 0, 2, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_DIV_FIXUP_F64_e64 0, 3, 0, 3, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_DIV_FIXUP_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_DIV_FIXUP_F64_e64 0, 1, 0, 1, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_DIV_FIXUP_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_DIV_FIXUP_F64_e64 0, 2, 0, 2, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_DIV_FIXUP_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_DIV_FIXUP_F64_e64 0, 3, 0, 3, 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_DIV_FIXUP_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_DIV_FIXUP_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_DIV_FIXUP_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_DIV_FIXUP_F64_e64 0, 1, 0, 1, 0, 
undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_DIV_FIXUP_F64_e64 0, 2, 0, 2, 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode @@ -3305,12 +3257,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ldexp_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_LDEXP_F64_e64 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_LDEXP_F64_e64 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_LDEXP_F64_e64 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_LDEXP_F64_e64 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LDEXP_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_LDEXP_F64_e64 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_LDEXP_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_LDEXP_F64_e64 0, 3, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_LDEXP_F64_e64 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_LDEXP_F64_e64 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -3326,12 +3278,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_msad_u8_e64 - ; GCN: renamable $vgpr0 = V_MSAD_U8_e64 1, 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MSAD_U8_e64 2, 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: 
S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_MSAD_U8_e64 3, 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_MSAD_U8_e64_:%[0-9]+]]:vgpr_32 = V_MSAD_U8_e64 1, 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MSAD_U8_e64_1:%[0-9]+]]:vgpr_32 = V_MSAD_U8_e64 2, 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_MSAD_U8_e64_2:%[0-9]+]]:vgpr_32 = V_MSAD_U8_e64 3, 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MSAD_U8_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MSAD_U8_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MSAD_U8_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_MSAD_U8_e64 1, 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_MSAD_U8_e64 2, 2, undef %0:vgpr_32, 0, implicit $exec @@ -3347,12 +3299,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_trig_preop_f64_e64 - ; GCN: renamable $vgpr0_vgpr1 = nofpexcept V_TRIG_PREOP_F64_e64 0, 1, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_TRIG_PREOP_F64_e64 0, 2, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_TRIG_PREOP_F64_e64 0, 3, 0, undef $vgpr0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_TRIG_PREOP_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_TRIG_PREOP_F64_e64 0, 1, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_TRIG_PREOP_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_TRIG_PREOP_F64_e64 0, 2, 0, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_TRIG_PREOP_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_TRIG_PREOP_F64_e64 0, 3, 0, undef 
%1:vgpr_32, 0, 0, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_TRIG_PREOP_F64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_TRIG_PREOP_F64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_TRIG_PREOP_F64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = nofpexcept V_TRIG_PREOP_F64_e64 0, 1, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode %2:vreg_64_align2 = nofpexcept V_TRIG_PREOP_F64_e64 0, 2, 0, undef %0:vgpr_32, 0, 0, implicit $exec, implicit $mode @@ -3368,12 +3320,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshlrev_b64_e64 - ; GCN: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 1, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 2, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 1, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 2, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B64_e64_2:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 3, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = V_LSHLREV_B64_e64 1, undef %0:vreg_64_align2, implicit $exec %2:vreg_64_align2 = V_LSHLREV_B64_e64 2, undef %0:vreg_64_align2, implicit $exec @@ -3389,12 +3341,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshrrev_b64_e64 - ; GCN: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 1, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: renamable 
$vgpr2_vgpr3 = V_LSHRREV_B64_e64 2, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 3, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 1, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 2, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B64_e64_2:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 3, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = V_LSHRREV_B64_e64 1, undef %0:vreg_64_align2, implicit $exec %2:vreg_64_align2 = V_LSHRREV_B64_e64 2, undef %0:vreg_64_align2, implicit $exec @@ -3410,12 +3362,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_ashrrev_i64_e64 - ; GCN: renamable $vgpr0_vgpr1 = V_ASHRREV_I64_e64 1, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_ASHRREV_I64_e64 2, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_ASHRREV_I64_e64 3, undef $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN: [[V_ASHRREV_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_ASHRREV_I64_e64 1, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I64_e64_1:%[0-9]+]]:vreg_64_align2 = V_ASHRREV_I64_e64 2, undef %1:vreg_64_align2, implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I64_e64_2:%[0-9]+]]:vreg_64_align2 = V_ASHRREV_I64_e64 3, undef %1:vreg_64_align2, 
implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I64_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I64_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I64_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vreg_64_align2 = V_ASHRREV_I64_e64 1, undef %0:vreg_64_align2, implicit $exec %2:vreg_64_align2 = V_ASHRREV_I64_e64 2, undef %0:vreg_64_align2, implicit $exec @@ -3431,12 +3383,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_perm_b32_e64 - ; GCN: renamable $vgpr0 = V_PERM_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_PERM_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_PERM_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_PERM_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_PERM_B32_e64_2:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PERM_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PERM_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PERM_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_PERM_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_PERM_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3452,12 +3404,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add3_u32_e64 - ; GCN: renamable $vgpr0 = V_ADD3_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ADD3_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ADD3_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 
0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD3_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD3_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD3_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ADD3_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ADD3_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3473,12 +3425,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_and_or_b32_e64 - ; GCN: renamable $vgpr0 = V_AND_OR_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_AND_OR_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_AND_OR_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_AND_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_AND_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_OR_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_OR_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_AND_OR_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_AND_OR_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_AND_OR_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3494,12 +3446,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_or3_b32_e64 - ; GCN: renamable $vgpr0 = V_OR3_B32_e64 1, 1, 
undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_OR3_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_OR3_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_OR3_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_OR3_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_OR3_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_OR3_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_OR3_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_OR3_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_OR3_B32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3515,12 +3467,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_xad_u32_e64 - ; GCN: renamable $vgpr0 = V_XAD_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_XAD_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_XAD_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_XAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XAD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_XAD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_XAD_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XAD_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_XAD_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = 
V_XAD_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_XAD_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3536,12 +3488,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_i32_e64 - ; GCN: renamable $vgpr0 = V_ADD_I32_e64 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ADD_I32_e64 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ADD_I32_e64 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e64 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_I32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_I32_e64 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_I32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_I32_e64 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ADD_I32_e64 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_ADD_I32_e64 2, undef %0:vgpr_32, 0, implicit $exec @@ -3557,12 +3509,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_add_lshl_u32_e64 - ; GCN: renamable $vgpr0 = V_ADD_LSHL_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_ADD_LSHL_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_ADD_LSHL_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_ADD_LSHL_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_LSHL_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_ADD_LSHL_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_LSHL_U32_e64 2, 2, undef %1:vgpr_32, 
implicit $exec + ; GCN-NEXT: [[V_ADD_LSHL_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_LSHL_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_LSHL_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_LSHL_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_LSHL_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_ADD_LSHL_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ADD_LSHL_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3578,12 +3530,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_sub_i32_e64 - ; GCN: renamable $vgpr0 = V_SUB_I32_e64 1, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_SUB_I32_e64 2, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_SUB_I32_e64 3, undef $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 1, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SUB_I32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 2, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: [[V_SUB_I32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 3, undef %1:vgpr_32, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_I32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_I32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_I32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_SUB_I32_e64 1, undef %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SUB_I32_e64 2, undef %0:vgpr_32, 0, implicit $exec @@ -3599,12 +3551,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshl_add_u32_e64 - ; GCN: renamable $vgpr0 = V_LSHL_ADD_U32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHL_ADD_U32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; 
GCN-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHL_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHL_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_ADD_U32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_ADD_U32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_ADD_U32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHL_ADD_U32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHL_ADD_U32_e64 2, 2, undef %0:vgpr_32, implicit $exec @@ -3620,12 +3572,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_v_lshl_or_b32_e64 - ; GCN: renamable $vgpr0 = V_LSHL_OR_B32_e64 1, 1, undef $vgpr0, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_LSHL_OR_B32_e64 2, 2, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr0 = V_LSHL_OR_B32_e64 3, 3, undef $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; GCN: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 1, 1, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 2, 2, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 3, 3, undef %1:vgpr_32, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_OR_B32_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_OR_B32_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHL_OR_B32_e64_2]] ; GCN-NEXT: S_ENDPGM 0 %1:vgpr_32 = V_LSHL_OR_B32_e64 1, 1, undef %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHL_OR_B32_e64 2, 2, undef %0:vgpr_32, 
implicit $exec @@ -3645,13 +3597,14 @@ body: | ; GCN-LABEL: name: test_remat_v_lshlrev_b16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LSHLREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B16_e32_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B16_e32_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_LSHLREV_B16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHLREV_B16_e32 2, %0:vgpr_32, implicit $exec @@ -3670,13 +3623,14 @@ body: | ; GCN-LABEL: name: test_remat_v_lshlrev_b16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHLREV_B16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = 
V_LSHLREV_B16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B16_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHLREV_B16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_LSHLREV_B16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHLREV_B16_e64 2, %0:vgpr_32, implicit $exec @@ -3696,13 +3650,14 @@ body: | ; GCN-LABEL: name: test_remat_v_lshrrev_b16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B16_e32_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B16_e32_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_LSHRREV_B16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHRREV_B16_e32 2, %0:vgpr_32, implicit $exec @@ -3721,13 +3676,14 @@ body: | ; GCN-LABEL: name: test_remat_v_lshrrev_b16_e64 ; GCN: liveins: 
$vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_LSHRREV_B16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B16_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LSHRREV_B16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_LSHRREV_B16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_LSHRREV_B16_e64 2, %0:vgpr_32, implicit $exec @@ -3747,13 +3703,14 @@ body: | ; GCN-LABEL: name: test_remat_v_ashrrev_i16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ASHRREV_I16_e32_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I16_e32_1:%[0-9]+]]:vgpr_32 = 
V_ASHRREV_I16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I16_e32_2:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_ASHRREV_I16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ASHRREV_I16_e32 2, %0:vgpr_32, implicit $exec @@ -3772,13 +3729,14 @@ body: | ; GCN-LABEL: name: test_remat_v_ashrrev_i16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ASHRREV_I16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I16_e64_1:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ASHRREV_I16_e64_2:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ASHRREV_I16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_ASHRREV_I16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ASHRREV_I16_e64 2, %0:vgpr_32, implicit $exec @@ -3798,13 +3756,14 @@ body: | ; GCN-LABEL: name: test_remat_v_add_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e32 1, $vgpr0, implicit $exec - ; 
GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ADD_U16_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ADD_U16_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_ADD_U16_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_ADD_U16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_ADD_U16_e32 2, %0:vgpr_32, implicit $exec @@ -3824,13 +3783,14 @@ body: | ; GCN-LABEL: name: test_remat_v_add_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e64 1, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e64 2, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_ADD_U16_e64 3, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 1, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U16_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 2, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U16_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 3, [[COPY]], 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit 
[[V_ADD_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_ADD_U16_e64 1, %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_ADD_U16_e64 2, %0:vgpr_32, 0, implicit $exec @@ -3850,13 +3810,14 @@ body: | ; GCN-LABEL: name: test_remat_v_sub_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUB_U16_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_SUB_U16_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_SUB_U16_e32_2:%[0-9]+]]:vgpr_32 = V_SUB_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_SUB_U16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_SUB_U16_e32 2, %0:vgpr_32, implicit $exec @@ -3876,13 +3837,14 @@ body: | ; GCN-LABEL: name: test_remat_v_sub_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e64 1, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e64 2, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U16_e64 3, $vgpr0, 0, implicit 
$exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 1, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_SUB_U16_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 2, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_SUB_U16_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 3, [[COPY]], 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_SUB_U16_e64 1, %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SUB_U16_e64 2, %0:vgpr_32, 0, implicit $exec @@ -3902,13 +3864,14 @@ body: | ; GCN-LABEL: name: test_remat_v_subrev_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUBREV_U16_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_SUBREV_U16_e32_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_SUBREV_U16_e32_2:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_SUBREV_U16_e32 
1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_SUBREV_U16_e32 2, %0:vgpr_32, implicit $exec @@ -3928,13 +3891,14 @@ body: | ; GCN-LABEL: name: test_remat_v_subrev_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e64 1, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e64 2, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_SUBREV_U16_e64 3, $vgpr0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUBREV_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e64 1, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_SUBREV_U16_e64_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e64 2, [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_SUBREV_U16_e64_2:%[0-9]+]]:vgpr_32 = V_SUBREV_U16_e64 3, [[COPY]], 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_SUBREV_U16_e64 1, %0:vgpr_32, 0, implicit $exec %2:vgpr_32 = V_SUBREV_U16_e64 2, %0:vgpr_32, 0, implicit $exec @@ -3954,13 +3918,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_U16_e32_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_U16_e32_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_U16_e32_2:%[0-9]+]]:vgpr_32 = V_MIN_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MIN_U16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_U16_e32 2, %0:vgpr_32, implicit $exec @@ -3980,13 +3945,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_U16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_U16_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MIN_U16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_U16_e64 2, %0:vgpr_32, implicit $exec @@ -4006,13 +3972,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: 
renamable $vgpr1 = V_MAX_U16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_U16_e32_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_U16_e32_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_U16_e32_2:%[0-9]+]]:vgpr_32 = V_MAX_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MAX_U16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX_U16_e32 2, %0:vgpr_32, implicit $exec @@ -4032,13 +3999,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_U16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_U16_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 3, [[COPY]], implicit 
$exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MAX_U16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX_U16_e64 2, %0:vgpr_32, implicit $exec @@ -4058,13 +4026,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_i16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_I16_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_I16_e32_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_I16_e32_2:%[0-9]+]]:vgpr_32 = V_MIN_I16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MIN_I16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_I16_e32 2, %0:vgpr_32, implicit $exec @@ -4084,13 +4053,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_i16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MIN_I16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = 
V_MIN_I16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MIN_I16_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_I16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MIN_I16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MIN_I16_e64 2, %0:vgpr_32, implicit $exec @@ -4110,13 +4080,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_i16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_I16_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e32 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_I16_e32_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_I16_e32_2:%[0-9]+]]:vgpr_32 = V_MAX_I16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MAX_I16_e32 1, %0:vgpr_32, implicit 
$exec %2:vgpr_32 = V_MAX_I16_e32 2, %0:vgpr_32, implicit $exec @@ -4136,13 +4107,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_i16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MAX_I16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MAX_I16_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_I16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MAX_I16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MAX_I16_e64 2, %0:vgpr_32, implicit $exec @@ -4162,13 +4134,14 @@ body: | ; GCN-LABEL: name: test_remat_v_mul_lo_u16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e32 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e32 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e32 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MUL_LO_U16_e32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e32 
1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U16_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e32 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U16_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e32 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MUL_LO_U16_e32 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_LO_U16_e32 2, %0:vgpr_32, implicit $exec @@ -4188,13 +4161,14 @@ body: | ; GCN-LABEL: name: test_remat_v_mul_lo_u16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e64 1, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e64 2, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_MUL_LO_U16_e64 3, $vgpr0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MUL_LO_U16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e64 1, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e64 2, [[COPY]], implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U16_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U16_e64 3, [[COPY]], implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_LO_U16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MUL_LO_U16_e64 1, %0:vgpr_32, implicit $exec %2:vgpr_32 = V_MUL_LO_U16_e64 2, %0:vgpr_32, implicit $exec @@ -4214,13 +4188,14 @@ body: | ; GCN-LABEL: name: test_remat_v_add_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: 
renamable $vgpr1 = nofpexcept V_ADD_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ADD_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e32 3, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_ADD_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_ADD_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4240,13 +4215,14 @@ body: | ; GCN-LABEL: name: test_remat_v_add_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_ADD_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable 
$vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_ADD_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 2, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4266,13 +4242,14 @@ body: | ; GCN-LABEL: name: test_remat_v_sub_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUB_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUB_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUB_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e32 3, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F16_e32_]] + ; GCN-NEXT: 
S_NOP 0, implicit [[V_SUB_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_SUB_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_SUB_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4292,13 +4269,14 @@ body: | ; GCN-LABEL: name: test_remat_v_sub_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUB_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUB_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUB_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUB_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUB_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_SUB_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_SUB_F16_e64 0, 2, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4318,13 +4296,14 @@ body: | ; GCN-LABEL: name: test_remat_v_subrev_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; 
GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUBREV_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUBREV_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUBREV_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e32 3, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_SUBREV_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_SUBREV_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4344,13 +4323,14 @@ body: | ; GCN-LABEL: name: test_remat_v_subrev_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_SUBREV_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable 
$vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_SUBREV_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUBREV_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_SUBREV_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SUBREV_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_SUBREV_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_SUBREV_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_SUBREV_F16_e64 0, 2, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4370,13 +4350,14 @@ body: | ; GCN-LABEL: name: test_remat_v_mul_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MUL_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e32 3, [[COPY]], 
implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MUL_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MUL_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4396,13 +4377,14 @@ body: | ; GCN-LABEL: name: test_remat_v_mul_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MUL_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MUL_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MUL_F16_e64 0, 2, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4422,13 +4404,14 @@ 
body: | ; GCN-LABEL: name: test_remat_v_ldexp_f16_e32 ; GCN: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LDEXP_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LDEXP_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LDEXP_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 1, %0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_LDEXP_F16_e32 1, %0, implicit $mode, implicit $exec @@ -4448,13 +4431,14 @@ body: | ; GCN-LABEL: name: test_remat_v_ldexp_f16_e64 ; GCN: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e64 0, $vgpr0, 0, 1, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e64 0, $vgpr0, 0, 1, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_LDEXP_F16_e64 0, $vgpr0, 0, 1, 0, 
0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, 1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LDEXP_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, 1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LDEXP_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, 1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_LDEXP_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, %0, 0, 1, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, %0, 0, 1, 0, 0, implicit $mode, implicit $exec @@ -4474,13 +4458,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MIN_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: 
[[V_MIN_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e32 3, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MIN_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MIN_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4500,13 +4485,14 @@ body: | ; GCN-LABEL: name: test_remat_v_min_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MIN_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MIN_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MIN_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MIN_F16_e64 0, 2, 0, 
%0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4526,13 +4512,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_f16_e32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F16_e32 1, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F16_e32 2, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F16_e32 3, $vgpr0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_F16_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e32 1, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAX_F16_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e32 2, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAX_F16_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e32 3, [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MAX_F16_e32 1, %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MAX_F16_e32 2, %0:vgpr_32, implicit $mode, implicit $exec @@ -4552,13 +4539,14 @@ body: | ; GCN-LABEL: name: test_remat_v_max_f16_e64 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F16_e64 0, 1, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAX_F16_e64 0, 2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept 
V_MAX_F16_e64 0, 3, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, 1, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAX_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, 2, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAX_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, 3, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e64_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e64_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAX_F16_e64_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MAX_F16_e64 0, 1, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MAX_F16_e64 0, 2, 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec @@ -4578,13 +4566,14 @@ body: | ; GCN-LABEL: name: test_remat_v_madak_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADAK_F16 1, $vgpr0, 1, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADAK_F16 2, $vgpr0, 2, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADAK_F16 3, $vgpr0, 3, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MADAK_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F16 1, [[COPY]], 1, implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADAK_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F16 2, [[COPY]], 2, implicit $exec, implicit $mode + ; 
GCN-NEXT: [[V_MADAK_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MADAK_F16 3, [[COPY]], 3, implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADAK_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MADAK_F16 1, %0, 1, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MADAK_F16 2, %0, 2, implicit $exec, implicit $mode @@ -4604,13 +4593,14 @@ body: | ; GCN-LABEL: name: test_remat_v_madmk_f16 ; GCN: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADMK_F16 1, 1, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADMK_F16 2, 2, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MADMK_F16 3, 3, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MADMK_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F16 1, 1, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADMK_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F16 2, 2, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: [[V_MADMK_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MADMK_F16 3, 3, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MADMK_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MADMK_F16 1, 1, %0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_MADMK_F16 2, 2, %0, implicit $exec, implicit $mode @@ -4630,13 +4620,14 @@ body: | ; GCN-LABEL: name: test_remat_v_fmamk_f16 
; GCN: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMAMK_F16 1, 1, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMAMK_F16 2, 2, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMAMK_F16 3, 3, $vgpr0, implicit $exec, implicit $mode - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_FMAMK_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F16 1, 1, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAMK_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F16 2, 2, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: [[V_FMAMK_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAMK_F16 3, 3, [[COPY]], implicit $exec, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMAMK_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_FMAMK_F16 1, 1, %0, implicit $exec, implicit $mode %2:vgpr_32 = nofpexcept V_FMAMK_F16 2, 2, %0, implicit $exec, implicit $mode @@ -4656,13 +4647,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mad_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_I16 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_I16 9, $vgpr0, 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_I16 10, $vgpr0, 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: 
S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MAD_I16_:%[0-9]+]]:vgpr_32 = V_PK_MAD_I16 8, [[COPY]], 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAD_I16_1:%[0-9]+]]:vgpr_32 = V_PK_MAD_I16 9, [[COPY]], 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAD_I16_2:%[0-9]+]]:vgpr_32 = V_PK_MAD_I16 10, [[COPY]], 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAD_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAD_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAD_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MAD_I16 8, %0, 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4683,13 +4675,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mad_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_U16 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_U16 9, $vgpr0, 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAD_U16 10, $vgpr0, 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MAD_U16_:%[0-9]+]]:vgpr_32 = V_PK_MAD_U16 8, [[COPY]], 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MAD_U16 9, [[COPY]], 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MAD_U16 10, [[COPY]], 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAD_U16_]] + ; GCN-NEXT: S_NOP 0, 
implicit [[V_PK_MAD_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAD_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MAD_U16 8, %0, 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4710,13 +4703,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_add_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_U16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_U16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_U16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_U16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_ADD_U16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4737,13 +4731,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_add_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_I16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_I16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; 
GCN-NEXT: renamable $vgpr1 = V_PK_ADD_I16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_ADD_I16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_I16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_I16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_I16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_I16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_I16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_ADD_I16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4764,13 +4759,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mul_lo_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MUL_LO_U16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MUL_LO_U16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MUL_LO_U16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec 
+ ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_LO_U16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_LO_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_LO_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MUL_LO_U16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4791,13 +4787,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_min_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_I16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_I16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_I16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MIN_I16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_I16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_I16_1:%[0-9]+]]:vgpr_32 = V_PK_MIN_I16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_I16_2:%[0-9]+]]:vgpr_32 = V_PK_MIN_I16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MIN_I16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4818,13 +4815,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_max_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_I16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_I16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 
0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_I16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MAX_I16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_I16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_I16_1:%[0-9]+]]:vgpr_32 = V_PK_MAX_I16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_I16_2:%[0-9]+]]:vgpr_32 = V_PK_MAX_I16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MAX_I16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4845,13 +4843,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_min_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_U16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_U16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MIN_U16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MIN_U16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_U16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MIN_U16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MIN_U16 10, 
[[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_U16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MIN_U16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4872,13 +4871,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_max_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_U16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_U16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_MAX_U16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MAX_U16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_U16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MAX_U16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MAX_U16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_U16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_MAX_U16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4899,13 +4899,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_sub_u16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_SUB_U16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = 
V_PK_SUB_U16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_SUB_U16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_SUB_U16_:%[0-9]+]]:vgpr_32 = V_PK_SUB_U16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_SUB_U16_1:%[0-9]+]]:vgpr_32 = V_PK_SUB_U16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_SUB_U16_2:%[0-9]+]]:vgpr_32 = V_PK_SUB_U16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_U16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_U16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_U16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_SUB_U16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4926,13 +4927,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_sub_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_SUB_I16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_SUB_I16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_SUB_I16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_SUB_I16_:%[0-9]+]]:vgpr_32 = V_PK_SUB_I16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_SUB_I16_1:%[0-9]+]]:vgpr_32 = V_PK_SUB_I16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: 
[[V_PK_SUB_I16_2:%[0-9]+]]:vgpr_32 = V_PK_SUB_I16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_SUB_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_SUB_I16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4953,13 +4955,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_lshlrev_b16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHLREV_B16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHLREV_B16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHLREV_B16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_LSHLREV_B16_1:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_LSHLREV_B16_2:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHLREV_B16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHLREV_B16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHLREV_B16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_LSHLREV_B16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -4980,13 +4983,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_ashrrev_i16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_ASHRREV_I16 8, $vgpr0, 8, $vgpr0, 0, 0, 
0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_ASHRREV_I16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_ASHRREV_I16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ASHRREV_I16_1:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_ASHRREV_I16_2:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ASHRREV_I16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ASHRREV_I16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ASHRREV_I16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_ASHRREV_I16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -5007,13 +5011,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_lshrrev_b16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHRREV_B16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHRREV_B16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = V_PK_LSHRREV_B16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY]], 8, 
[[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_LSHRREV_B16_1:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_LSHRREV_B16_2:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHRREV_B16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHRREV_B16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_LSHRREV_B16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_LSHRREV_B16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $exec @@ -5034,13 +5039,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_add_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_ADD_F16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_ADD_F16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_ADD_F16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, 
implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_PK_ADD_F16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5064,19 +5070,14 @@ body: | ; GCN-LABEL: name: test_no_remat_v_pk_add_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_F16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_F16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = V_PK_ADD_F16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; GCN-NEXT: renamable $vgpr1 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F16_2:%[0-9]+]]:vgpr_32 = 
V_PK_ADD_F16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_PK_ADD_F16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5097,13 +5098,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mul_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MUL_F16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MUL_F16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MUL_F16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MUL_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MUL_F16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MUL_F16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_PK_MUL_F16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5124,13 +5126,14 @@ body: | ; GCN-LABEL: name: 
test_remat_v_pk_min_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MIN_F16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MIN_F16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MIN_F16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MIN_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MIN_F16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MIN_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MIN_F16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MIN_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_PK_MIN_F16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5151,13 +5154,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_max_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MAX_F16 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_MAX_F16 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = 
nofpexcept V_PK_MAX_F16 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MAX_F16 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MAX_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_MAX_F16 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MAX_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_PK_MAX_F16 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5178,13 +5182,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_fma_f16 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_FMA_F16 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_FMA_F16 9, $vgpr0, 9, $vgpr0, 9, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_PK_FMA_F16 10, $vgpr0, 10, $vgpr0, 10, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[COPY]], 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; 
GCN-NEXT: [[V_PK_FMA_F16_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 9, [[COPY]], 9, [[COPY]], 9, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_FMA_F16_2:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 10, [[COPY]], 10, [[COPY]], 10, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F16_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F16_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F16_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_PK_FMA_F16 8, %0, 8, %0, 8, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_PK_FMA_F16 9, %0, 9, %0, 9, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5205,13 +5210,14 @@ body: | ; GCN-LABEL: name: test_remat_v_mad_mix_f32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAD_MIX_F32 9, $vgpr0, 9, $vgpr0, 9, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_MAD_MIX_F32 10, $vgpr0, 10, $vgpr0, 10, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, [[COPY]], 8, [[COPY]], 8, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAD_MIX_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_MIX_F32 9, [[COPY]], 9, [[COPY]], 9, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAD_MIX_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_MIX_F32 10, [[COPY]], 10, [[COPY]], 10, [[COPY]], 0, 0, 0, implicit $mode, 
implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_MIX_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_MIX_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MAD_MIX_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0, 8, %0, 8, %0, 0, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MAD_MIX_F32 9, %0, 9, %0, 9, %0, 0, 0, 0, implicit $mode, implicit $exec @@ -5231,13 +5237,14 @@ body: | ; GCN-LABEL: name: test_remat_v_fma_mix_f32 ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMA_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMA_MIX_F32 9, $vgpr0, 9, $vgpr0, 9, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_FMA_MIX_F32 10, $vgpr0, 10, $vgpr0, 10, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 8, [[COPY]], 8, [[COPY]], 8, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_MIX_F32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY]], 9, [[COPY]], 9, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_MIX_F32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 10, [[COPY]], 10, [[COPY]], 10, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_MIX_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_MIX_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_FMA_MIX_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_FMA_MIX_F32 8, %0, 8, %0, 8, %0, 0, 0, 0, 
implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, %0, 9, %0, 9, %0, 0, 0, 0, implicit $mode, implicit $exec @@ -5257,13 +5264,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_fma_f32 ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_FMA_F32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_FMA_F32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %0, 8, %0, 11, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %2:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %0, 8, %0, 11, %0, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec @@ -5285,19 +5293,14 @@ body: | ; GCN-LABEL: name: test_no_remat_v_pk_fma_f32 ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_FMA_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.2, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.2, align 4, addrspace 5) - ; GCN-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_V64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.2, align 4, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_FMA_F32_1:%[0-9]+]]:vreg_64_align2 = 
V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_FMA_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 8, [[COPY]], 8, [[COPY]], 11, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_FMA_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vreg_64_align2 = V_PK_FMA_F32 8, %0, 8, %0, 11, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %2:vreg_64_align2 = V_PK_FMA_F32 8, %0, 8, %0, 11, %0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5317,13 +5320,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mul_f32 ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_F32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_MUL_F32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, 
implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MUL_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %0, 8, %0, 11, 0, 0, 0, 0, implicit $mode, implicit $exec %2:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %0, 8, %0, 11, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5343,13 +5347,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_add_f32 ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_ADD_F32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit 
[[COPY]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %0, 8, %0, 11, 0, 0, 0, 0, implicit $mode, implicit $exec %2:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %0, 8, %0, 11, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -5369,13 +5374,14 @@ body: | ; GCN-LABEL: name: test_remat_v_pk_mov_b32 ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 8, $vgpr0_vgpr1, 11, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_MOV_B32 9, $vgpr0_vgpr1, 9, $vgpr0_vgpr1, 12, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_PK_MOV_B32 10, $vgpr0_vgpr1, 10, $vgpr0_vgpr1, 13, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr2_vgpr3 - ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 9, [[COPY]], 9, [[COPY]], 12, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 10, [[COPY]], 10, [[COPY]], 13, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_1]] + ; GCN-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_2]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vreg_64_align2 = V_PK_MOV_B32 8, %0, 8, %0, 11, 0, 0, 0, 0, implicit $exec %2:vreg_64_align2 = V_PK_MOV_B32 9, %0, 9, %0, 12, 0, 0, 0, 0, implicit $exec @@ -5395,12 +5401,12 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: test_remat_subreg_def - ; GCN: renamable $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; 
GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr1 - ; GCN-NEXT: S_NOP 0, implicit killed renamable $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: S_NOP 0, implicit renamable $vgpr0_vgpr1 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_2]] + ; GCN-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec undef %1.sub0:vreg_64 = V_MOV_B32_e32 2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll index 002de8bb4eb51..8bbae59f468f1 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -2,7 +2,7 @@ ; RUN: FileCheck -check-prefix=REMARK %s < %t ; STDERR: remark: foo.cl:27:0: Function Name: test_kernel -; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28 +; STDERR-NEXT: remark: foo.cl:27:0: TotalSGPRs: 28 ; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9 ; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43 ; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0 @@ -27,7 +27,7 @@ ; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 } ; REMARK-NEXT: Function: test_kernel ; REMARK-NEXT: Args: -; REMARK-NEXT: - String: ' SGPRs: ' +; REMARK-NEXT: - String: ' TotalSGPRs: ' ; REMARK-NEXT: - NumSGPR: '28' ; REMARK-NEXT: ... 
; REMARK-NEXT: --- !Analysis @@ -122,7 +122,7 @@ define void @test_func() !dbg !6 { } ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel -; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4 +; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 @@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 { } ; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call -; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: 39 -; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: 32 -; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: 10 +; STDERR-NEXT: remark: foo.cl:64:0: TotalSGPRs: test_indirect_call.numbered_sgpr+6 +; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr +; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr ; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: 8 +; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0 @@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 { } ; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack -; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39 -; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32 -; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10 +; STDERR-NEXT: remark: foo.cl:74:0: TotalSGPRs: test_indirect_w_static_stack.numbered_sgpr+6 +; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 
test_indirect_w_static_stack.num_vgpr +; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr ; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8 +; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0 diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll index bba59ba4d8030..5d5aad76afd09 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll @@ -1,6 +1,6 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s ; Make sure there's no assertion when trying to report the 
resource ; usage for a function which becomes dead during codegen. @@ -21,9 +21,10 @@ define internal fastcc void @unreachable() { ; GCN-NOT: s_swappc_b64 ; GCN: s_endpgm -; GCN: .amdhsa_private_segment_fixed_size 0 -; GCN-NOT: .amdhsa_uses_dynamic_stack 0 -; GCN-V5: .amdhsa_uses_dynamic_stack 0 +; GCN-NOT: .amdhsa_uses_dynamic_stack +; GCN-V5: .amdhsa_uses_dynamic_stack +; ALL: .set entry.private_seg_size, 0 +; ALL: .set entry.has_dyn_sized_stack, 0 define amdgpu_kernel void @entry() { bb0: br i1 false, label %bb1, label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index ad82869c001f6..7f8240eeb98eb 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -8,9 +8,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-LABEL: kernel0: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART @@ -22,46 +19,47 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 0 +; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: v_writelane_b32 v23, s3, 1 +; CHECK-NEXT: v_writelane_b32 v22, s2, 0 +; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 2 -; CHECK-NEXT: v_writelane_b32 v23, s5, 3 -; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v22, s4, 2 +; CHECK-NEXT: v_writelane_b32 v22, s5, 3 +; CHECK-NEXT: v_writelane_b32 v22, s6, 4 +; CHECK-NEXT: 
v_writelane_b32 v22, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: v_writelane_b32 v22, s4, 6 +; CHECK-NEXT: v_writelane_b32 v22, s5, 7 +; CHECK-NEXT: v_writelane_b32 v22, s6, 8 +; CHECK-NEXT: v_writelane_b32 v22, s7, 9 +; CHECK-NEXT: v_writelane_b32 v22, s8, 10 +; CHECK-NEXT: v_writelane_b32 v22, s9, 11 +; CHECK-NEXT: v_writelane_b32 v22, s10, 12 +; CHECK-NEXT: v_writelane_b32 v22, s11, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 14 -; CHECK-NEXT: v_writelane_b32 v23, s5, 15 -; CHECK-NEXT: v_writelane_b32 v23, s6, 16 -; CHECK-NEXT: v_writelane_b32 v23, s7, 17 -; CHECK-NEXT: v_writelane_b32 v23, s8, 18 -; CHECK-NEXT: v_writelane_b32 v23, s9, 19 -; CHECK-NEXT: v_writelane_b32 v23, s10, 20 -; CHECK-NEXT: v_writelane_b32 v23, s11, 21 -; CHECK-NEXT: v_writelane_b32 v23, s12, 22 -; CHECK-NEXT: v_writelane_b32 v23, s13, 23 -; CHECK-NEXT: v_writelane_b32 v23, s14, 24 -; CHECK-NEXT: v_writelane_b32 v23, s15, 25 -; CHECK-NEXT: v_writelane_b32 v23, s16, 26 -; CHECK-NEXT: v_writelane_b32 v23, s17, 27 -; CHECK-NEXT: v_writelane_b32 v23, s18, 28 -; CHECK-NEXT: v_writelane_b32 v23, s19, 29 +; CHECK-NEXT: v_writelane_b32 v22, s4, 14 +; CHECK-NEXT: v_writelane_b32 v22, s5, 15 +; CHECK-NEXT: v_writelane_b32 v22, s6, 16 +; CHECK-NEXT: v_writelane_b32 v22, s7, 17 +; CHECK-NEXT: v_writelane_b32 v22, s8, 18 +; CHECK-NEXT: v_writelane_b32 v22, s9, 19 +; CHECK-NEXT: v_writelane_b32 v22, s10, 20 +; CHECK-NEXT: v_writelane_b32 v22, s11, 21 +; CHECK-NEXT: v_writelane_b32 v22, s12, 22 +; CHECK-NEXT: v_writelane_b32 v22, s13, 23 
+; CHECK-NEXT: v_writelane_b32 v22, s14, 24 +; CHECK-NEXT: v_writelane_b32 v22, s15, 25 +; CHECK-NEXT: v_writelane_b32 v22, s16, 26 +; CHECK-NEXT: v_writelane_b32 v22, s17, 27 +; CHECK-NEXT: v_writelane_b32 v22, s18, 28 +; CHECK-NEXT: v_writelane_b32 v22, s19, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -71,14 +69,14 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 30 -; CHECK-NEXT: v_writelane_b32 v23, s5, 31 -; CHECK-NEXT: v_writelane_b32 v23, s6, 32 -; CHECK-NEXT: v_writelane_b32 v23, s7, 33 -; CHECK-NEXT: v_writelane_b32 v23, s8, 34 -; CHECK-NEXT: v_writelane_b32 v23, s9, 35 -; CHECK-NEXT: v_writelane_b32 v23, s10, 36 -; CHECK-NEXT: v_writelane_b32 v23, s11, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 30 +; CHECK-NEXT: v_writelane_b32 v22, s5, 31 +; CHECK-NEXT: v_writelane_b32 v22, s6, 32 +; CHECK-NEXT: v_writelane_b32 v22, s7, 33 +; CHECK-NEXT: v_writelane_b32 v22, s8, 34 +; CHECK-NEXT: v_writelane_b32 v22, s9, 35 +; CHECK-NEXT: v_writelane_b32 v22, s10, 36 +; CHECK-NEXT: v_writelane_b32 v22, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART @@ -96,161 +94,159 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 38 -; CHECK-NEXT: v_writelane_b32 v23, s1, 39 -; CHECK-NEXT: v_writelane_b32 v23, s2, 40 -; CHECK-NEXT: v_writelane_b32 v23, s3, 41 -; CHECK-NEXT: v_writelane_b32 v23, s4, 42 -; CHECK-NEXT: v_writelane_b32 v23, s5, 43 -; CHECK-NEXT: v_writelane_b32 v23, s6, 44 -; CHECK-NEXT: v_writelane_b32 v23, s7, 45 -; CHECK-NEXT: v_writelane_b32 v23, s8, 46 -; CHECK-NEXT: v_writelane_b32 v23, s9, 47 -; CHECK-NEXT: v_writelane_b32 v23, s10, 48 -; CHECK-NEXT: v_writelane_b32 v23, s11, 49 -; 
CHECK-NEXT: v_writelane_b32 v23, s12, 50 -; CHECK-NEXT: v_writelane_b32 v23, s13, 51 -; CHECK-NEXT: v_writelane_b32 v23, s14, 52 -; CHECK-NEXT: v_writelane_b32 v23, s15, 53 +; CHECK-NEXT: v_writelane_b32 v22, s0, 38 +; CHECK-NEXT: v_writelane_b32 v22, s1, 39 +; CHECK-NEXT: v_writelane_b32 v22, s2, 40 +; CHECK-NEXT: v_writelane_b32 v22, s3, 41 +; CHECK-NEXT: v_writelane_b32 v22, s4, 42 +; CHECK-NEXT: v_writelane_b32 v22, s5, 43 +; CHECK-NEXT: v_writelane_b32 v22, s6, 44 +; CHECK-NEXT: v_writelane_b32 v22, s7, 45 +; CHECK-NEXT: v_writelane_b32 v22, s8, 46 +; CHECK-NEXT: v_writelane_b32 v22, s9, 47 +; CHECK-NEXT: v_writelane_b32 v22, s10, 48 +; CHECK-NEXT: v_writelane_b32 v22, s11, 49 +; CHECK-NEXT: v_writelane_b32 v22, s12, 50 +; CHECK-NEXT: v_writelane_b32 v22, s13, 51 +; CHECK-NEXT: v_writelane_b32 v22, s14, 52 +; CHECK-NEXT: v_writelane_b32 v22, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 54 -; CHECK-NEXT: v_writelane_b32 v23, s1, 55 -; CHECK-NEXT: v_writelane_b32 v23, s2, 56 -; CHECK-NEXT: v_writelane_b32 v23, s3, 57 +; CHECK-NEXT: v_writelane_b32 v22, s0, 54 +; CHECK-NEXT: v_writelane_b32 v22, s1, 55 +; CHECK-NEXT: v_writelane_b32 v22, s2, 56 +; CHECK-NEXT: v_writelane_b32 v22, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 58 -; CHECK-NEXT: v_writelane_b32 v23, s1, 59 -; CHECK-NEXT: v_writelane_b32 v23, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s3, 61 -; CHECK-NEXT: v_writelane_b32 v23, s4, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 0 -; CHECK-NEXT: v_writelane_b32 v23, s5, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 1 +; CHECK-NEXT: v_writelane_b32 v22, s0, 58 +; CHECK-NEXT: v_writelane_b32 v22, s1, 59 +; CHECK-NEXT: v_writelane_b32 v22, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR 
spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v22, s3, 61 +; CHECK-NEXT: v_writelane_b32 v22, s4, 62 +; CHECK-NEXT: v_writelane_b32 v23, s6, 0 +; CHECK-NEXT: v_writelane_b32 v22, s5, 63 +; CHECK-NEXT: v_writelane_b32 v23, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 2 -; CHECK-NEXT: v_writelane_b32 v0, s1, 3 -; CHECK-NEXT: v_writelane_b32 v0, s2, 4 -; CHECK-NEXT: v_writelane_b32 v0, s3, 5 -; CHECK-NEXT: v_writelane_b32 v0, s4, 6 -; CHECK-NEXT: v_writelane_b32 v0, s5, 7 -; CHECK-NEXT: v_writelane_b32 v0, s6, 8 -; CHECK-NEXT: v_writelane_b32 v0, s7, 9 -; CHECK-NEXT: v_writelane_b32 v0, s8, 10 -; CHECK-NEXT: v_writelane_b32 v0, s9, 11 -; CHECK-NEXT: v_writelane_b32 v0, s10, 12 -; CHECK-NEXT: v_writelane_b32 v0, s11, 13 -; CHECK-NEXT: v_writelane_b32 v0, s12, 14 -; CHECK-NEXT: v_writelane_b32 v0, s13, 15 -; CHECK-NEXT: v_writelane_b32 v0, s14, 16 -; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: v_writelane_b32 v23, s0, 2 +; CHECK-NEXT: v_writelane_b32 v23, s1, 3 +; CHECK-NEXT: v_writelane_b32 v23, s2, 4 +; CHECK-NEXT: v_writelane_b32 v23, s3, 5 +; CHECK-NEXT: v_writelane_b32 v23, s4, 6 +; CHECK-NEXT: v_writelane_b32 v23, s5, 7 +; CHECK-NEXT: v_writelane_b32 v23, s6, 8 +; CHECK-NEXT: v_writelane_b32 v23, s7, 9 +; CHECK-NEXT: v_writelane_b32 v23, s8, 10 +; CHECK-NEXT: v_writelane_b32 v23, s9, 11 +; CHECK-NEXT: v_writelane_b32 v23, s10, 12 +; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: v_writelane_b32 v23, s12, 14 +; CHECK-NEXT: v_writelane_b32 v23, s13, 15 +; CHECK-NEXT: v_writelane_b32 v23, s14, 16 +; CHECK-NEXT: v_writelane_b32 v23, s15, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 18 -; CHECK-NEXT: v_writelane_b32 v0, s1, 19 +; CHECK-NEXT: v_writelane_b32 v23, s0, 18 +; CHECK-NEXT: v_writelane_b32 v23, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; 
CHECK-NEXT: v_writelane_b32 v0, s0, 20 -; CHECK-NEXT: v_writelane_b32 v0, s1, 21 -; CHECK-NEXT: v_writelane_b32 v0, s2, 22 -; CHECK-NEXT: v_writelane_b32 v0, s3, 23 +; CHECK-NEXT: v_writelane_b32 v23, s0, 20 +; CHECK-NEXT: v_writelane_b32 v23, s1, 21 +; CHECK-NEXT: v_writelane_b32 v23, s2, 22 +; CHECK-NEXT: v_writelane_b32 v23, s3, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 24 -; CHECK-NEXT: v_writelane_b32 v0, s1, 25 -; CHECK-NEXT: v_writelane_b32 v0, s2, 26 -; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: v_writelane_b32 v0, s4, 28 -; CHECK-NEXT: v_writelane_b32 v0, s5, 29 -; CHECK-NEXT: v_writelane_b32 v0, s6, 30 -; CHECK-NEXT: v_writelane_b32 v0, s7, 31 +; CHECK-NEXT: v_writelane_b32 v23, s0, 24 +; CHECK-NEXT: v_writelane_b32 v23, s1, 25 +; CHECK-NEXT: v_writelane_b32 v23, s2, 26 +; CHECK-NEXT: v_writelane_b32 v23, s3, 27 +; CHECK-NEXT: v_writelane_b32 v23, s4, 28 +; CHECK-NEXT: v_writelane_b32 v23, s5, 29 +; CHECK-NEXT: v_writelane_b32 v23, s6, 30 +; CHECK-NEXT: v_writelane_b32 v23, s7, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 32 -; CHECK-NEXT: v_writelane_b32 v0, s1, 33 -; CHECK-NEXT: v_writelane_b32 v0, s2, 34 -; CHECK-NEXT: v_writelane_b32 v0, s3, 35 -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: v_writelane_b32 v0, s12, 44 -; CHECK-NEXT: v_writelane_b32 v0, s13, 45 -; CHECK-NEXT: v_writelane_b32 v0, s14, 46 -; CHECK-NEXT: v_writelane_b32 v0, s15, 47 +; CHECK-NEXT: v_writelane_b32 v23, s0, 32 +; CHECK-NEXT: v_writelane_b32 v23, s1, 33 +; CHECK-NEXT: v_writelane_b32 v23, s2, 34 +; CHECK-NEXT: 
v_writelane_b32 v23, s3, 35 +; CHECK-NEXT: v_writelane_b32 v23, s4, 36 +; CHECK-NEXT: v_writelane_b32 v23, s5, 37 +; CHECK-NEXT: v_writelane_b32 v23, s6, 38 +; CHECK-NEXT: v_writelane_b32 v23, s7, 39 +; CHECK-NEXT: v_writelane_b32 v23, s8, 40 +; CHECK-NEXT: v_writelane_b32 v23, s9, 41 +; CHECK-NEXT: v_writelane_b32 v23, s10, 42 +; CHECK-NEXT: v_writelane_b32 v23, s11, 43 +; CHECK-NEXT: v_writelane_b32 v23, s12, 44 +; CHECK-NEXT: v_writelane_b32 v23, s13, 45 +; CHECK-NEXT: v_writelane_b32 v23, s14, 46 +; CHECK-NEXT: v_writelane_b32 v23, s15, 47 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %bb0 -; CHECK-NEXT: v_readlane_b32 s0, v23, 0 -; CHECK-NEXT: v_readlane_b32 s1, v23, 1 +; CHECK-NEXT: v_readlane_b32 s0, v22, 0 +; CHECK-NEXT: v_readlane_b32 s1, v22, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s0, v22, 2 +; CHECK-NEXT: v_readlane_b32 s1, v22, 3 +; CHECK-NEXT: v_readlane_b32 s2, v22, 4 +; CHECK-NEXT: v_readlane_b32 s3, v22, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 6 -; CHECK-NEXT: v_readlane_b32 s1, v23, 7 -; CHECK-NEXT: v_readlane_b32 s2, v23, 8 -; CHECK-NEXT: v_readlane_b32 s3, v23, 9 -; CHECK-NEXT: v_readlane_b32 s4, v23, 10 -; CHECK-NEXT: v_readlane_b32 s5, v23, 11 -; CHECK-NEXT: v_readlane_b32 s6, v23, 12 -; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s0, v22, 6 +; CHECK-NEXT: v_readlane_b32 s1, v22, 7 +; CHECK-NEXT: v_readlane_b32 s2, v22, 8 +; CHECK-NEXT: v_readlane_b32 s3, v22, 9 +; CHECK-NEXT: v_readlane_b32 s4, v22, 10 +; CHECK-NEXT: v_readlane_b32 s5, v22, 11 +; CHECK-NEXT: 
v_readlane_b32 s6, v22, 12 +; CHECK-NEXT: v_readlane_b32 s7, v22, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 14 -; CHECK-NEXT: v_readlane_b32 s1, v23, 15 -; CHECK-NEXT: v_readlane_b32 s2, v23, 16 -; CHECK-NEXT: v_readlane_b32 s3, v23, 17 -; CHECK-NEXT: v_readlane_b32 s4, v23, 18 -; CHECK-NEXT: v_readlane_b32 s5, v23, 19 -; CHECK-NEXT: v_readlane_b32 s6, v23, 20 -; CHECK-NEXT: v_readlane_b32 s7, v23, 21 -; CHECK-NEXT: v_readlane_b32 s8, v23, 22 -; CHECK-NEXT: v_readlane_b32 s9, v23, 23 -; CHECK-NEXT: v_readlane_b32 s10, v23, 24 -; CHECK-NEXT: v_readlane_b32 s11, v23, 25 -; CHECK-NEXT: v_readlane_b32 s12, v23, 26 -; CHECK-NEXT: v_readlane_b32 s13, v23, 27 -; CHECK-NEXT: v_readlane_b32 s14, v23, 28 -; CHECK-NEXT: v_readlane_b32 s15, v23, 29 +; CHECK-NEXT: v_readlane_b32 s0, v22, 14 +; CHECK-NEXT: v_readlane_b32 s1, v22, 15 +; CHECK-NEXT: v_readlane_b32 s2, v22, 16 +; CHECK-NEXT: v_readlane_b32 s3, v22, 17 +; CHECK-NEXT: v_readlane_b32 s4, v22, 18 +; CHECK-NEXT: v_readlane_b32 s5, v22, 19 +; CHECK-NEXT: v_readlane_b32 s6, v22, 20 +; CHECK-NEXT: v_readlane_b32 s7, v22, 21 +; CHECK-NEXT: v_readlane_b32 s8, v22, 22 +; CHECK-NEXT: v_readlane_b32 s9, v22, 23 +; CHECK-NEXT: v_readlane_b32 s10, v22, 24 +; CHECK-NEXT: v_readlane_b32 s11, v22, 25 +; CHECK-NEXT: v_readlane_b32 s12, v22, 26 +; CHECK-NEXT: v_readlane_b32 s13, v22, 27 +; CHECK-NEXT: v_readlane_b32 s14, v22, 28 +; CHECK-NEXT: v_readlane_b32 s15, v22, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 30 -; CHECK-NEXT: v_readlane_b32 s1, v23, 31 -; CHECK-NEXT: v_readlane_b32 s2, v23, 32 -; CHECK-NEXT: v_readlane_b32 s3, v23, 33 -; CHECK-NEXT: v_readlane_b32 s4, v23, 34 -; CHECK-NEXT: v_readlane_b32 s5, v23, 35 -; CHECK-NEXT: v_readlane_b32 s6, v23, 36 -; CHECK-NEXT: v_readlane_b32 s7, v23, 37 +; CHECK-NEXT: v_readlane_b32 s0, v22, 30 +; CHECK-NEXT: v_readlane_b32 s1, v22, 31 
+; CHECK-NEXT: v_readlane_b32 s2, v22, 32 +; CHECK-NEXT: v_readlane_b32 s3, v22, 33 +; CHECK-NEXT: v_readlane_b32 s4, v22, 34 +; CHECK-NEXT: v_readlane_b32 s5, v22, 35 +; CHECK-NEXT: v_readlane_b32 s6, v22, 36 +; CHECK-NEXT: v_readlane_b32 s7, v22, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -260,10 +256,10 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 38 -; CHECK-NEXT: v_readlane_b32 s1, v23, 39 -; CHECK-NEXT: v_readlane_b32 s2, v23, 40 -; CHECK-NEXT: v_readlane_b32 s3, v23, 41 +; CHECK-NEXT: v_readlane_b32 s0, v22, 38 +; CHECK-NEXT: v_readlane_b32 s1, v22, 39 +; CHECK-NEXT: v_readlane_b32 s2, v22, 40 +; CHECK-NEXT: v_readlane_b32 s3, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -276,111 +272,108 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v23, 42 -; CHECK-NEXT: v_readlane_b32 s5, v23, 43 -; CHECK-NEXT: v_readlane_b32 s6, v23, 44 -; CHECK-NEXT: v_readlane_b32 s7, v23, 45 -; CHECK-NEXT: v_readlane_b32 s8, v23, 46 -; CHECK-NEXT: v_readlane_b32 s9, v23, 47 -; CHECK-NEXT: v_readlane_b32 s10, v23, 48 -; CHECK-NEXT: v_readlane_b32 s11, v23, 49 -; CHECK-NEXT: v_readlane_b32 s12, v23, 50 -; CHECK-NEXT: v_readlane_b32 s13, v23, 51 -; CHECK-NEXT: v_readlane_b32 s14, v23, 52 -; CHECK-NEXT: v_readlane_b32 s15, v23, 53 +; CHECK-NEXT: v_readlane_b32 s4, v22, 42 +; CHECK-NEXT: v_readlane_b32 s5, v22, 43 +; CHECK-NEXT: v_readlane_b32 s6, v22, 44 +; CHECK-NEXT: v_readlane_b32 s7, v22, 45 +; CHECK-NEXT: v_readlane_b32 s8, v22, 46 +; CHECK-NEXT: v_readlane_b32 s9, v22, 47 +; CHECK-NEXT: v_readlane_b32 s10, v22, 48 +; CHECK-NEXT: v_readlane_b32 s11, v22, 49 +; CHECK-NEXT: v_readlane_b32 s12, v22, 50 +; CHECK-NEXT: 
v_readlane_b32 s13, v22, 51 +; CHECK-NEXT: v_readlane_b32 s14, v22, 52 +; CHECK-NEXT: v_readlane_b32 s15, v22, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 54 -; CHECK-NEXT: v_readlane_b32 s1, v23, 55 -; CHECK-NEXT: v_readlane_b32 s2, v23, 56 -; CHECK-NEXT: v_readlane_b32 s3, v23, 57 +; CHECK-NEXT: v_readlane_b32 s0, v22, 54 +; CHECK-NEXT: v_readlane_b32 s1, v22, 55 +; CHECK-NEXT: v_readlane_b32 s2, v22, 56 +; CHECK-NEXT: v_readlane_b32 s3, v22, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 58 -; CHECK-NEXT: v_readlane_b32 s1, v23, 59 -; CHECK-NEXT: v_readlane_b32 s2, v23, 60 -; CHECK-NEXT: v_readlane_b32 s3, v23, 61 -; CHECK-NEXT: v_readlane_b32 s4, v23, 62 -; CHECK-NEXT: v_readlane_b32 s5, v23, 63 -; CHECK-NEXT: v_readlane_b32 s6, v0, 0 -; CHECK-NEXT: v_readlane_b32 s7, v0, 1 +; CHECK-NEXT: v_readlane_b32 s0, v22, 58 +; CHECK-NEXT: v_readlane_b32 s1, v22, 59 +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 +; CHECK-NEXT: v_readlane_b32 s4, v22, 62 +; CHECK-NEXT: v_readlane_b32 s5, v22, 63 +; CHECK-NEXT: v_readlane_b32 s6, v23, 0 +; CHECK-NEXT: v_readlane_b32 s7, v23, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 2 -; CHECK-NEXT: v_readlane_b32 s1, v0, 3 -; CHECK-NEXT: v_readlane_b32 s2, v0, 4 -; CHECK-NEXT: v_readlane_b32 s3, v0, 5 -; CHECK-NEXT: v_readlane_b32 s4, v0, 6 -; CHECK-NEXT: v_readlane_b32 s5, v0, 7 -; CHECK-NEXT: v_readlane_b32 s6, v0, 8 -; CHECK-NEXT: v_readlane_b32 s7, v0, 9 -; CHECK-NEXT: v_readlane_b32 s8, v0, 10 -; CHECK-NEXT: v_readlane_b32 s9, v0, 11 -; CHECK-NEXT: v_readlane_b32 s10, v0, 12 -; CHECK-NEXT: v_readlane_b32 s11, v0, 13 -; CHECK-NEXT: v_readlane_b32 s12, v0, 14 -; CHECK-NEXT: v_readlane_b32 s13, v0, 15 -; 
CHECK-NEXT: v_readlane_b32 s14, v0, 16 -; CHECK-NEXT: v_readlane_b32 s15, v0, 17 +; CHECK-NEXT: v_readlane_b32 s0, v23, 2 +; CHECK-NEXT: v_readlane_b32 s1, v23, 3 +; CHECK-NEXT: v_readlane_b32 s2, v23, 4 +; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s4, v23, 6 +; CHECK-NEXT: v_readlane_b32 s5, v23, 7 +; CHECK-NEXT: v_readlane_b32 s6, v23, 8 +; CHECK-NEXT: v_readlane_b32 s7, v23, 9 +; CHECK-NEXT: v_readlane_b32 s8, v23, 10 +; CHECK-NEXT: v_readlane_b32 s9, v23, 11 +; CHECK-NEXT: v_readlane_b32 s10, v23, 12 +; CHECK-NEXT: v_readlane_b32 s11, v23, 13 +; CHECK-NEXT: v_readlane_b32 s12, v23, 14 +; CHECK-NEXT: v_readlane_b32 s13, v23, 15 +; CHECK-NEXT: v_readlane_b32 s14, v23, 16 +; CHECK-NEXT: v_readlane_b32 s15, v23, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 18 -; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v23, 18 +; CHECK-NEXT: v_readlane_b32 s1, v23, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 20 -; CHECK-NEXT: v_readlane_b32 s1, v0, 21 -; CHECK-NEXT: v_readlane_b32 s2, v0, 22 -; CHECK-NEXT: v_readlane_b32 s3, v0, 23 +; CHECK-NEXT: v_readlane_b32 s0, v23, 20 +; CHECK-NEXT: v_readlane_b32 s1, v23, 21 +; CHECK-NEXT: v_readlane_b32 s2, v23, 22 +; CHECK-NEXT: v_readlane_b32 s3, v23, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 24 -; CHECK-NEXT: v_readlane_b32 s1, v0, 25 -; CHECK-NEXT: v_readlane_b32 s2, v0, 26 -; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: v_readlane_b32 s4, v0, 28 -; CHECK-NEXT: v_readlane_b32 s5, v0, 29 -; CHECK-NEXT: v_readlane_b32 s6, v0, 30 -; CHECK-NEXT: v_readlane_b32 s7, v0, 31 +; CHECK-NEXT: v_readlane_b32 s0, v23, 24 +; CHECK-NEXT: v_readlane_b32 s1, v23, 25 +; CHECK-NEXT: v_readlane_b32 s2, v23, 26 +; CHECK-NEXT: v_readlane_b32 s3, v23, 27 +; CHECK-NEXT: 
v_readlane_b32 s4, v23, 28 +; CHECK-NEXT: v_readlane_b32 s5, v23, 29 +; CHECK-NEXT: v_readlane_b32 s6, v23, 30 +; CHECK-NEXT: v_readlane_b32 s7, v23, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 -; CHECK-NEXT: v_readlane_b32 s3, v0, 35 -; CHECK-NEXT: v_readlane_b32 s4, v0, 36 -; CHECK-NEXT: v_readlane_b32 s5, v0, 37 -; CHECK-NEXT: v_readlane_b32 s6, v0, 38 -; CHECK-NEXT: v_readlane_b32 s7, v0, 39 -; CHECK-NEXT: v_readlane_b32 s8, v0, 40 -; CHECK-NEXT: v_readlane_b32 s9, v0, 41 -; CHECK-NEXT: v_readlane_b32 s10, v0, 42 -; CHECK-NEXT: v_readlane_b32 s11, v0, 43 -; CHECK-NEXT: v_readlane_b32 s12, v0, 44 -; CHECK-NEXT: v_readlane_b32 s13, v0, 45 -; CHECK-NEXT: v_readlane_b32 s14, v0, 46 -; CHECK-NEXT: v_readlane_b32 s15, v0, 47 +; CHECK-NEXT: v_readlane_b32 s0, v23, 32 +; CHECK-NEXT: v_readlane_b32 s1, v23, 33 +; CHECK-NEXT: v_readlane_b32 s2, v23, 34 +; CHECK-NEXT: v_readlane_b32 s3, v23, 35 +; CHECK-NEXT: v_readlane_b32 s4, v23, 36 +; CHECK-NEXT: v_readlane_b32 s5, v23, 37 +; CHECK-NEXT: v_readlane_b32 s6, v23, 38 +; CHECK-NEXT: v_readlane_b32 s7, v23, 39 +; CHECK-NEXT: v_readlane_b32 s8, v23, 40 +; CHECK-NEXT: v_readlane_b32 s9, v23, 41 +; CHECK-NEXT: v_readlane_b32 s10, v23, 42 +; CHECK-NEXT: v_readlane_b32 s11, v23, 43 +; CHECK-NEXT: v_readlane_b32 s12, v23, 44 +; CHECK-NEXT: v_readlane_b32 s13, v23, 45 +; CHECK-NEXT: v_readlane_b32 s14, v23, 46 +; CHECK-NEXT: v_readlane_b32 s15, v23, 47 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 call void asm sideeffect "", "~{v[20:21]}"() #0 - call void asm sideeffect "", "~{v22}"() #0 %val0 = call <2 x 
i32> asm sideeffect "; def $0", "=s" () #0 %val1 = call <4 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll index 17a19116735e4..14a02d4d2dcec 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -1,19 +1,19 @@ ; REQUIRES: asserts ; RUN: llc -verify-machineinstrs=0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s -; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s +; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=greedy -wwm-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s ; RUN: llc -verify-machineinstrs=0 -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s -; RUN: llc -verify-machineinstrs=0 -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s +; RUN: llc -verify-machineinstrs=0 -wwm-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s ; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s -; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s +; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -wwm-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck 
-check-prefix=BASIC-BASIC %s ; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s ; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s -; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc +; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, and -vgpr-regalloc ; DEFAULT: Greedy Register Allocator ; DEFAULT-NEXT: Virtual Register Rewriter @@ -23,6 +23,11 @@ ; DEFAULT-NEXT: SI Pre-allocate WWM Registers ; DEFAULT-NEXT: Greedy Register Allocator ; DEFAULT-NEXT: SI Lower WWM Copies +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: AMDGPU Reserve WWM Registers +; DEFAULT-NEXT: Virtual Register Map +; DEFAULT-NEXT: Live Register Matrix +; DEFAULT-NEXT: Greedy Register Allocator ; DEFAULT-NEXT: GCN NSA Reassign ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load @@ -37,6 +42,8 @@ ; O0-NEXT: SI Pre-allocate WWM Registers ; O0-NEXT: Fast Register Allocator ; O0-NEXT: SI Lower WWM Copies +; O0-NEXT: AMDGPU Reserve WWM Registers +; O0-NEXT: Fast Register Allocator ; O0-NEXT: SI Fix VGPR copies @@ -60,6 +67,11 @@ ; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter ; BASIC-DEFAULT-NEXT: Greedy Register Allocator ; BASIC-DEFAULT-NEXT: SI Lower WWM Copies +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: AMDGPU Reserve WWM Registers +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Greedy Register Allocator ; BASIC-DEFAULT-NEXT: GCN NSA Reassign ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load @@ -75,6 +87,11 @@ ; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers ; DEFAULT-BASIC-NEXT: Basic 
Register Allocator ; DEFAULT-BASIC-NEXT: SI Lower WWM Copies +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: AMDGPU Reserve WWM Registers +; DEFAULT-BASIC-NEXT: Virtual Register Map +; DEFAULT-BASIC-NEXT: Live Register Matrix +; DEFAULT-BASIC-NEXT: Basic Register Allocator ; DEFAULT-BASIC-NEXT: GCN NSA Reassign ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load @@ -96,6 +113,11 @@ ; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers ; BASIC-BASIC-NEXT: Basic Register Allocator ; BASIC-BASIC-NEXT: SI Lower WWM Copies +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: AMDGPU Reserve WWM Registers +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator ; BASIC-BASIC-NEXT: GCN NSA Reassign ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir index 189aead1e5646..520717391b596 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -passes=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s @@ -45,28 +46,25 @@ body: | ; SGPR_SPILL: bb.0: ; 
SGPR_SPILL-NEXT: successors: %bb.1(0x80000000) ; SGPR_SPILL-NEXT: {{ $}} - ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF - ; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0 ; SGPR_SPILL-NEXT: {{ $}} ; SGPR_SPILL-NEXT: bb.1: - ; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[V_WRITELANE_B32_]], 0 - ; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 ; SGPR_SPILL-NEXT: S_ENDPGM 0 + ; ; PEI-LABEL: name: test ; PEI: bb.0: ; PEI-NEXT: successors: %bb.1(0x80000000) ; PEI-NEXT: {{ $}} - ; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF - ; PEI-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, killed $vgpr0 + ; PEI-NEXT: $vgpr0 = IMPLICIT_DEF + ; PEI-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, killed $vgpr0 ; PEI-NEXT: {{ $}} ; PEI-NEXT: bb.1: - ; PEI-NEXT: liveins: $vgpr0 - ; PEI-NEXT: {{ $}} - ; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; PEI-NEXT: KILL killed renamable $vgpr0 + ; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 ; PEI-NEXT: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll index 29622d3fd0f1b..5692dc1e2a2c6 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll @@ -9,7 +9,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", 
"~{v[16:19]}"() #0 call void asm sideeffect "", "~{v[20:21]}"() #0 - call void asm sideeffect "", "~{v22}"() #0 %val0 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 %val1 = call <4 x i32> asm sideeffect "; def $0", "=s" () #0 %val2 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index d430ba758572d..59036c64c8afc 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -9,19 +9,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 s0, s0, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -31,91 +21,91 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s8, 0 -; GCN-NEXT: v_writelane_b32 v1, s9, 1 -; GCN-NEXT: v_writelane_b32 v1, s10, 2 -; GCN-NEXT: v_writelane_b32 v1, s11, 3 -; GCN-NEXT: v_writelane_b32 v1, s12, 4 -; GCN-NEXT: v_writelane_b32 v1, s13, 5 -; GCN-NEXT: v_writelane_b32 v1, s14, 6 -; GCN-NEXT: v_writelane_b32 v1, s15, 7 -; GCN-NEXT: v_writelane_b32 v1, s16, 8 -; GCN-NEXT: v_writelane_b32 v1, s17, 9 -; 
GCN-NEXT: v_writelane_b32 v1, s18, 10 -; GCN-NEXT: v_writelane_b32 v1, s19, 11 -; GCN-NEXT: v_writelane_b32 v1, s20, 12 -; GCN-NEXT: v_writelane_b32 v1, s21, 13 -; GCN-NEXT: v_writelane_b32 v1, s22, 14 -; GCN-NEXT: v_writelane_b32 v1, s23, 15 +; GCN-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v22, s8, 0 +; GCN-NEXT: v_writelane_b32 v22, s9, 1 +; GCN-NEXT: v_writelane_b32 v22, s10, 2 +; GCN-NEXT: v_writelane_b32 v22, s11, 3 +; GCN-NEXT: v_writelane_b32 v22, s12, 4 +; GCN-NEXT: v_writelane_b32 v22, s13, 5 +; GCN-NEXT: v_writelane_b32 v22, s14, 6 +; GCN-NEXT: v_writelane_b32 v22, s15, 7 +; GCN-NEXT: v_writelane_b32 v22, s16, 8 +; GCN-NEXT: v_writelane_b32 v22, s17, 9 +; GCN-NEXT: v_writelane_b32 v22, s18, 10 +; GCN-NEXT: v_writelane_b32 v22, s19, 11 +; GCN-NEXT: v_writelane_b32 v22, s20, 12 +; GCN-NEXT: v_writelane_b32 v22, s21, 13 +; GCN-NEXT: v_writelane_b32 v22, s22, 14 +; GCN-NEXT: v_writelane_b32 v22, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 16 -; GCN-NEXT: v_writelane_b32 v1, s9, 17 -; GCN-NEXT: v_writelane_b32 v1, s10, 18 -; GCN-NEXT: v_writelane_b32 v1, s11, 19 -; GCN-NEXT: v_writelane_b32 v1, s12, 20 -; GCN-NEXT: v_writelane_b32 v1, s13, 21 -; GCN-NEXT: v_writelane_b32 v1, s14, 22 -; GCN-NEXT: v_writelane_b32 v1, s15, 23 -; GCN-NEXT: v_writelane_b32 v1, s16, 24 -; GCN-NEXT: v_writelane_b32 v1, s17, 25 -; GCN-NEXT: v_writelane_b32 v1, s18, 26 -; GCN-NEXT: v_writelane_b32 v1, s19, 27 -; GCN-NEXT: v_writelane_b32 v1, s20, 28 -; GCN-NEXT: v_writelane_b32 v1, s21, 29 -; GCN-NEXT: v_writelane_b32 v1, s22, 30 -; GCN-NEXT: v_writelane_b32 v1, s23, 31 +; GCN-NEXT: v_writelane_b32 v22, s8, 16 +; GCN-NEXT: v_writelane_b32 v22, s9, 17 +; GCN-NEXT: v_writelane_b32 v22, s10, 18 +; GCN-NEXT: v_writelane_b32 v22, s11, 19 +; GCN-NEXT: v_writelane_b32 v22, s12, 20 +; GCN-NEXT: v_writelane_b32 v22, s13, 21 +; GCN-NEXT: v_writelane_b32 v22, s14, 22 +; GCN-NEXT: 
v_writelane_b32 v22, s15, 23 +; GCN-NEXT: v_writelane_b32 v22, s16, 24 +; GCN-NEXT: v_writelane_b32 v22, s17, 25 +; GCN-NEXT: v_writelane_b32 v22, s18, 26 +; GCN-NEXT: v_writelane_b32 v22, s19, 27 +; GCN-NEXT: v_writelane_b32 v22, s20, 28 +; GCN-NEXT: v_writelane_b32 v22, s21, 29 +; GCN-NEXT: v_writelane_b32 v22, s22, 30 +; GCN-NEXT: v_writelane_b32 v22, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 32 -; GCN-NEXT: v_writelane_b32 v1, s9, 33 -; GCN-NEXT: v_writelane_b32 v1, s10, 34 -; GCN-NEXT: v_writelane_b32 v1, s11, 35 -; GCN-NEXT: v_writelane_b32 v1, s12, 36 -; GCN-NEXT: v_writelane_b32 v1, s13, 37 -; GCN-NEXT: v_writelane_b32 v1, s14, 38 -; GCN-NEXT: v_writelane_b32 v1, s15, 39 -; GCN-NEXT: v_writelane_b32 v1, s16, 40 -; GCN-NEXT: v_writelane_b32 v1, s17, 41 -; GCN-NEXT: v_writelane_b32 v1, s18, 42 -; GCN-NEXT: v_writelane_b32 v1, s19, 43 -; GCN-NEXT: v_writelane_b32 v1, s20, 44 -; GCN-NEXT: v_writelane_b32 v1, s21, 45 -; GCN-NEXT: v_writelane_b32 v1, s22, 46 -; GCN-NEXT: v_writelane_b32 v1, s23, 47 +; GCN-NEXT: v_writelane_b32 v22, s8, 32 +; GCN-NEXT: v_writelane_b32 v22, s9, 33 +; GCN-NEXT: v_writelane_b32 v22, s10, 34 +; GCN-NEXT: v_writelane_b32 v22, s11, 35 +; GCN-NEXT: v_writelane_b32 v22, s12, 36 +; GCN-NEXT: v_writelane_b32 v22, s13, 37 +; GCN-NEXT: v_writelane_b32 v22, s14, 38 +; GCN-NEXT: v_writelane_b32 v22, s15, 39 +; GCN-NEXT: v_writelane_b32 v22, s16, 40 +; GCN-NEXT: v_writelane_b32 v22, s17, 41 +; GCN-NEXT: v_writelane_b32 v22, s18, 42 +; GCN-NEXT: v_writelane_b32 v22, s19, 43 +; GCN-NEXT: v_writelane_b32 v22, s20, 44 +; GCN-NEXT: v_writelane_b32 v22, s21, 45 +; GCN-NEXT: v_writelane_b32 v22, s22, 46 +; GCN-NEXT: v_writelane_b32 v22, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 48 -; GCN-NEXT: v_writelane_b32 v1, s9, 49 -; GCN-NEXT: v_writelane_b32 v1, s10, 50 -; GCN-NEXT: v_writelane_b32 v1, s11, 51 -; 
GCN-NEXT: v_writelane_b32 v1, s12, 52 -; GCN-NEXT: v_writelane_b32 v1, s13, 53 -; GCN-NEXT: v_writelane_b32 v1, s14, 54 -; GCN-NEXT: v_writelane_b32 v1, s15, 55 -; GCN-NEXT: v_writelane_b32 v1, s16, 56 -; GCN-NEXT: v_writelane_b32 v1, s17, 57 -; GCN-NEXT: v_writelane_b32 v1, s18, 58 -; GCN-NEXT: v_writelane_b32 v1, s19, 59 -; GCN-NEXT: v_writelane_b32 v1, s20, 60 -; GCN-NEXT: v_writelane_b32 v1, s21, 61 -; GCN-NEXT: v_writelane_b32 v1, s22, 62 -; GCN-NEXT: v_writelane_b32 v1, s23, 63 +; GCN-NEXT: v_writelane_b32 v22, s8, 48 +; GCN-NEXT: v_writelane_b32 v22, s9, 49 +; GCN-NEXT: v_writelane_b32 v22, s10, 50 +; GCN-NEXT: v_writelane_b32 v22, s11, 51 +; GCN-NEXT: v_writelane_b32 v22, s12, 52 +; GCN-NEXT: v_writelane_b32 v22, s13, 53 +; GCN-NEXT: v_writelane_b32 v22, s14, 54 +; GCN-NEXT: v_writelane_b32 v22, s15, 55 +; GCN-NEXT: v_writelane_b32 v22, s16, 56 +; GCN-NEXT: v_writelane_b32 v22, s17, 57 +; GCN-NEXT: v_writelane_b32 v22, s18, 58 +; GCN-NEXT: v_writelane_b32 v22, s19, 59 +; GCN-NEXT: v_writelane_b32 v22, s20, 60 +; GCN-NEXT: v_writelane_b32 v22, s21, 61 +; GCN-NEXT: v_writelane_b32 v22, s22, 62 +; GCN-NEXT: v_writelane_b32 v22, s23, 63 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v22, s6, 0 +; GCN-NEXT: v_writelane_b32 v22, s7, 1 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) @@ -123,88 +113,88 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: v_readlane_b32 s6, v1, 2 -; GCN-NEXT: v_readlane_b32 s7, v1, 3 -; GCN-NEXT: v_readlane_b32 s8, v1, 4 -; GCN-NEXT: v_readlane_b32 s9, v1, 5 -; GCN-NEXT: v_readlane_b32 s10, v1, 6 -; GCN-NEXT: v_readlane_b32 s11, v1, 7 -; GCN-NEXT: v_readlane_b32 s12, v1, 8 -; GCN-NEXT: v_readlane_b32 s13, v1, 9 -; GCN-NEXT: v_readlane_b32 s14, v1, 10 -; GCN-NEXT: v_readlane_b32 s15, v1, 11 -; GCN-NEXT: v_readlane_b32 s16, v1, 12 -; GCN-NEXT: v_readlane_b32 s17, v1, 13 -; GCN-NEXT: v_readlane_b32 s18, v1, 14 -; GCN-NEXT: v_readlane_b32 s19, v1, 15 +; GCN-NEXT: v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use 
s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 16 -; GCN-NEXT: v_readlane_b32 s5, v1, 17 -; GCN-NEXT: v_readlane_b32 s6, v1, 18 -; GCN-NEXT: v_readlane_b32 s7, v1, 19 -; GCN-NEXT: v_readlane_b32 s8, v1, 20 -; GCN-NEXT: v_readlane_b32 s9, v1, 21 -; GCN-NEXT: v_readlane_b32 s10, v1, 22 -; GCN-NEXT: v_readlane_b32 s11, v1, 23 -; GCN-NEXT: v_readlane_b32 s12, v1, 24 -; GCN-NEXT: v_readlane_b32 s13, v1, 25 -; GCN-NEXT: v_readlane_b32 s14, v1, 26 -; GCN-NEXT: v_readlane_b32 s15, v1, 27 -; GCN-NEXT: v_readlane_b32 s16, v1, 28 -; GCN-NEXT: v_readlane_b32 s17, v1, 29 -; GCN-NEXT: v_readlane_b32 s18, v1, 30 -; GCN-NEXT: v_readlane_b32 s19, v1, 31 +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 32 -; GCN-NEXT: v_readlane_b32 s5, v1, 33 -; GCN-NEXT: v_readlane_b32 s6, v1, 34 -; GCN-NEXT: v_readlane_b32 s7, v1, 35 -; GCN-NEXT: v_readlane_b32 s8, v1, 36 -; GCN-NEXT: v_readlane_b32 s9, v1, 37 -; GCN-NEXT: v_readlane_b32 s10, v1, 38 -; GCN-NEXT: v_readlane_b32 s11, v1, 39 -; GCN-NEXT: v_readlane_b32 s12, v1, 40 -; GCN-NEXT: v_readlane_b32 s13, v1, 41 -; GCN-NEXT: v_readlane_b32 s14, v1, 42 -; GCN-NEXT: v_readlane_b32 s15, v1, 43 -; GCN-NEXT: v_readlane_b32 s16, v1, 44 -; GCN-NEXT: v_readlane_b32 s17, v1, 45 -; GCN-NEXT: v_readlane_b32 s18, v1, 46 -; GCN-NEXT: 
v_readlane_b32 s19, v1, 47 +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v1, 48 -; GCN-NEXT: v_readlane_b32 s9, v1, 49 -; GCN-NEXT: v_readlane_b32 s10, v1, 50 -; GCN-NEXT: v_readlane_b32 s11, v1, 51 -; GCN-NEXT: v_readlane_b32 s12, v1, 52 -; GCN-NEXT: v_readlane_b32 s13, v1, 53 -; GCN-NEXT: v_readlane_b32 s14, v1, 54 -; GCN-NEXT: v_readlane_b32 s15, v1, 55 -; GCN-NEXT: v_readlane_b32 s16, v1, 56 -; GCN-NEXT: v_readlane_b32 s17, v1, 57 -; GCN-NEXT: v_readlane_b32 s18, v1, 58 -; GCN-NEXT: v_readlane_b32 s19, v1, 59 -; GCN-NEXT: v_readlane_b32 s20, v1, 60 -; GCN-NEXT: v_readlane_b32 s21, v1, 61 -; GCN-NEXT: v_readlane_b32 s22, v1, 62 -; GCN-NEXT: v_readlane_b32 s23, v1, 63 +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 
+; GCN-NEXT: v_readlane_b32 s23, v23, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: v_readlane_b32 s4, v22, 0 +; GCN-NEXT: v_readlane_b32 s5, v22, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND @@ -212,20 +202,11 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: ; use s[4:5] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 call void asm sideeffect "", "~{v[20:21]}"() #0 - call void asm sideeffect "", "~{v22}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index b0fb24e60bead..bb0a707a7c90b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,181 +28,180 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, 
$sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR 
killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc - ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr3 - ; GCN-NEXT: $vgpr3 = 
SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr3 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR 
$sgpr84, 16, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr4 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr5 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr5 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr5 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr2 + ; 
GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3 + ; GCN-NEXT: $vgpr3 = 
SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 + ; GCN-NEXT: $vgpr5 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5 ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: 
liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR killed $vgpr5, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 - ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2 - ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 1 - ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0 - ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 31 - ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 30 - ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 29 - ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 28 - ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 27 - ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 26 - ; GCN-NEXT: 
$sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 25 - ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 24 - ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 23 - ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 22 - ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 21 - ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 20 - ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 19 - ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 18 - ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 17 - ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 16 - ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 15 - ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 14 - ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 13 - ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 12 - ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 11 - ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 10 - ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 9 - ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 8 - ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 7 - ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 6 - ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 5 - ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 4 - ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 - ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2 - ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 - ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 - ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 - ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 - ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 - ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 - ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 - ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 - ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25 - ; GCN-NEXT: $sgpr28 = 
SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 - ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 - ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 - ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 - ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 - ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 - ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 - ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 - ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 - ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 - ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 - ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 - ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 - ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 - ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 - ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 - ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 - ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 - ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 - ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 - ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 - ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 - ; GCN-NEXT: KILL killed renamable $vgpr2 + ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 + ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2 + ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 + ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 + ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 + ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 + ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 + ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 + ; GCN-NEXT: 
$sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 + ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 + ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25 + ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 + ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 + ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 + ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 + ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 + ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 + ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 + ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 + ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 + ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 + ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 + ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 + ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 + ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 + ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 + ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 + ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 + ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 + ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 + ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 + ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 + ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 + ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 + ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 + ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31 + ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30 + ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29 + ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28 + ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27 + ; GCN-NEXT: $sgpr30 = 
SI_RESTORE_S32_FROM_VGPR $vgpr2, 26 + ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25 + ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24 + ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23 + ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 22 + ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 21 + ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 20 + ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 19 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 18 + ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 17 + ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 16 + ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 15 + ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 14 + ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 13 + ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 12 + ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 11 + ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 10 + ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 9 + ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 8 + ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 7 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 6 + ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 5 + ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 4 + ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3 + ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 + ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) - ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) - ; GCN-NEXT: $vgpr5 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) - ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) + ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) + ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) + ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir index 2c4b7a22facf4..59c4b715dd12e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir @@ -218,7 +218,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr32_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 
$exec @@ -235,7 +235,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -271,7 +271,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr32_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec @@ -288,7 +288,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -327,7 +327,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr64_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8_sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec @@ -345,7 +345,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8_sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -381,7 +381,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr64_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: 
%bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec @@ -399,7 +399,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8_sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -441,7 +441,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr32_save_clobber_scc_emergency_stack_slot_x2 ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8, $sgpr9 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec @@ -468,7 +468,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8, $sgpr9 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -505,7 +505,7 @@ body: | ; VMEM-GFX8-LABEL: name: sgpr32_restore_clobber_scc_emergency_stack_slot_x2 ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec @@ -532,7 +532,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $sgpr8, $sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8, $sgpr9 ; 
VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir index 0c6c0069911f0..bed7c0c12b7cb 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir @@ -18,11 +18,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 - ; CHECK-NEXT: KILL [[DEF]] ; CHECK-NEXT: S_ENDPGM 0 bb.0: liveins: $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index b2f5b6aa7fe36..ff2202f1e177b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -19,7 +19,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -135,13 +135,13 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v255, s30, 0 ; GCN-NEXT: v_writelane_b32 v255, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 
4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -266,7 +266,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 @@ -313,7 +313,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -428,13 +428,13 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v254, s30, 0 ; GCN-NEXT: v_writelane_b32 v254, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 
offset:444 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -558,7 +558,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 @@ -602,8 +602,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-LABEL: spill_sgpr_with_sgpr_uses: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill @@ -715,38 +715,30 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: 
buffer_store_dword v253, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: ; implicit-def: $vgpr254 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v254, s4, 0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s4, v254, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -857,8 +849,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1183,7 +1175,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1315,16 +1307,16 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: flat_load_dwordx4 v[6:9], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -1446,7 +1438,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir index 887e9c4b5dc5e..0e6d9ce4a7f31 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir @@ -25,9 +25,9 @@ body: | ; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 ; 
SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, [[DEF]], implicit $sgpr0_sgpr1 @@ -91,9 +91,9 @@ body: | ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 ; SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3 ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, [[DEF]], implicit $sgpr2_sgpr3 @@ -155,9 +155,9 @@ body: | ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 ; SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR 
$sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll deleted file mode 100644 index c5a5a5209f54f..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s - -; Make sure this doesn't crash. -; ALL-LABEL: {{^}}test: -; ALL: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 -; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 - -; Make sure we are handling hazards correctly. -; SGPR: v_mov_b32_e32 v0, vcc_lo -; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; 4-byte Folded Reload -; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] -; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 -; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: s_or_saveexec_b64 s[100:101], -1 -; SGPR-NEXT: s_mov_b64 exec, s[100:101] -; SGPR-NEXT: s_nop 2 -; SGPR-NEXT: buffer_store_dword v0, off, s[{{[0-9]+}}:[[HI]]], 0 -; SGPR-NEXT: ; kill: killed $vgpr1 - -; ALL: s_endpgm -define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { - call void asm sideeffect "", "~{s[0:7]}" () - call void asm sideeffect "", "~{s[8:15]}" () - call void asm sideeffect "", "~{s[16:23]}" () - call void asm sideeffect "", "~{s[24:31]}" () - call void asm sideeffect "", "~{s[32:39]}" () - call void asm sideeffect "", "~{s[40:47]}" () - call void asm sideeffect "", "~{s[48:55]}" () - call void asm sideeffect "", "~{s[56:63]}" () - call void asm sideeffect "", "~{s[64:71]}" () - 
call void asm sideeffect "", "~{s[72:79]}" () - call void asm sideeffect "", "~{s[80:87]}" () - call void asm sideeffect "", "~{s[88:95]}" () - call void asm sideeffect "", "~{v[0:7]}" () - call void asm sideeffect "", "~{v[8:15]}" () - call void asm sideeffect "", "~{v[16:23]}" () - call void asm sideeffect "", "~{v[24:31]}" () - call void asm sideeffect "", "~{v[32:39]}" () - call void asm sideeffect "", "~{v[40:47]}" () - call void asm sideeffect "", "~{v[48:55]}" () - call void asm sideeffect "", "~{v[56:63]}" () - call void asm sideeffect "", "~{v[64:71]}" () - call void asm sideeffect "", "~{v[72:79]}" () - call void asm sideeffect "", "~{v[80:87]}" () - call void asm sideeffect "", "~{v[88:95]}" () - call void asm sideeffect "", "~{v[96:103]}" () - call void asm sideeffect "", "~{v[104:111]}" () - call void asm sideeffect "", "~{v[112:119]}" () - call void asm sideeffect "", "~{v[120:127]}" () - call void asm sideeffect "", "~{v[128:135]}" () - call void asm sideeffect "", "~{v[136:143]}" () - call void asm sideeffect "", "~{v[144:151]}" () - call void asm sideeffect "", "~{v[152:159]}" () - call void asm sideeffect "", "~{v[160:167]}" () - call void asm sideeffect "", "~{v[168:175]}" () - call void asm sideeffect "", "~{v[176:183]}" () - call void asm sideeffect "", "~{v[184:191]}" () - call void asm sideeffect "", "~{v[192:199]}" () - call void asm sideeffect "", "~{v[200:207]}" () - call void asm sideeffect "", "~{v[208:215]}" () - call void asm sideeffect "", "~{v[216:223]}" () - call void asm sideeffect "", "~{v[224:231]}" () - call void asm sideeffect "", "~{v[232:239]}" () - call void asm sideeffect "", "~{v[240:247]}" () - call void asm sideeffect "", "~{v[248:255]}" () - - store i32 %in, ptr addrspace(1) %out - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index f8ec6bb5d943f..080bd052a7391 100644 --- 
a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -35,7 +35,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF - ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF @@ -79,9 +79,9 @@ body: | ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: dead renamable $vgpr0 = COPY killed renamable $sgpr49 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr49 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable 
$sgpr36_sgpr37 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index 34bcc3f02ac66..03988c3994992 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,23 +2,20 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 +; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 +; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index e54e5898f8b53..40be0c6b67ee9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -22,14 +22,11 @@ body: | ; CHECK-NEXT: undef 
[[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec ; CHECK-NEXT: dead [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 0, [[V_MOV_B32_e32_]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[V_MAC_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef [[V_MAC_F32_e32_]].sub1, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V64_SAVE [[V_MAC_F32_e32_]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub1 - ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit undef %9.sub0:vreg_64 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MAC_F32_e32_]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit undef [[V_MAC_F32_e32_]].sub0 bb.0: successors: %bb.1 @@ -59,13 +56,13 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub2 - ; CHECK-NEXT: S_NOP 0, implicit undef %4.sub0:vreg_128 - ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub2 + ; CHECK-NEXT: S_NOP 0, implicit undef 
[[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub2 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir index 05e1082de4478..f4edafd9443ab 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -25,14 +25,13 @@ body: | ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 + ; GCN-NEXT: dead $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 
implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -64,13 +63,12 @@ body: | ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 + ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 + ; GCN-NEXT: dead $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index 11babc82e919b..dff2bd7f7aef9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ 
b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -21,8 +21,8 @@ body: | ; GCN-LABEL: name: sgpr32_spill ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 @@ -55,7 +55,6 @@ body: | ; GCN-LABEL: name: sgpr_spill_lane_crossover ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr63 @@ -89,6 +88,7 @@ body: | ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr65, 2, [[DEF]] @@ -187,9 +187,9 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: @@ -256,7 +256,6 @@ body: | ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} @@ -264,7 +263,7 @@ body: | ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0 ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} @@ -272,7 +271,7 @@ body: | ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0 ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} @@ -281,6 +280,7 @@ body: | ; 
GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir index 2caaabde38e9d..9b0f52cb39b01 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir @@ -19,12 +19,8 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $vgpr0 ; GCN-LABEL: name: shift_back_exec_copy_reserved_reg - ; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr30_sgpr31, $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0 ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec @@ -60,12 +56,10 @@ body: | bb.0: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0 ; GCN-LABEL: name: spill_exec_copy_reserved_reg - ; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, 
$sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr2, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr28_sgpr29 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr28_sgpr29 ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr35, 1, undef $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b4a981f1db4ec..882356d994fc6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,12 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 
[[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0 - ; GCN-NEXT: SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index 81dd2c4457b2f..4384d1e32cf53 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -11,9 +11,8 @@ define void @test() { ; CHECK: ; %bb.0: ; %bb.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: .LBB0_1: ; %bb.1 ; CHECK-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 @@ -21,42 +20,40 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: s_cmp_eq_u32 s6, s7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v1, s4, 0 +; CHECK-NEXT: v_writelane_b32 v1, s5, 1 ; CHECK-NEXT: s_mov_b64 s[10:11], exec ; CHECK-NEXT: s_mov_b64 exec, -1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse ; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse ; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: v_writelane_b32 v1, s4, 0 +; CHECK-NEXT: v_writelane_b32 v1, s5, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse ; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse ; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 +; CHECK-NEXT: v_readlane_b32 s4, v1, 0 +; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 1 ; CHECK-NEXT: ; implicit-def: $sgpr5 @@ -64,12 +61,8 @@ define void @test() { ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.6: ; %bb.5 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir index 5040140a3e935..be2a31d7ccbaa 100644 --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -37,8 +37,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir 
b/llvm/test/CodeGen/AMDGPU/spill224.mir index 5e53f93df95f7..f4965dcf61e17 100644 --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -33,8 +33,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir index 3d5404a9c1ad5..312531ba5bc99 100644 --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -33,8 +33,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir index 4473a4d6648ef..0c0c01760f8ba 100644 --- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ 
b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -33,8 +33,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir index 8fa053a908b60..8823ba1a8326e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -33,8 +33,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir index cd604e4483b9c..e33641cf89237 100644 --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir 
@@ -33,8 +33,8 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 3c16cd29de8f6..6b0fbc44c65b7 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -236,20 +236,15 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0: ; %bb.0: ; %bb0 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; WAVE32-O0-NEXT: v_mov_b32_e32 v1, v0 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 -; WAVE32-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v1, 1 +; WAVE32-O0-NEXT: v_and_b32_e64 v0, 1, v0 +; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 ; WAVE32-O0-NEXT: s_mov_b32 s4, exec_lo -; 
WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_writelane_b32 v0, s4, 0 +; WAVE32-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; WAVE32-O0-NEXT: v_writelane_b32 v1, s4, 0 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 -; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: s_and_b32 s4, s4, s5 ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 @@ -262,14 +257,13 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: .LBB4_2: ; %bb2 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 +; WAVE32-O0-NEXT: v_readlane_b32 s4, v1, 0 ; WAVE32-O0-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] @@ -278,21 +272,16 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0: ; %bb.0: ; %bb0 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; WAVE64-O0-NEXT: v_mov_b32_e32 v1, v0 -; 
WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] -; WAVE64-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, 1 +; WAVE64-O0-NEXT: v_and_b32_e64 v0, 1, v0 +; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, 1 ; WAVE64-O0-NEXT: s_mov_b64 s[4:5], exec -; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; WAVE64-O0-NEXT: v_writelane_b32 v0, s5, 1 +; WAVE64-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; WAVE64-O0-NEXT: v_writelane_b32 v1, s4, 0 +; WAVE64-O0-NEXT: v_writelane_b32 v1, s5, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -305,15 +294,14 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: .LBB4_2: ; %bb2 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_readlane_b32 s4, v0, 0 -; WAVE64-O0-NEXT: v_readlane_b32 s5, v0, 1 +; WAVE64-O0-NEXT: v_readlane_b32 s4, v1, 0 +; WAVE64-O0-NEXT: v_readlane_b32 s5, v1, 1 ; WAVE64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; 
WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] @@ -324,10 +312,10 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_and_b32_e64 v0, 1, v0 ; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, exec_lo +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s4, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s5 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 @@ -341,7 +329,6 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr1 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 @@ -941,7 +928,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 ; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 ; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 ; WAVE32-O0-NEXT: s_mov_b32 s12, s6 @@ -949,12 +935,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE32-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-O0-NEXT: s_mov_b32 s0, s32 -; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 +; WAVE32-O0-NEXT: ; implicit-def: $vgpr32 : 
SGPR spill to VGPR lane +; WAVE32-O0-NEXT: v_writelane_b32 v32, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 -; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 +; WAVE32-O0-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 ; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1037,17 +1021,12 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 -; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 -; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0 +; WAVE32-O0-NEXT: v_readlane_b32 s1, v32, 1 +; WAVE32-O0-NEXT: v_readlane_b32 s0, v32, 0 ; WAVE32-O0-NEXT: ;;#ASMSTART ; WAVE32-O0-NEXT: ; use s1 ; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_mov_b32 s32, s0 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_endpgm ; ; WAVE64-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: @@ -1059,7 +1038,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-O0-NEXT: s_add_u32 s24, s24, s9 ; WAVE64-O0-NEXT: s_addc_u32 s25, s25, 0 -; WAVE64-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: s_mov_b32 s14, s8 ; WAVE64-O0-NEXT: s_mov_b32 s13, s7 ; WAVE64-O0-NEXT: s_mov_b32 s12, s6 @@ -1067,12 +1045,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE64-O0-NEXT: s_mov_b64 
s[4:5], s[0:1] ; WAVE64-O0-NEXT: s_mov_b32 s0, s32 -; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 0 +; WAVE64-O0-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; WAVE64-O0-NEXT: v_writelane_b32 v32, s0, 0 ; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6 -; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill -; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] +; WAVE64-O0-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42 ; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1155,17 +1131,12 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload -; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] -; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_readlane_b32 s1, v0, 1 -; WAVE64-O0-NEXT: v_readlane_b32 s0, v0, 0 +; WAVE64-O0-NEXT: v_readlane_b32 s1, v32, 1 +; WAVE64-O0-NEXT: v_readlane_b32 s0, v32, 0 ; WAVE64-O0-NEXT: ;;#ASMSTART ; WAVE64-O0-NEXT: ; use s1 ; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_mov_b32 s32, s0 -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_endpgm ; ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: @@ -1178,7 +1149,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 ; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 ; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: 
s_mov_b32 s13, s7 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s12, s6 @@ -1186,6 +1156,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s0, s32 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 @@ -1277,7 +1248,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: ; use s1 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_endpgm %alloca = alloca [32 x i32], addrspace(5) %stacksave = call ptr addrspace(5) @llvm.stacksave.p5() @@ -1362,23 +1332,20 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s25, s33 +; WAVE32-O0-NEXT: s_mov_b32 s24, s33 ; WAVE32-O0-NEXT: s_mov_b32 s33, s32 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE32-O0-NEXT: v_writelane_b32 v32, s31, 1 ; WAVE32-O0-NEXT: s_mov_b32 s16, s32 -; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 +; WAVE32-O0-NEXT: ; 
implicit-def: $vgpr33 : SGPR spill to VGPR lane +; WAVE32-O0-NEXT: v_writelane_b32 v33, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 -; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 -; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 +; WAVE32-O0-NEXT: v_writelane_b32 v33, s16, 1 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1456,25 +1423,20 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 -; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1 -; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 +; WAVE32-O0-NEXT: v_readlane_b32 s5, v33, 1 +; WAVE32-O0-NEXT: v_readlane_b32 s4, v33, 0 ; WAVE32-O0-NEXT: ;;#ASMSTART ; WAVE32-O0-NEXT: ; use s5 ; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-O0-NEXT: s_mov_b32 s33, s25 +; WAVE32-O0-NEXT: s_mov_b32 s33, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -1485,19 
+1447,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s33, s32 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE64-O0-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[16:17] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0x2400 -; WAVE64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE64-O0-NEXT: v_writelane_b32 v32, s31, 1 ; WAVE64-O0-NEXT: s_mov_b32 s16, s32 -; WAVE64-O0-NEXT: v_writelane_b32 v0, s16, 0 +; WAVE64-O0-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; WAVE64-O0-NEXT: v_writelane_b32 v33, s16, 0 ; WAVE64-O0-NEXT: s_lshr_b32 s16, s16, 6 -; WAVE64-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE64-O0-NEXT: s_mov_b64 exec, s[26:27] +; WAVE64-O0-NEXT: v_writelane_b32 v33, s16, 1 ; WAVE64-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1575,22 +1534,17 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE64-O0-NEXT: s_mov_b64 exec, s[26:27] -; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_readlane_b32 s5, v0, 1 -; WAVE64-O0-NEXT: v_readlane_b32 s4, v0, 0 +; WAVE64-O0-NEXT: v_readlane_b32 s5, v33, 1 +; WAVE64-O0-NEXT: v_readlane_b32 s4, v33, 0 ; WAVE64-O0-NEXT: 
;;#ASMSTART ; WAVE64-O0-NEXT: ; use s5 ; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 ; WAVE64-O0-NEXT: s_mov_b32 s33, s19 @@ -1603,14 +1557,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s30, 0 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s31, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 1 @@ -1699,10 +1653,9 @@ define void 
@func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir index 2c4a5dba3520c..cc261b0da4a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir +++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir @@ -17,10 +17,10 @@ body: | bb.0: liveins: $sgpr20, $vgpr1 ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg - ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1 + ; GCN: liveins: $sgpr20, $vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: $vgpr0 = 
SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 @@ -50,11 +50,11 @@ body: | bb.0: liveins: $sgpr20, $sgpr21, $vgpr1 ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg - ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2 + ; GCN: liveins: $sgpr20, $sgpr21, $vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF @@ -89,10 +89,10 @@ body: | bb.0: liveins: $sgpr20, $vgpr1 ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg - ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2 + ; GCN: liveins: $sgpr20, $vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr2 @@ -121,17 +121,17 @@ body: | bb.0: liveins: 
$sgpr20, $vgpr1 ; GCN-LABEL: name: wwm_csr_spill_reload - ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40 + ; GCN: liveins: $sgpr20, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr40 - ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr2 + ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0, implicit $exec ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec - ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: SI_RETURN implicit $vgpr0 $vgpr40 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 52370f6a2ef05..4dfd4c095c87a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ 
b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -206,14 +206,14 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX1100-O0-LABEL: non_entry_trap: ; HSA-TRAP-GFX1100-O0: ; %bb.0: ; %entry -; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s2, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s3, 1 +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s3, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 -; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc @@ -236,16 +236,15 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB1_3 ; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_2: ; %ret ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 -; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 -; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v2, 3 -; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v2, 0 +; 
HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v2, 1 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 3 +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm ; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 @@ -352,34 +351,32 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100-O0: ; %bb.0: -; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s3, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 -; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 ; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:4 ; 4-byte Folded Spill ; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2 ; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: +; 
HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:8 ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:4 ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 -; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload -; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 -; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 -; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v2, 1 +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm ; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: ; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 diff --git a/llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir b/llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir index c73b023f18652..4c2d0d2fa0d77 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -start-before=greedy,0 -stop-after=virtregrewriter,2 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s --- | define amdgpu_ps void @e32() #0 { diff --git 
a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 15a83475f368e..a827ebe96cfcf 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -13,41 +13,37 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_add_u32 s0, s0, s13 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; CHECK-NEXT: v_mov_b32_e32 v2, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_write_b8 v1, v2 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ds_write_b8 v0, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v3, s4, 0 +; CHECK-NEXT: v_writelane_b32 v3, s5, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] 
; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb193 ; CHECK-NEXT: .LBB0_2: ; %bb194 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v1, 0 -; CHECK-NEXT: v_readlane_b32 s5, v1, 1 +; CHECK-NEXT: v_readlane_b32 s4, v3, 0 +; CHECK-NEXT: v_readlane_b32 s5, v3, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, 0xffff -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_e64 v0, s4, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 @@ -66,10 +62,6 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: %i10 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir index dd3572c027c86..e5caa509835c3 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir @@ -413,7 +413,7 @@ body: | ; MUBUF-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; 
MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -424,7 +424,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -434,7 +434,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -446,7 +446,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -456,7 +456,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) 
@@ -467,7 +467,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -477,7 +477,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -488,7 +488,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -525,7 +525,7 @@ body: | ; MUBUF-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -537,7 +537,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; 
MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -547,7 +547,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -559,7 +559,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -569,7 +569,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -580,7 +580,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -590,7 +590,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: 
%bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -602,7 +602,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -639,7 +639,7 @@ body: | ; MUBUF-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -652,7 +652,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -662,7 +662,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -674,7 +674,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ 
$}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -684,7 +684,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -695,7 +695,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -705,7 +705,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -718,7 +718,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; 
VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -755,7 +755,7 @@ body: | ; MUBUF-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -766,7 +766,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -776,7 +776,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -788,7 +788,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -798,7 +798,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ 
$}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -809,7 +809,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -819,7 +819,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -830,7 +830,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -867,7 +867,7 @@ body: | ; MUBUF-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -879,7 
+879,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -889,7 +889,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -901,7 +901,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -911,7 +911,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -922,7 +922,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ 
$}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -932,7 +932,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -944,7 +944,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -981,7 +981,7 @@ body: | ; MUBUF-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -994,7 +994,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -1004,7 +1004,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -1016,7 +1016,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: 
successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -1026,7 +1026,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -1037,7 +1037,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -1047,7 +1047,7 @@ body: | ; VMEM-GFX8-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -1060,7 +1060,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: 
%bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} @@ -1200,7 +1200,7 @@ body: | ; MUBUF-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -1211,7 +1211,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; MUBUF-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} @@ -1221,7 +1221,7 @@ body: | ; GFX9-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -1235,7 +1235,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.1: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX9-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX9-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_NOP 0 ; GFX9-FLATSCR-NEXT: {{ $}} @@ -1245,7 +1245,7 @@ body: | ; GFX10-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; 
GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) @@ -1259,7 +1259,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.1: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; GFX10-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-FLATSCR-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_NOP 0 ; GFX10-FLATSCR-NEXT: {{ $}} @@ -1269,7 +1269,7 @@ body: | ; VMEM-GFX8-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -1280,7 +1280,7 @@ body: | ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: bb.1: ; VMEM-GFX8-NEXT: successors: %bb.2(0x80000000) - ; VMEM-GFX8-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; VMEM-GFX8-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 ; VMEM-GFX8-NEXT: {{ $}} ; VMEM-GFX8-NEXT: S_NOP 0 ; VMEM-GFX8-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 
6659e95323769..fa0922590712a 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -30,7 +30,7 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: undef_identity_copy - ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128), addrspace 1) + ; CHECK: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128), addrspace 1) ; CHECK-NEXT: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY $sgpr95 @@ -39,13 +39,14 @@ body: | ; CHECK-NEXT: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @bar + 4, target-flags(amdgpu-rel32-hi) @bar + 4, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK-NEXT: $sgpr4 = COPY $sgpr95 - ; CHECK-NEXT: $vgpr0 = COPY renamable $vgpr40 - ; CHECK-NEXT: $vgpr1 = COPY renamable $vgpr41 - ; CHECK-NEXT: $vgpr2 = COPY killed renamable $vgpr42 - ; CHECK-NEXT: $vgpr3 = KILL undef renamable $vgpr3 + ; CHECK-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORDX4_]].sub0 + ; CHECK-NEXT: $vgpr1 = COPY [[FLAT_LOAD_DWORDX4_]].sub1 + ; CHECK-NEXT: $vgpr2 = COPY [[FLAT_LOAD_DWORDX4_]].sub2 + ; CHECK-NEXT: $vgpr3 = COPY undef %4:vgpr_32 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: 
ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 - ; CHECK-NEXT: FLAT_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD undef %6:vreg_64, [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128), addrspace 1) %2:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index 8c285f37b4878..d1ee82e74b3de 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -13,24 +13,24 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: s_mov_b32 s16, s33 ; GFX90A-NEXT: s_mov_b32 s33, s32 ; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, -1 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: v_writelane_b32 v40, s16, 4 ; GFX90A-NEXT: v_writelane_b32 v40, s28, 2 ; GFX90A-NEXT: v_writelane_b32 v40, s29, 3 -; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr39 : 
SGPR spill to VGPR lane ; GFX90A-NEXT: s_addk_i32 s32, 0x400 ; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s20 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s20, 0 +; GFX90A-NEXT: v_writelane_b32 v39, s20, 0 ; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v39 ; GFX90A-NEXT: s_mov_b64 exec, s[28:29] ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, foo@gotpcrel32@lo+4 @@ -39,23 +39,22 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v39, a32 ; GFX90A-NEXT: s_mov_b64 exec, s[28:29] -; GFX90A-NEXT: v_readlane_b32 s20, v0, 0 +; GFX90A-NEXT: v_readlane_b32 s20, v39, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s20 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 -; GFX90A-NEXT: ; kill: killed $vgpr0 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 ; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, -1 -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 ; GFX90A-NEXT: s_mov_b32 s33, s4 diff --git 
a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 5608ea8563548..4837efe6606b8 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -19,24 +19,23 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-NEXT: v_writelane_b32 v40, s28, 2 ; GCN-NEXT: v_writelane_b32 v40, s29, 3 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_addk_i32 s32, 0x800 +; GCN-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane +; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s16 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s16, 0 +; GCN-NEXT: v_writelane_b32 v39, s16, 0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 @@ -45,26 +44,24 @@ define void @test() #0 { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 
offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_readlane_b32 s4, v39, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -75,23 +72,23 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b32 s16, s33 ; GCN-O0-NEXT: s_mov_b32 s33, s32 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, -1 -; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] ; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2 ; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3 ; 
GCN-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-O0-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-O0-NEXT: ;;#ASMSTART ; GCN-O0-NEXT: ; def s16 ; GCN-O0-NEXT: ;;#ASMEND -; GCN-O0-NEXT: v_writelane_b32 v0, s16, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v39, s16, 0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] ; GCN-O0-NEXT: s_getpc_b64 s[16:17] ; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 @@ -104,26 +101,25 @@ define void @test() #0 { ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s4, v39, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 -; GCN-O0-NEXT: global_store_dword v[1:2], v3, off +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: global_store_dword v[0:1], v2, off ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, 
off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, -1 -; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GCN-O0-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll new file mode 100644 index 0000000000000..145f1e483cd99 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll @@ -0,0 +1,29 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -filetype=null %s 2>&1 | FileCheck %s + +; A negative test to capture the expected error when the VGPRs are insufficient for wwm-regalloc. + +; CHECK: error: can't find enough VGPRs for wwm-regalloc + +define amdgpu_kernel void @test(i32 %in) { +entry: + call void asm sideeffect "", "~{v[0:7]}" () + call void asm sideeffect "", "~{v[8:15]}" () + call void asm sideeffect "", "~{v[16:23]}" () + call void asm sideeffect "", "~{v[24:31]}" () + call void asm sideeffect "", "~{v[32:39]}" () + call void asm sideeffect "", "~{v[40:47]}" () + call void asm sideeffect "", "~{v[48:55]}" () + call void asm sideeffect "", "~{v[56:63]}" () + %val0 = call i32 asm sideeffect "; def $0", "=s" () + %val1 = call i32 asm sideeffect "; def $0", "=s" () + %val2 = call i32 asm sideeffect "; def $0", "=s" () + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret +bb0: + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + call void asm sideeffect "; use $0", "s"(i32 %val2) + br label %ret +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index c295a056eb9e7..025381d5c16df 100644 --- 
a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -134,15 +134,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -157,38 +152,38 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_mov_b32 s37, s44 ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 3 +; GFX9-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 3 ; GFX9-O0-NEXT: 
s_mov_b32 s34, 0 ; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[36:37] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -211,26 +206,26 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 4 +; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] -; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 -; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 3 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[36:37] +; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 0 +; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 3 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
s[36:37] ; GFX9-O0-NEXT: s_mov_b32 s36, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s36, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s36 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s36 ; GFX9-O0-NEXT: s_mov_b32 s40, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, s39 @@ -240,12 +235,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_mov_b32 s38, s35 ; GFX9-O0-NEXT: s_mov_b32 s39, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -353,9 +347,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 @@ -397,9 +391,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 @@ -412,9 +406,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: 
s_addk_i32 s32, 0x400 @@ -435,9 +429,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 @@ -539,28 +533,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s48, s33 +; GFX9-O0-NEXT: s_mov_b32 s46, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 
offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 @@ -578,10 +570,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s41, s45 ; GFX9-O0-NEXT: s_mov_b32 s42, s44 ; GFX9-O0-NEXT: s_mov_b32 
s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v1, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v1, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v1, s43, 3 +; GFX9-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v11, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v11, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v11, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v11, s43, 3 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 @@ -599,11 +592,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v1, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: v_writelane_b32 v11, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v11, s35, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -625,20 +615,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 -; GFX9-O0-NEXT: 
v_readlane_b32 s39, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s34, v11, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v11, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v11, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v11, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v11, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v11, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -647,30 +630,28 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s48 +; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -680,14 +661,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v6, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 @@ -718,13 +699,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded 
Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 @@ -924,7 +905,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -937,35 +918,35 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_writelane_b32 v47, s65, 1 ; GFX9-O0-NEXT: v_writelane_b32 v47, s66, 2 ; GFX9-O0-NEXT: v_writelane_b32 v47, s67, 3 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -975,36 +956,36 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18 ; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19 ; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20 ; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21 ; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 ; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 ; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 @@ -1013,56 +994,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35 -; GFX9-O0-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40 -; 
GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 @@ -1080,23 +1061,23 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 
; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1276,7 +1257,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index ee9174822a960..312628c7b5451 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -114,15 +114,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s19, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s16, s16, s4 ; GFX9-O0-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR 
spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 0 ; GFX9-O0-NEXT: s_mov_b32 s4, s1 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 0 +; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 0 ; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_mov_b32 s3, s1 ; GFX9-O0-NEXT: s_mov_b32 s8, s3 @@ -135,37 +130,37 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s5, s10 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s2, 1 +; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 +; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 ; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: v_cndmask_b32_e64 
v1, v1, v4, s[2:3] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 5 +; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -188,26 +183,26 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 5 +; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 6 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_readlane_b32 s2, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s3, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s0, v5, 3 +; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s4 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX9-O0-NEXT: s_mov_b32 s6, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; GFX9-O0-NEXT: s_mov_b32 s4, s3 @@ -217,8 +212,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s2, s5 ; GFX9-O0-NEXT: s_mov_b32 s3, s4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: cfg: @@ -310,38 +304,32 @@ define hidden i32 @called(i32 %a) noinline { define 
amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-LABEL: call: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v7, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v7, 1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c @@ -355,23 +343,19 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_mov_b32 s17, s7 ; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 ; GFX9-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v3, s6, 8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 -; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] +; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -387,35 +371,28 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; 
GFX9-O0-NEXT: s_mov_b32 s6, 10 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O0-NEXT: s_mov_b32 s6, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 -; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v7 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: call: @@ -559,37 +536,31 @@ define 
i64 @called_i64(i64 %a) noinline { define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { ; GFX9-O0-LABEL: call_i64: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v8, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v12, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v12, 1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, 
s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -604,11 +575,10 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s15, s7 ; GFX9-O0-NEXT: s_mov_b32 s8, s3 @@ -623,20 +593,17 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: s_mov_b32 s2, 32 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: 
s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -664,33 +631,25 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm 
; ; GFX9-O3-LABEL: call_i64: @@ -1007,15 +966,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s19, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s16, s16, s4 ; GFX9-O0-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 0 ; GFX9-O0-NEXT: s_mov_b32 s4, s1 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 0 +; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 0 ; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_mov_b32 s3, s1 ; GFX9-O0-NEXT: s_mov_b32 s8, s3 @@ -1028,37 +982,37 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s5, s10 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s2, 1 +; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 +; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 ; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 5 +; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -1081,26 +1035,26 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
.LBB8_2: ; %merge +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 5 +; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 6 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_readlane_b32 s2, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s3, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s0, v5, 3 +; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s4 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX9-O0-NEXT: s_mov_b32 s6, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; GFX9-O0-NEXT: s_mov_b32 s4, s3 @@ -1110,8 +1064,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s2, s5 ; GFX9-O0-NEXT: s_mov_b32 
s3, s4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_cfg: @@ -1203,38 +1156,32 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline { define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v7, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v7, 1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c @@ -1248,23 +1195,19 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_mov_b32 s17, s7 ; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 ; GFX9-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v3, s6, 8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 -; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] +; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; 
GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1280,35 +1223,28 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O0-NEXT: s_mov_b32 s6, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 -; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v7 ; 
GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_call: @@ -1452,37 +1388,31 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v8, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v12, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v12, 1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 ; GFX9-O0-NEXT: 
s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -1497,11 +1427,10 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s15, s7 ; GFX9-O0-NEXT: s_mov_b32 s8, s3 @@ -1516,20 +1445,17 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: 
$sgpr2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: s_mov_b32 s2, 32 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1557,33 +1483,25 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, 
s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_call_i64: diff --git a/llvm/test/CodeGen/BPF/BTF/print_btf.py b/llvm/test/CodeGen/BPF/BTF/print_btf.py index 6ce08b76c363e..c574d0f8524b0 100644 --- a/llvm/test/CodeGen/BPF/BTF/print_btf.py +++ b/llvm/test/CodeGen/BPF/BTF/print_btf.py @@ -88,7 +88,7 @@ def print_btf(filename): buf = file.read() fmt_cache = {} - endian_pfx = "" + endian_pfx = ">" # big endian off = 0 def unpack(fmt): @@ -104,9 +104,9 @@ def unpack(fmt): # Use magic number at the header start to determine endianness (magic,) = unpack("H") if magic == 0xEB9F: - endian_pfx = "<" + endian_pfx = ">" # big endian elif magic == 0x9FEB: - endian_pfx = ">" + endian_pfx = "<" # little endian else: warn(f"Unexpected BTF magic: {magic:02x}") return @@ -290,6 +290,6 @@ def warn_nonzero(val, name): if __name__ == "__main__": if len(sys.argv) != 2: - warn("Usage: {sys.argv[0]} ") + warn(f"Usage: {sys.argv[0]} ") sys.exit(1) print_btf(sys.argv[1]) diff --git a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll b/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll deleted file mode 100644 index 040bbde13800c..0000000000000 --- a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: opt < %s -O3 -S | FileCheck %s - -; Address space intrinsics were erroneously marked NoCapture, leading to bad -; optimizations (such as the store below being eliminated as dead code). This -; test makes sure we don't regress. 
- -declare void @foo(ptr addrspace(1)) - -declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - -; CHECK: @bar -define void @bar() { - %t1 = alloca i32 -; CHECK: call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr nonnull %t1) -; CHECK-NEXT: store i32 10, ptr %t1 - %t2 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %t1) - store i32 10, ptr %t1 - call void @foo(ptr addrspace(1) %t2) - ret void -} - diff --git a/llvm/test/CodeGen/RISCV/pr97304.ll b/llvm/test/CodeGen/RISCV/pr97304.ll index 120a0e787384d..694f6384b6855 100644 --- a/llvm/test/CodeGen/RISCV/pr97304.ll +++ b/llvm/test/CodeGen/RISCV/pr97304.ll @@ -17,7 +17,7 @@ define i32 @_ZNK2cv12LMSolverImpl3runERKNS_17_InputOutputArrayE(i1 %cmp436) { ; CHECK-NEXT: ADJCALLSTACKDOWN 8, 0, implicit-def dead $x2, implicit $x2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprjalr = COPY $x0 - ; CHECK-NEXT: SD [[COPY3]], [[COPY2]], 0 :: (store (s64)) + ; CHECK-NEXT: SD [[COPY3]], [[COPY2]], 0 :: (store (s64) into stack) ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1 ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 32 ; CHECK-NEXT: BNE [[ANDI]], $x0, %bb.3 diff --git a/llvm/test/CodeGen/SPARC/data-align.ll b/llvm/test/CodeGen/SPARC/data-align.ll new file mode 100644 index 0000000000000..d4a39524da44f --- /dev/null +++ b/llvm/test/CodeGen/SPARC/data-align.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -march=sparc | FileCheck %s +; RUN: llc < %s -march=sparcel | FileCheck %s +; RUN: llc < %s -march=sparcv9 | FileCheck %s + +; CHECK: .Li8: +; CHECK-DAG: .size .Li8, 1 +@i8 = private constant i8 42 + +; CHECK: .p2align 1 +; CHECK-NEXT: .Li16: +; CHECK-DAG: .size .Li16, 2 +@i16 = private constant i16 42 + +; CHECK: .p2align 2 +; CHECK-NEXT: .Li32: +; CHECK-DAG: .size .Li32, 4 +@i32 = private constant i32 42 + +; CHECK: .p2align 3 +; CHECK-NEXT: .Li64: +; CHECK-DAG: .size .Li64, 8 +@i64 = private constant i64 42 + +; CHECK: .p2align 4 +; CHECK-NEXT: .Li128: +; CHECK-DAG: 
.size .Li128, 16 +@i128 = private constant i128 42 diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll index 2cf55f662df02..54eb0e45dccee 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll @@ -10,7 +10,7 @@ ; CHECK-MIR-DAG: [[source_language_sycl:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 7 ; CHECK-MIR-DAG: [[source_language_cpp:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 4 ; CHECK-MIR-DAG: [[filename_str_sycl:%[0-9]+\:id\(s32\)]] = OpString 1094795567, 1094795585, 792805697, 1111638594, 1111638594, 1128481583, 1128481603, {{1697596227|1700545347}}, 1886216568, 1663985004, 0 -; CHECK-MIR-DAG: [[filename_str_cpp:%[0-9]+\:id\(s32\)]] = OpString 1145324591, 1145324612, 793003076, 1162167621, 1162167621, 1179010607, 1179010630, 1697596998, 1886216568, 774989164, 7368803 +; CHECK-MIR-DAG: [[filename_str_cpp:%[0-9]+\:id\(s32\)]] = OpString 1145324591, 1145324612, 793003076, 1162167621, 1162167621, 1179010607, 1179010630, {{1697596998|1700546118}}, 1886216568, 774989164, 7368803 ; CHECK-MIR-DAG: [[debug_source_sycl:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 35, [[filename_str_sycl]] ; CHECK-MIR-DAG: OpExtInst [[type_void]], 3, 1, [[debug_info_version]], [[dwarf_version]], [[debug_source_sycl]], [[source_language_sycl]] ; CHECK-MIR-DAG: [[debug_source_cpp:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 35, [[filename_str_cpp]] diff --git a/llvm/test/CodeGen/SystemZ/args-15.ll b/llvm/test/CodeGen/SystemZ/args-15.ll index c810aeb8c46c5..64217a2a29a6f 100644 --- a/llvm/test/CodeGen/SystemZ/args-15.ll +++ b/llvm/test/CodeGen/SystemZ/args-15.ll @@ -8,4 +8,6 @@ define i32 @callee_MissingRetAttr() { ret i32 -1 } -; CHECK: Narrow integer argument must have a valid extension type. 
+; CHECK: ERROR: Missing extension attribute of returned value from function: +; CHECK: i32 @callee_MissingRetAttr() +; CHECK: UNREACHABLE executed diff --git a/llvm/test/CodeGen/SystemZ/args-16.ll b/llvm/test/CodeGen/SystemZ/args-16.ll index b76a2afea5077..846100146e790 100644 --- a/llvm/test/CodeGen/SystemZ/args-16.ll +++ b/llvm/test/CodeGen/SystemZ/args-16.ll @@ -8,5 +8,7 @@ define i16 @callee_MissingRetAttr() { ret i16 -1 } -; CHECK: Narrow integer argument must have a valid extension type. +; CHECK: ERROR: Missing extension attribute of returned value from function: +; CHECK: i16 @callee_MissingRetAttr() +; CHECK: UNREACHABLE executed diff --git a/llvm/test/CodeGen/SystemZ/args-17.ll b/llvm/test/CodeGen/SystemZ/args-17.ll index bce54b3d2aa1f..4231d7e9e4772 100644 --- a/llvm/test/CodeGen/SystemZ/args-17.ll +++ b/llvm/test/CodeGen/SystemZ/args-17.ll @@ -8,4 +8,6 @@ define i8 @callee_MissingRetAttr() { ret i8 -1 } -; CHECK: Narrow integer argument must have a valid extension type. +; CHECK: ERROR: Missing extension attribute of returned value from function: +; CHECK: i8 @callee_MissingRetAttr() +; CHECK: UNREACHABLE executed diff --git a/llvm/test/CodeGen/SystemZ/args-18.ll b/llvm/test/CodeGen/SystemZ/args-18.ll index 82e9729d3a2df..bd368fa056c6c 100644 --- a/llvm/test/CodeGen/SystemZ/args-18.ll +++ b/llvm/test/CodeGen/SystemZ/args-18.ll @@ -11,4 +11,6 @@ define void @caller() { declare void @bar_Struct(i32 %Arg) -; CHECK: Narrow integer argument must have a valid extension type +; CHECK: ERROR: Missing extension attribute of passed value in call to function: +; CHECK: Callee: void @bar_Struct(i32) +; CHECK: Caller: void @caller() diff --git a/llvm/test/CodeGen/SystemZ/args-19.ll b/llvm/test/CodeGen/SystemZ/args-19.ll index 40a794417b4c6..8b5f421f59fdd 100644 --- a/llvm/test/CodeGen/SystemZ/args-19.ll +++ b/llvm/test/CodeGen/SystemZ/args-19.ll @@ -11,4 +11,6 @@ define void @caller() { declare void @bar_Struct(i16 %Arg) -; CHECK: Narrow integer argument must have 
a valid extension type +; CHECK: ERROR: Missing extension attribute of passed value in call to function: +; CHECK: Callee: void @bar_Struct(i16) +; CHECK: Caller: void @caller() diff --git a/llvm/test/CodeGen/SystemZ/args-20.ll b/llvm/test/CodeGen/SystemZ/args-20.ll index ce8b828a2d539..ed6f2e52bf6ee 100644 --- a/llvm/test/CodeGen/SystemZ/args-20.ll +++ b/llvm/test/CodeGen/SystemZ/args-20.ll @@ -11,4 +11,6 @@ define void @caller() { declare void @bar_Struct(i8 %Arg) -; CHECK: Narrow integer argument must have a valid extension type +; CHECK: ERROR: Missing extension attribute of passed value in call to function: +; CHECK: Callee: void @bar_Struct(i8) +; CHECK: Caller: void @caller() diff --git a/llvm/test/CodeGen/SystemZ/args-21.ll b/llvm/test/CodeGen/SystemZ/args-21.ll index c64233094c7df..da5c8fe5ffc7f 100644 --- a/llvm/test/CodeGen/SystemZ/args-21.ll +++ b/llvm/test/CodeGen/SystemZ/args-21.ll @@ -16,4 +16,6 @@ define void @foo() { ret void } -; CHECK: Narrow integer argument must have a valid extension type +; CHECK: ERROR: Missing extension attribute of returned value from function: +; CHECK: i32 @bar(i32) +; CHECK: UNREACHABLE executed diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index 8b71987246ee5..c0bc34c2b06ef 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -4,11 +4,10 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB0_1: @ %for.body.preheader ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: beq .LBB0_10 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB0_3 ; 
CHECK-NEXT: @ %bb.2: @@ -33,39 +32,42 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB0_11 ; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 -; CHECK-NEXT: and r5, r3, #3 -; CHECK-NEXT: wls lr, r5, .LBB0_7 +; CHECK-NEXT: adds r4, r7, r3 +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: add.w r8, r12, r7 +; CHECK-NEXT: wls lr, r7, .LBB0_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: add.w r5, r0, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12, lsl #2 -; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: add.w r6, r0, r12, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r5, r2, r12, lsl #2 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r6!, {s0} -; CHECK-NEXT: vldmia r5!, {s2} +; CHECK-NEXT: vldmia r7!, {s0} +; CHECK-NEXT: vldmia r6!, {s2} ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstmia r7!, {s0} +; CHECK-NEXT: vstmia r5!, {s0} ; CHECK-NEXT: le lr, .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w r3, r3, r12 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r8, r3 +; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: rsb r3, r3, r3, lsl #30 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r7, r3, lsr #2 +; CHECK-NEXT: lsl.w r3, r12, #2 ; CHECK-NEXT: .LBB0_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: add.w r5, r2, r12 +; CHECK-NEXT: adds r7, r1, r3 +; CHECK-NEXT: adds r6, r0, r3 +; CHECK-NEXT: adds r5, r2, r3 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vldr 
s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5] ; CHECK-NEXT: vldr s0, [r7, #4] @@ -80,10 +82,9 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5, #12] -; CHECK-NEXT: bne .LBB0_9 -; CHECK-NEXT: .LBB0_10: -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: le lr, .LBB0_9 +; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .LBB0_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 @@ -217,11 +218,10 @@ for.body: ; preds = %for.body.prol.loope define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB1_1: @ %for.body.preheader ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: beq .LBB1_10 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: @@ -246,39 +246,42 @@ define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB1_11 ; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 -; CHECK-NEXT: and r5, r3, #3 -; CHECK-NEXT: wls lr, r5, .LBB1_7 +; CHECK-NEXT: adds r4, r7, r3 +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: add.w r8, r12, r7 +; CHECK-NEXT: wls lr, r7, .LBB1_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: add.w r5, r0, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12, lsl #2 -; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: add.w r6, r0, r12, lsl #2 +; CHECK-NEXT: 
add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r5, r2, r12, lsl #2 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r6!, {s0} -; CHECK-NEXT: vldmia r5!, {s2} +; CHECK-NEXT: vldmia r7!, {s0} +; CHECK-NEXT: vldmia r6!, {s2} ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstmia r7!, {s0} +; CHECK-NEXT: vstmia r5!, {s0} ; CHECK-NEXT: le lr, .LBB1_6 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB1_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w r3, r3, r12 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r8, r3 +; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: rsb r3, r3, r3, lsl #30 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r7, r3, lsr #2 +; CHECK-NEXT: lsl.w r3, r12, #2 ; CHECK-NEXT: .LBB1_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: add.w r5, r2, r12 +; CHECK-NEXT: adds r7, r1, r3 +; CHECK-NEXT: adds r6, r0, r3 +; CHECK-NEXT: adds r5, r2, r3 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vldr s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5] ; CHECK-NEXT: vldr s0, [r7, #4] @@ -293,10 +296,9 @@ define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5, #12] -; CHECK-NEXT: bne .LBB1_9 -; CHECK-NEXT: .LBB1_10: -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: le lr, .LBB1_9 +; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .LBB1_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 @@ -430,11 +432,10 @@ for.body: ; preds = 
%for.body.prol.loope define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_sub: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB2_1: @ %for.body.preheader ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: beq .LBB2_10 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB2_3 ; CHECK-NEXT: @ %bb.2: @@ -459,39 +460,42 @@ define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: beq .LBB2_11 ; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 -; CHECK-NEXT: and r5, r3, #3 -; CHECK-NEXT: wls lr, r5, .LBB2_7 +; CHECK-NEXT: adds r4, r7, r3 +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: add.w r8, r12, r7 +; CHECK-NEXT: wls lr, r7, .LBB2_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: add.w r5, r0, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12, lsl #2 -; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: add.w r6, r0, r12, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r5, r2, r12, lsl #2 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r6!, {s0} -; CHECK-NEXT: vldmia r5!, {s2} +; CHECK-NEXT: vldmia r7!, {s0} +; CHECK-NEXT: vldmia r6!, {s2} ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstmia r7!, {s0} +; CHECK-NEXT: vstmia r5!, {s0} ; CHECK-NEXT: le lr, .LBB2_6 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB2_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w r3, r3, r12 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r8, r3 +; CHECK-NEXT: movs 
r7, #1 +; CHECK-NEXT: rsb r3, r3, r3, lsl #30 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r7, r3, lsr #2 +; CHECK-NEXT: lsl.w r3, r12, #2 ; CHECK-NEXT: .LBB2_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: add.w r5, r2, r12 +; CHECK-NEXT: adds r7, r1, r3 +; CHECK-NEXT: adds r6, r0, r3 +; CHECK-NEXT: adds r5, r2, r3 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vldr s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vsub.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5] ; CHECK-NEXT: vldr s0, [r7, #4] @@ -506,10 +510,9 @@ define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vsub.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r5, #12] -; CHECK-NEXT: bne .LBB2_9 -; CHECK-NEXT: .LBB2_10: -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: le lr, .LBB2_9 +; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .LBB2_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 @@ -643,11 +646,10 @@ for.body: ; preds = %for.body.prol.loope define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { ; CHECK-LABEL: float_int_mul: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB3_1: @ %for.body.preheader -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: beq.w .LBB3_13 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bls .LBB3_6 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck @@ -681,42 +683,45 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB3_7: @ 
%for.body.preheader16 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: add.w r8, r7, r3 -; CHECK-NEXT: and r5, r3, #3 -; CHECK-NEXT: wls lr, r5, .LBB3_10 +; CHECK-NEXT: add.w r9, r7, r3 +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: add.w r8, r12, r7 +; CHECK-NEXT: wls lr, r7, .LBB3_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: add.w r5, r0, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12, lsl #2 -; CHECK-NEXT: add.w r7, r2, r12, lsl #2 -; CHECK-NEXT: mov r12, r4 +; CHECK-NEXT: add.w r6, r0, r12, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r5, r2, r12, lsl #2 +; CHECK-NEXT: mov r12, r8 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r4, [r6], #4 -; CHECK-NEXT: vldmia r5!, {s2} +; CHECK-NEXT: ldr r4, [r7], #4 +; CHECK-NEXT: vldmia r6!, {s2} ; CHECK-NEXT: vmov s0, r4 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstmia r7!, {s0} +; CHECK-NEXT: vstmia r5!, {s0} ; CHECK-NEXT: le lr, .LBB3_9 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp.w r8, #3 +; CHECK-NEXT: cmp.w r9, #3 ; CHECK-NEXT: blo .LBB3_13 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1 +; CHECK-NEXT: sub.w r3, r8, r3 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 -; CHECK-NEXT: sub.w r3, r3, r12 +; CHECK-NEXT: movs r7, #1 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: rsb r3, r3, r3, lsl #30 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r7, r3, lsr #2 +; CHECK-NEXT: lsl.w r3, r12, #2 ; CHECK-NEXT: .LBB3_12: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s0, [r1, #-8] -; CHECK-NEXT: add.w r7, r0, r12 -; CHECK-NEXT: add.w r6, r2, r12 +; CHECK-NEXT: adds r7, r0, r3 +; CHECK-NEXT: adds r6, r2, r3 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-NEXT: vldr s2, [r7] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.f32 s0, s2, 
s0 ; CHECK-NEXT: vstr s0, [r6] ; CHECK-NEXT: vldr s0, [r1, #-4] @@ -730,15 +735,14 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6, #8] ; CHECK-NEXT: vldr s0, [r1, #4] -; CHECK-NEXT: add.w r1, r1, #16 +; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vldr s2, [r7, #12] ; CHECK-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r6, #12] -; CHECK-NEXT: bne .LBB3_12 -; CHECK-NEXT: .LBB3_13: -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: bx lr +; CHECK-NEXT: le lr, .LBB3_12 +; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll index d7633cb11e44c..bd794712a3109 100644 --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -789,12 +789,55 @@ define double @load_double_seq_cst(ptr %fptr) { } define void @store_bfloat(ptr %fptr, bfloat %v) { -; X86-LABEL: store_bfloat: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: retl +; X86-SSE1-LABEL: store_bfloat: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: subl $8, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE1-NEXT: .cfi_offset %esi, -8 +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: calll __truncsfbf2 +; X86-SSE1-NEXT: movw %ax, (%esi) +; X86-SSE1-NEXT: addl $8, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: store_bfloat: 
+; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movw %cx, (%eax) +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: store_bfloat: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movw %cx, (%eax) +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: store_bfloat: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: subl $8, %esp +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16 +; X86-NOSSE-NEXT: .cfi_offset %esi, -8 +; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fstps (%esp) +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: calll __truncsfbf2 +; X86-NOSSE-NEXT: movw %ax, (%esi) +; X86-NOSSE-NEXT: addl $8, %esp +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 +; X86-NOSSE-NEXT: retl ; ; X64-SSE-LABEL: store_bfloat: ; X64-SSE: # %bb.0: @@ -811,8 +854,7 @@ define void @store_bfloat(ptr %fptr, bfloat %v) { ret void } -; Work around issue #92899 by casting to float -define float @load_bfloat(ptr %fptr) { +define bfloat @load_bfloat(ptr %fptr) { ; X86-SSE1-LABEL: load_bfloat: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %eax @@ -828,30 +870,16 @@ define float @load_bfloat(ptr %fptr) { ; ; X86-SSE2-LABEL: load_bfloat: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %eax -; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: shll $16, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, (%esp) -; X86-SSE2-NEXT: flds (%esp) -; X86-SSE2-NEXT: popl %eax -; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE2-NEXT: pinsrw $0, %eax, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: load_bfloat: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movzwl (%eax), %eax -; X86-AVX-NEXT: shll $16, %eax -; X86-AVX-NEXT: vmovd %eax, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, (%esp) -; X86-AVX-NEXT: flds (%esp) -; X86-AVX-NEXT: popl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X86-NOSSE-LABEL: load_bfloat: @@ -870,17 +898,14 @@ define float @load_bfloat(ptr %fptr) { ; X64-SSE-LABEL: load_bfloat: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl (%rdi), %eax -; X64-SSE-NEXT: shll $16, %eax -; X64-SSE-NEXT: movd %eax, %xmm0 +; X64-SSE-NEXT: pinsrw $0, %eax, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: load_bfloat: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movzwl (%rdi), %eax -; X64-AVX-NEXT: shll $16, %eax -; X64-AVX-NEXT: vmovd %eax, %xmm0 +; X64-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %v = load atomic bfloat, ptr %fptr unordered, align 2 - %ext = fpext bfloat %v to float - ret float %ext + ret bfloat %v } diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll new file mode 100644 index 0000000000000..f363cad816dfb --- /dev/null +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll @@ -0,0 +1,1243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=i386-linux-gnu < %s | FileCheck -check-prefixes=CHECK,NOSSE %s +; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse < %s | FileCheck -check-prefixes=CHECK,SSE %s + +; Make sure no assert without SSE2 and bfloat. 
Issue 92899 + +define bfloat @return_arg_bf16(bfloat %x) #0 { +; CHECK-LABEL: return_arg_bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: retl + ret bfloat %x +} + +define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 { +; CHECK-LABEL: return_arg_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: retl + ret <2 x bfloat> %x +} + +define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 { +; NOSSE-LABEL: return_arg_v3bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: pushl %eax +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %esi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: # kill: def $ax killed $ax def $eax +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movzwl %si, %edi +; NOSSE-NEXT: orl %eax, %edi +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, 4(%esi) +; NOSSE-NEXT: movl %edi, (%esi) +; NOSSE-NEXT: movl %esi, %eax +; NOSSE-NEXT: addl $4, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: retl $4 +; +; SSE-LABEL: return_arg_v3bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: pushl %eax +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: # kill: def $ax killed $ax def $eax +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movzwl %si, %edi +; SSE-NEXT: orl %eax, %edi +; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 
+; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, 4(%esi) +; SSE-NEXT: movl %edi, (%esi) +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: addl $4, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: retl $4 + ret <3 x bfloat> %x +} + +define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 { +; NOSSE-LABEL: return_arg_v4bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebp +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $12, %esp +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %esi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %edi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %ebx +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, 6(%ebp) +; NOSSE-NEXT: movw %bx, 4(%ebp) +; NOSSE-NEXT: movw %di, 2(%ebp) +; NOSSE-NEXT: movw %si, (%ebp) +; NOSSE-NEXT: movl %ebp, %eax +; NOSSE-NEXT: addl $12, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: retl $4 +; +; SSE-LABEL: return_arg_v4bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %ebp +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $12, %esp +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %edi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %ebx +; SSE-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, 6(%ebp) +; SSE-NEXT: movw %bx, 4(%ebp) +; SSE-NEXT: movw %di, 2(%ebp) +; SSE-NEXT: movw %si, (%ebp) +; SSE-NEXT: movl %ebp, %eax +; SSE-NEXT: addl $12, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl $4 + ret <4 x bfloat> %x +} + +define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 { +; NOSSE-LABEL: return_arg_v8bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebp +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $12, %esp +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %esi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %edi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %ebx +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, 14(%ebp) +; NOSSE-NEXT: movw %bx, 12(%ebp) +; NOSSE-NEXT: movw %di, 10(%ebp) +; 
NOSSE-NEXT: movw %si, 8(%ebp) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 6(%ebp) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 4(%ebp) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 2(%ebp) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, (%ebp) +; NOSSE-NEXT: movl %ebp, %eax +; NOSSE-NEXT: addl $12, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: retl $4 +; +; SSE-LABEL: return_arg_v8bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %ebp +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $12, %esp +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %edi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; 
SSE-NEXT: movl %eax, %ebx +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, 14(%ebp) +; SSE-NEXT: movw %bx, 12(%ebp) +; SSE-NEXT: movw %di, 10(%ebp) +; SSE-NEXT: movw %si, 8(%ebp) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 6(%ebp) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 4(%ebp) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 2(%ebp) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, (%ebp) +; SSE-NEXT: movl %ebp, %eax +; SSE-NEXT: addl $12, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl $4 + ret <8 x bfloat> %x +} + +define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 { +; NOSSE-LABEL: return_arg_v16bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebp +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $28, %esp +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte 
Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %esi +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %ebx +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %ebp +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, 30(%edi) +; NOSSE-NEXT: movw %bp, 28(%edi) +; NOSSE-NEXT: movw %bx, 26(%edi) +; NOSSE-NEXT: movw %si, 24(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 22(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte 
Folded Reload +; NOSSE-NEXT: movw %ax, 20(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 18(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 16(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 14(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 12(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 10(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 8(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 6(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 4(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, 2(%edi) +; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; NOSSE-NEXT: movw %ax, (%edi) +; NOSSE-NEXT: movl %edi, %eax +; NOSSE-NEXT: addl $28, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: retl $4 +; +; SSE-LABEL: return_arg_v16bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %ebp +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $28, %esp +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; 
SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 
+; SSE-NEXT: movl %eax, %ebx +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %ebp +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, 30(%edi) +; SSE-NEXT: movw %bp, 28(%edi) +; SSE-NEXT: movw %bx, 26(%edi) +; SSE-NEXT: movw %si, 24(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 22(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 20(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 18(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 16(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 14(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 12(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 10(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 8(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 6(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 4(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, 2(%edi) +; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; SSE-NEXT: movw %ax, (%edi) +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: addl $28, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl $4 + ret <16 x bfloat> %x +} + +declare bfloat @returns_bf16(bfloat) +declare <2 x bfloat> 
@returns_v2bf16(<2 x bfloat>) +declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>) +declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>) +declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>) +declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>) + +define void @call_ret_bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $8, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: movzwl (%esi), %eax +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll returns_bf16@PLT +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, (%esi) +; NOSSE-NEXT: addl $8, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $8, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-NEXT: movzwl (%esi), %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll returns_bf16@PLT +; SSE-NEXT: fstps (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, (%esi) +; SSE-NEXT: addl $8, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: retl + %val = load bfloat, ptr %ptr + %bf16 = call bfloat @returns_bf16(bfloat %val) + store bfloat %bf16, ptr %ptr + ret void +} + +define void @call_ret_v2bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_v2bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $20, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; NOSSE-NEXT: movzwl 2(%edi), %eax +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl (%edi), %eax +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds 
{{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll returns_v2bf16@PLT +; NOSSE-NEXT: fxch %st(1) +; NOSSE-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movl %eax, %esi +; NOSSE-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; NOSSE-NEXT: fstps (%esp) +; NOSSE-NEXT: calll __truncsfbf2 +; NOSSE-NEXT: movw %ax, 2(%edi) +; NOSSE-NEXT: movw %si, (%edi) +; NOSSE-NEXT: addl $20, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_v2bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $36, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; SSE-NEXT: movzwl 2(%edi), %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movl (%edi), %eax +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, (%esp) +; SSE-NEXT: calll returns_v2bf16@PLT +; SSE-NEXT: fxch %st(1) +; SSE-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; SSE-NEXT: fstps (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; SSE-NEXT: fstps (%esp) +; SSE-NEXT: calll __truncsfbf2 +; SSE-NEXT: movw %ax, 2(%edi) +; SSE-NEXT: movw %si, (%edi) +; SSE-NEXT: addl $36, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: retl + %val = load <2 x bfloat>, ptr %ptr + %bf16 = call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %bf16, ptr %ptr + ret void +} + +define void @call_ret_v3bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_v3bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $40, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: movl 
(%esi), %eax +; NOSSE-NEXT: movl 4(%esi), %ecx +; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %edx +; NOSSE-NEXT: movl %edx, (%esp) +; NOSSE-NEXT: shll $16, %ecx +; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl %eax, %ecx +; NOSSE-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: calll returns_v3bf16@PLT +; NOSSE-NEXT: subl $4, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; NOSSE-NEXT: movw %cx, 4(%esi) +; NOSSE-NEXT: movl %eax, (%esi) +; NOSSE-NEXT: addl $40, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_v3bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $40, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-NEXT: movl (%esi), %eax +; SSE-NEXT: movl 4(%esi), %ecx +; SSE-NEXT: leal {{[0-9]+}}(%esp), %edx +; SSE-NEXT: movl %edx, (%esp) +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: calll returns_v3bf16@PLT +; SSE-NEXT: subl $4, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: movw %cx, 4(%esi) +; SSE-NEXT: movl %eax, (%esi) +; SSE-NEXT: addl $40, %esp +; 
SSE-NEXT: popl %esi +; SSE-NEXT: retl + %val = load <3 x bfloat>, ptr %ptr + %bf16 = call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val) + store <3 x bfloat> %bf16, ptr %ptr + ret void +} + +define void @call_ret_v4bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_v4bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $48, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: movzwl 2(%esi), %ecx +; NOSSE-NEXT: movl (%esi), %eax +; NOSSE-NEXT: movl 4(%esi), %edx +; NOSSE-NEXT: movzwl 6(%esi), %edi +; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %ebx +; NOSSE-NEXT: movl %ebx, (%esp) +; NOSSE-NEXT: shll $16, %edi +; NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %edx +; NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %ecx +; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: calll returns_v4bf16@PLT +; NOSSE-NEXT: subl $4, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; NOSSE-NEXT: movw %dx, 6(%esi) +; NOSSE-NEXT: movw %cx, 4(%esi) +; NOSSE-NEXT: movl %eax, (%esi) +; NOSSE-NEXT: addl $48, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_v4bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $48, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-NEXT: movzwl 2(%esi), %ecx +; SSE-NEXT: movl (%esi), %eax +; SSE-NEXT: movl 4(%esi), %edx +; SSE-NEXT: movzwl 6(%esi), %edi +; 
SSE-NEXT: leal {{[0-9]+}}(%esp), %ebx +; SSE-NEXT: movl %ebx, (%esp) +; SSE-NEXT: shll $16, %edi +; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: calll returns_v4bf16@PLT +; SSE-NEXT: subl $4, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; SSE-NEXT: movw %dx, 6(%esi) +; SSE-NEXT: movw %cx, 4(%esi) +; SSE-NEXT: movl %eax, (%esi) +; SSE-NEXT: addl $48, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: retl + %val = load <4 x bfloat>, ptr %ptr + %bf16 = call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val) + store <4 x bfloat> %bf16, ptr %ptr + ret void +} + +define void @call_ret_v8bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_v8bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebp +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: subl $108, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: movzwl 2(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl (%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 4(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 6(%esi), %edi +; NOSSE-NEXT: movl 8(%esi), %ebx +; NOSSE-NEXT: movzwl 10(%esi), %ebp +; NOSSE-NEXT: 
movl 12(%esi), %ecx +; NOSSE-NEXT: movzwl 14(%esi), %eax +; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %edx +; NOSSE-NEXT: movl %edx, (%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %ecx +; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %ebp +; NOSSE-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %ebx +; NOSSE-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %edi +; NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: calll returns_v8bf16@PLT +; NOSSE-NEXT: subl $4, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movw %ax, 14(%esi) 
+; NOSSE-NEXT: movw %bp, 12(%esi) +; NOSSE-NEXT: movw %bx, 10(%esi) +; NOSSE-NEXT: movw %di, 8(%esi) +; NOSSE-NEXT: movw %dx, 6(%esi) +; NOSSE-NEXT: movw %cx, 4(%esi) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: movl %eax, (%esi) +; NOSSE-NEXT: addl $108, %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_v8bf16: +; SSE: # %bb.0: +; SSE-NEXT: pushl %ebp +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: subl $108, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-NEXT: movzwl 2(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl (%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 4(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 6(%esi), %edi +; SSE-NEXT: movl 8(%esi), %ebx +; SSE-NEXT: movzwl 10(%esi), %ebp +; SSE-NEXT: movl 12(%esi), %ecx +; SSE-NEXT: movzwl 14(%esi), %eax +; SSE-NEXT: leal {{[0-9]+}}(%esp), %edx +; SSE-NEXT: movl %edx, (%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %ebp +; SSE-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %ebx +; SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %edi +; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, 
{{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: calll returns_v8bf16@PLT +; SSE-NEXT: subl $4, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movw %ax, 14(%esi) +; SSE-NEXT: movw %bp, 12(%esi) +; SSE-NEXT: movw %bx, 10(%esi) +; SSE-NEXT: movw %di, 8(%esi) +; SSE-NEXT: movw %dx, 6(%esi) +; SSE-NEXT: movw %cx, 4(%esi) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: movl %eax, (%esi) +; SSE-NEXT: addl $108, %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl + %val = load <8 x bfloat>, ptr %ptr + %bf16 = call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val) + store <8 x bfloat> %bf16, ptr %ptr + ret void +} + +define void @call_ret_v16bf16(ptr %ptr) #0 { +; NOSSE-LABEL: call_ret_v16bf16: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebp +; NOSSE-NEXT: movl %esp, %ebp +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: pushl %edi +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: andl $-32, %esp +; 
NOSSE-NEXT: subl $256, %esp # imm = 0x100 +; NOSSE-NEXT: movl 8(%ebp), %esi +; NOSSE-NEXT: movzwl 2(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl (%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 4(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 6(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 8(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 10(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 12(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 14(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 16(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 18(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 20(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movzwl 22(%esi), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl 24(%esi), %edi +; NOSSE-NEXT: movzwl 26(%esi), %edx +; NOSSE-NEXT: movl 28(%esi), %ecx +; NOSSE-NEXT: movzwl 30(%esi), %eax +; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %ebx +; NOSSE-NEXT: movl %ebx, (%esp) +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %ecx +; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %edx +; NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: shll $16, %edi +; 
NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds 
{{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: shll $16, %eax +; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOSSE-NEXT: flds {{[0-9]+}}(%esp) +; NOSSE-NEXT: fstps {{[0-9]+}}(%esp) +; NOSSE-NEXT: calll returns_v16bf16@PLT +; NOSSE-NEXT: subl $4, %esp +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; NOSSE-NEXT: movl %edx, 28(%esi) +; NOSSE-NEXT: movl %eax, 24(%esi) +; NOSSE-NEXT: movl %ecx, 20(%esi) +; NOSSE-NEXT: movl %ebx, 16(%esi) +; NOSSE-NEXT: movl %edi, 12(%esi) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: movl %eax, 8(%esi) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: movl %eax, 4(%esi) +; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; NOSSE-NEXT: movl %eax, (%esi) +; NOSSE-NEXT: leal -12(%ebp), %esp +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: popl %edi +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: retl +; +; SSE-LABEL: call_ret_v16bf16: +; SSE: # %bb.0: +; 
SSE-NEXT: pushl %ebp +; SSE-NEXT: movl %esp, %ebp +; SSE-NEXT: pushl %ebx +; SSE-NEXT: pushl %edi +; SSE-NEXT: pushl %esi +; SSE-NEXT: andl $-32, %esp +; SSE-NEXT: subl $256, %esp # imm = 0x100 +; SSE-NEXT: movl 8(%ebp), %esi +; SSE-NEXT: movzwl 2(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl (%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 4(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 6(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 8(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 10(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 12(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 14(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 16(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 18(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 20(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movzwl 22(%esi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl 24(%esi), %edi +; SSE-NEXT: movzwl 26(%esi), %edx +; SSE-NEXT: movl 28(%esi), %ecx +; SSE-NEXT: movzwl 30(%esi), %eax +; SSE-NEXT: leal {{[0-9]+}}(%esp), %ebx +; SSE-NEXT: movl %ebx, (%esp) +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: shll $16, %edi +; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: calll returns_v16bf16@PLT +; SSE-NEXT: subl $4, %esp +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-NEXT: movl %edx, 28(%esi) +; SSE-NEXT: movl %eax, 24(%esi) +; SSE-NEXT: movl %ecx, 20(%esi) +; SSE-NEXT: movl %ebx, 16(%esi) +; SSE-NEXT: movl %edi, 12(%esi) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: movl %eax, 8(%esi) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax 
# 4-byte Reload +; SSE-NEXT: movl %eax, 4(%esi) +; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SSE-NEXT: movl %eax, (%esi) +; SSE-NEXT: leal -12(%ebp), %esp +; SSE-NEXT: popl %esi +; SSE-NEXT: popl %edi +; SSE-NEXT: popl %ebx +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl + %val = load <16 x bfloat>, ptr %ptr + %bf16 = call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val) + store <16 x bfloat> %bf16, ptr %ptr + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/issue56055.ll b/llvm/test/CodeGen/X86/issue56055.ll new file mode 100644 index 0000000000000..27eaf13e3b00b --- /dev/null +++ b/llvm/test/CodeGen/X86/issue56055.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel < %s | FileCheck -check-prefixes=CHECK,FASTISEL %s +; RUN: llc < %s | FileCheck -check-prefixes=CHECK,SDAG %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-windows-msvc" + +define void @issue56055(ptr addrspace(270) %ptr, ptr %out) { +; CHECK-LABEL: issue56055: +; CHECK: # %bb.0: +; CHECK-NEXT: addl $2, %ecx +; CHECK-NEXT: movl %ecx, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, ptr addrspace(270) %ptr, i32 2 + store ptr addrspace(270) %add.ptr, ptr %out + ret void +} + +define void @issue56055_vector(<2 x ptr addrspace(270)> %ptr, ptr %out) { +; CHECK-LABEL: issue56055_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: movdqa (%rcx), %xmm0 +; CHECK-NEXT: paddd __xmm@00000000000000000000000200000002(%rip), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, <2 x ptr addrspace(270)> %ptr, <2 x i32> + store <2 x ptr addrspace(270)> %add.ptr, ptr %out + ret void +} + +define void @issue56055_small_idx(ptr addrspace(270) %ptr, ptr %out, i16 %idx) { +; CHECK-LABEL: issue56055_small_idx: +; CHECK: # %bb.0: +; CHECK-NEXT: movswl 
%r8w, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: movl %eax, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, ptr addrspace(270) %ptr, i16 %idx + store ptr addrspace(270) %add.ptr, ptr %out + ret void +} + +define void @issue56055_small_idx_vector(<2 x ptr addrspace(270)> %ptr, ptr %out, <2 x i16> %idx) { +; CHECK-LABEL: issue56055_small_idx_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,1,4,5,6,7] +; CHECK-NEXT: psrad $16, %xmm0 +; CHECK-NEXT: paddd (%rcx), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, <2 x ptr addrspace(270)> %ptr, <2 x i16> %idx + store <2 x ptr addrspace(270)> %add.ptr, ptr %out + ret void +} + +define void @issue56055_large_idx(ptr addrspace(270) %ptr, ptr %out, i64 %idx) { +; CHECK-LABEL: issue56055_large_idx: +; CHECK: # %bb.0: +; CHECK-NEXT: addl %ecx, %r8d +; CHECK-NEXT: movl %r8d, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, ptr addrspace(270) %ptr, i64 %idx + store ptr addrspace(270) %add.ptr, ptr %out + ret void +} + +define void @issue56055_large_idx_vector(<2 x ptr addrspace(270)> %ptr, ptr %out, <2 x i64> %idx) { +; CHECK-LABEL: issue56055_large_idx_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; CHECK-NEXT: paddd (%rcx), %xmm0 +; CHECK-NEXT: movq %xmm0, (%rdx) +; CHECK-NEXT: retq + %add.ptr = getelementptr inbounds i8, <2 x ptr addrspace(270)> %ptr, <2 x i64> %idx + store <2 x ptr addrspace(270)> %add.ptr, ptr %out + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; FASTISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 502249a87c489..4e9ce012aff21 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -937,6 +937,41 @@ define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ret <16 x i32> %d } +; PR109790 +define void @PR109790(ptr sret([32 x i8]) %ret, ptr %a) { +; SSE-LABEL: PR109790: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand 16(%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rdi) +; SSE-NEXT: movdqa %xmm0, 16(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: PR109790: +; AVX: # %bb.0: +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: vmovdqa (%rsi), %ymm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536] +; AVX-NEXT: vmovdqa %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %load = load <16 x i16>, ptr %a, align 32 + %and = and <16 x i16> %load, + %ext = zext nneg <16 x i16> %and to <16 x i32> + %mul = mul nsw <16 x i32> %ext, + %srl = lshr <16 x i32> %mul, + %res = trunc nuw <16 x i32> %srl to <16 x i16> + store <16 x i16> %res, ptr %ret, align 32 + ret void +} + ; PR109790 define <16 x i16> @zext_mulhuw_v16i16_negative_constant(<16 x i16> %a) { ; SSE-LABEL: zext_mulhuw_v16i16_negative_constant: diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 643ed6484ae9f..a255717926d6b 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ 
b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -25,6 +25,10 @@ ; CHECK-DAG: .reg .b64 %rd<8>; ; CHECK: .loc [[DEBUG_INFO_CU:[0-9]+]] 5 0 ; CHECK: ld.param.u32 %r{{.+}}, [{{.+}}]; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[BUILTUIN_VARS_H:[0-9]+]] 78 180 ; CHECK: mov.u32 %r{{.+}}, %ctaid.x; ; CHECK: .loc [[BUILTUIN_VARS_H]] 89 180 @@ -38,10 +42,6 @@ ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7 ; CHECK: @%p{{.+}} bra [[BB:\$L__.+]]; ; CHECK: ld.param.f32 %f{{.+}}, [{{.+}}]; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: mul.wide.u32 %rd{{.+}}, %r{{.+}}, 4; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; @@ -2665,22 +2665,22 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b32 4586 // DW_AT_type ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8aa:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 707 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp0 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 11 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8c2:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 1466 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 24 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8da:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 2060 // DW_AT_abstract_origin -; 
CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 37 // DW_AT_call_column diff --git a/llvm/test/MC/AMDGPU/ds_swizzle.s b/llvm/test/MC/AMDGPU/ds_swizzle.s new file mode 100644 index 0000000000000..d768339fe5b77 --- /dev/null +++ b/llvm/test/MC/AMDGPU/ds_swizzle.s @@ -0,0 +1,131 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck -check-prefix=GFX7 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GFX8 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck -check-prefix=GFX9 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10PLUS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefix=GFX10PLUS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck -check-prefix=GFX10PLUS %s + +// RUN: not llvm-mc -triple=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR-PREGFX9 %s --implicit-check-not=error: +// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR-PREGFX9 %s --implicit-check-not=error: +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR %s --implicit-check-not=error: +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR %s --implicit-check-not=error: +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR %s --implicit-check-not=error: +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERROR %s --implicit-check-not=error: + 
+//============================================================================== +// FFT mode + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,5) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,5) ; encoding: [0x05,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,5) ; encoding: [0x05,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0x1f,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0x1f,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:0xf000 +// GFX7: ds_swizzle_b32 v5, v1 offset:61440 ; encoding: [0x00,0xf0,0xd4,0xd8,0x01,0x00,0x00,0x05] +// GFX8: ds_swizzle_b32 v5, v1 offset:61440 ; encoding: [0x00,0xf0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xf0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: 
[0x00,0xf0,0xd4,0xd8,0x01,0x00,0x00,0x05] + + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,32) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: FFT swizzle must be in the interval [0,31] + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,-2) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: FFT swizzle must be in the interval [0,31] + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a comma + +ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: FFT mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a closing parentheses + +//============================================================================== +// ROTATE mode + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: 
[0x20,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: [0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: [0x20,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: [0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05] + +ds_swizzle_b32 v5, v1 offset:0xd000 +// GFX7: ds_swizzle_b32 v5, v1 offset:53248 ; encoding: [0x00,0xd0,0xd4,0xd8,0x01,0x00,0x00,0x05] +// GFX8: ds_swizzle_b32 v5, v1 offset:53248 ; encoding: [0x00,0xd0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX9: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xd0,0x7a,0xd8,0x01,0x00,0x00,0x05] +// GFX10PLUS: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xd0,0xd4,0xd8,0x01,0x00,0x00,0x05] + + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,2,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: direction must be 0 (left) or 
1 (right) + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,-1,31) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: direction must be 0 (left) or 1 (right) + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,32) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: number of threads to rotate must be in the interval [0,31] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,-2) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: number of threads to rotate must be in the interval [0,31] + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a comma + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a comma + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a comma + +ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1,2) +// ERROR-PREGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: Rotate mode swizzle not supported on this GPU +// ERROR: :[[@LINE-2]]:{{[0-9]+}}: error: expected a closing parentheses diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s index ee8218613a6dc..0b518acc884df 100644 --- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s +++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s @@ -8257,15 +8257,15 @@ ds_read_u16 a5, v1 // NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU ds_read_u16 a5, v1 
offset:4 -// GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] +// GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] // NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU ds_swizzle_b32 a5, v1 offset:65535 -// GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] +// GFX90A: ds_swizzle_b32 a255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] // NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU ds_swizzle_b32 a255, v1 offset:65535 -// GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] +// GFX90A: ds_swizzle_b32 a5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] // NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU ds_swizzle_b32 a5, v255 offset:65535 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt index 688b5f916630a..bf9bad5240416 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt @@ -4208,24 +4208,48 @@ # GFX10: ds_sub_u64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00] 0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00 -# GFX10: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] +# GFX10: ds_swizzle_b32 v255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff # GFX10: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] 0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX10: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: 
[0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX10: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05] +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) gds ; encoding: [0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05 # GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] 0x04,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX10: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] +# GFX10: ds_swizzle_b32 v5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05 +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: [0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: [0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX10: ds_swizzle_b32 v5, v1 
offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05 + # GFX10: ds_wrap_rtn_b32 v255, v1, v2, v3 offset:65535 ; encoding: [0xff,0xff,0xd0,0xd8,0x01,0x02,0x03,0xff] 0xff,0xff,0xd0,0xd8,0x01,0x02,0x03,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt index 1b632b56400fb..448cfc95de095 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt @@ -3802,16 +3802,16 @@ # GFX11: ds_sub_u64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00] 0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00 -# GFX11: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] +# GFX11: ds_swizzle_b32 v255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff # GFX11: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] 0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX11: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX11: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05] +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) gds ; encoding: [0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05 # GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] @@ -3829,9 +3829,33 @@ # GFX11: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] 0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08 -# GFX11: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] +# 
GFX11: ds_swizzle_b32 v5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05 +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: [0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: [0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX11: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05 + # GFX11: ds_wrap_rtn_b32 v255, v1, v2, v3 offset:65535 ; encoding: [0xff,0xff,0xd0,0xd8,0x01,0x02,0x03,0xff] 0xff,0xff,0xd0,0xd8,0x01,0x02,0x03,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index 473ede00603a7..080a4cab2a319 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -2415,13 +2415,13 @@ # GFX12: ds_sub_u64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00] 0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00 -# GFX12: ds_swizzle_b32 
v255, v1 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] +# GFX12: ds_swizzle_b32 v255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff # GFX12: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] 0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05 -# GFX12: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05 # GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05] @@ -2439,9 +2439,33 @@ # GFX12: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] 0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08 -# GFX12: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] +# GFX12: ds_swizzle_b32 v5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05] 0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05 +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x10,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xe0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: [0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc0,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: 
[0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc4,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc3,0xd4,0xd8,0x01,0x00,0x00,0x05 + +# GFX12: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc7,0xd4,0xd8,0x01,0x00,0x00,0x05 + # GFX12: ds_store_2addr_b32 v0, v1, v2 ; encoding: [0x00,0x00,0x38,0xd8,0x00,0x01,0x02,0x00] 0x00,0x00,0x38,0xd8,0x00,0x01,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt index b8cc7ac604da2..081b40ac78f8c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt @@ -6192,13 +6192,13 @@ # GFX90A: ds_read_u16 a5, v1 offset:4 ; encoding: [0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05] 0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05 -# GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] +# GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] 0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05 -# GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] +# GFX90A: ds_swizzle_b32 a255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] 0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff -# GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] +# GFX90A: ds_swizzle_b32 a5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] 0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05 # GFX90A: ds_swizzle_b32 a5, v1 ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_ds.txt index 58a44e1250542..2af4b8396d71a 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx9_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_ds.txt @@ -1074,13 +1074,13 @@ # CHECK: ds_read_u16 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05] +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff] +# CHECK: ds_swizzle_b32 v255, v1 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff] 0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff -# CHECK: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05] +# CHECK: ds_swizzle_b32 v5, v255 offset:swizzle(FFT,31) ; encoding: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05] 0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05 # CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] @@ -1089,9 +1089,33 @@ # CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] 0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05] +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,31) gds ; encoding: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05 +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,16) ; encoding: [0x10,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x10,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(FFT,0) ; encoding: [0x00,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x00,0xe0,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,0) ; encoding: [0x00,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05 + 
+# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,0) ; encoding: [0x00,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x00,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,1) ; encoding: [0x20,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc0,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,1) ; encoding: [0x20,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x20,0xc4,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,0,31) ; encoding: [0xe0,0xc3,0x7a,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc3,0x7a,0xd8,0x01,0x00,0x00,0x05 + +# CHECK: ds_swizzle_b32 v5, v1 offset:swizzle(ROTATE,1,31) ; encoding: [0xe0,0xc7,0x7a,0xd8,0x01,0x00,0x00,0x05] +0xe0,0xc7,0x7a,0xd8,0x01,0x00,0x00,0x05 + # CHECK: ds_permute_b32 v5, v1, v2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05] 0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05 diff --git a/llvm/test/Transforms/Attributor/address_space_info.ll b/llvm/test/Transforms/Attributor/address_space_info.ll index 73dd93c55b819..0c8b06ac6666a 100644 --- a/llvm/test/Transforms/Attributor/address_space_info.ll +++ b/llvm/test/Transforms/Attributor/address_space_info.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --prefix-filecheck-ir-name true -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefix=CHECK + +; REQUIRES: amdgpu-registered-target @dst = dso_local addrspace(1) externally_initialized global i32 0, align 4 @g1 = dso_local addrspace(1) externally_initialized global ptr null, align 4 diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll 
b/llvm/test/Transforms/Attributor/nocapture-1.ll index 3401ddfdd7d75..de5f31e470edf 100644 --- a/llvm/test/Transforms/Attributor/nocapture-1.ll +++ b/llvm/test/Transforms/Attributor/nocapture-1.ll @@ -257,7 +257,7 @@ define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) { ; TUNIT-NEXT: [[TMP:%.*]] = addrspacecast ptr addrspace(1) [[P]] to ptr ; TUNIT-NEXT: [[TMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]] ; TUNIT-NEXT: [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4 -; TUNIT-NEXT: store i32 0, ptr addrspace(1) [[P]], align 4 +; TUNIT-NEXT: store i32 0, ptr [[TMP]], align 4 ; TUNIT-NEXT: store ptr [[Q]], ptr @g, align 8 ; TUNIT-NEXT: ret i32 [[VAL]] ; @@ -272,7 +272,7 @@ define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) { ; CGSCC-NEXT: [[TMP:%.*]] = addrspacecast ptr addrspace(1) [[P]] to ptr ; CGSCC-NEXT: [[TMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]] ; CGSCC-NEXT: [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4 -; CGSCC-NEXT: store i32 0, ptr addrspace(1) [[P]], align 4 +; CGSCC-NEXT: store i32 0, ptr [[TMP]], align 4 ; CGSCC-NEXT: store ptr [[Q]], ptr @g, align 8 ; CGSCC-NEXT: ret i32 [[VAL]] ; diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index 68f179c88116e..a5789790cc92a 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -838,8 +838,7 @@ define void @user() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@user ; TUNIT-SAME: () #[[ATTR5]] { -; TUNIT-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspacecast (ptr addrspace(3) @ConstAS3Ptr to ptr) to ptr addrspace(3) -; TUNIT-NEXT: store i32 0, ptr addrspace(3) [[TMP1]], align 4 +; TUNIT-NEXT: store i32 0, ptr addrspacecast (ptr addrspace(3) @ConstAS3Ptr to ptr), align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn 
memory(write) diff --git a/llvm/test/Transforms/GlobalOpt/externally-initialized.ll b/llvm/test/Transforms/GlobalOpt/externally-initialized.ll index 7a8244ea8297a..f0ee7830f273a 100644 --- a/llvm/test/Transforms/GlobalOpt/externally-initialized.ll +++ b/llvm/test/Transforms/GlobalOpt/externally-initialized.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -passes=globalopt | FileCheck %s +; RUN: opt < %s -passes=early-cse | opt -S -passes=globalopt | FileCheck %s --check-prefix=CHECK-CONSTANT ; This global is externally_initialized, which may modify the value between ; it's static initializer and any code in this module being run, so the only @@ -12,6 +13,10 @@ ; CHECK: @b = internal unnamed_addr externally_initialized global i32 undef @b = internal externally_initialized global i32 undef +; This constant global is externally_initialized, which may modify the value +; between its static const initializer and any code in this module being run, so +; the read from it cannot be const propagated +@c = internal externally_initialized constant i32 42 define void @foo() { ; CHECK-LABEL: foo @@ -35,3 +40,11 @@ entry: %val = load i32, ptr @b ret i32 %val } + +define i32 @bam() { +; CHECK-CONSTANT-LABEL: bam +entry: +; CHECK-CONSTANT: %val = load i32, ptr @c + %val = load i32, ptr @c + ret i32 %val +} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/issue110433.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/issue110433.ll new file mode 100644 index 0000000000000..4297d1c27639e --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/issue110433.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s + +define <8 x i1> @load_vector_of_flat_ptr_from_constant(ptr addrspace(4) %ptr) { +; CHECK-LABEL: define <8 x i1> @load_vector_of_flat_ptr_from_constant( +; CHECK-SAME: ptr addrspace(4) [[PTR:%.*]]) { +; CHECK-NEXT: 
[[LD:%.*]] = load <8 x ptr>, ptr addrspace(4) [[PTR]], align 128 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast <8 x ptr> [[LD]] to <8 x ptr addrspace(1)> +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast <8 x ptr addrspace(1)> [[TMP1]] to <8 x ptr> +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x ptr> [[TMP2]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[CMP]] +; + %ld = load <8 x ptr>, ptr addrspace(4) %ptr, align 128 + %cmp = icmp eq <8 x ptr> %ld, zeroinitializer + ret <8 x i1> %cmp +} + +define <8 x i1> @load_vector_of_flat_ptr_from_global(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define <8 x i1> @load_vector_of_flat_ptr_from_global( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: [[LD:%.*]] = load <8 x ptr>, ptr addrspace(1) [[PTR]], align 128 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x ptr> [[LD]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[CMP]] +; + %ld = load <8 x ptr>, ptr addrspace(1) %ptr, align 128 + %cmp = icmp eq <8 x ptr> %ld, zeroinitializer + ret <8 x i1> %cmp +} + +define <8 x i1> @load_vector_of_flat_ptr_from_global_invariant(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define <8 x i1> @load_vector_of_flat_ptr_from_global_invariant( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: [[LD:%.*]] = load <8 x ptr>, ptr addrspace(1) [[PTR]], align 128, !invariant [[META0:![0-9]+]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x ptr> [[LD]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[CMP]] +; + %ld = load <8 x ptr>, ptr addrspace(1) %ptr, align 128, !invariant !0 + %cmp = icmp eq <8 x ptr> %ld, zeroinitializer + ret <8 x i1> %cmp +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. 
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll index 3ba21abb069ba..7ce43908c62cd 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll @@ -1223,3 +1223,110 @@ define <2 x i1> @mul_mixed_nsw_nuw_xy_z_setnonzero_vec_ule(<2 x i8> %x, <2 x i8> %cmp = icmp ule <2 x i8> %muly, %mulx ret <2 x i1> %cmp } + +define i1 @icmp_eq_mul_nsw_nonequal(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_eq_mul_nsw_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[C:%.*]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nsw i8 %b, %c + %cmp = icmp eq i8 %mul1, %mul2 + ret i1 %cmp +} + +define i1 @icmp_eq_mul_nuw_nonequal(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_eq_mul_nuw_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[C:%.*]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nuw i8 %a, %c + %mul2 = mul nuw i8 %b, %c + %cmp = icmp eq i8 %mul1, %mul2 + ret i1 %cmp +} + +define i1 @icmp_eq_mul_nsw_nonequal_commuted(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_eq_mul_nsw_nonequal_commuted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[C:%.*]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nsw i8 %c, %b + %cmp = icmp eq i8 %mul1, %mul2 + ret i1 %cmp +} + +define i1 @icmp_ne_mul_nsw_nonequal(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_ne_mul_nsw_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[C:%.*]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nsw i8 %b, %c + %cmp = icmp ne i8 %mul1, %mul2 + ret i1 %cmp +} + +; Negative tests + +define i1 @icmp_eq_mul_nsw_mayequal(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: @icmp_eq_mul_nsw_mayequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i8 [[A:%.*]], [[C:%.*]] 
+; CHECK-NEXT: [[MUL2:%.*]] = mul nsw i8 [[B:%.*]], [[C]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[MUL1]], [[MUL2]] +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nsw i8 %b, %c + %cmp = icmp eq i8 %mul1, %mul2 + ret i1 %cmp +} + +define i1 @icmp_eq_mul_nsw_nuw_nonequal(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_eq_mul_nsw_nuw_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = add i8 [[A:%.*]], 1 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i8 [[A]], [[C:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw i8 [[B]], [[C]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[MUL1]], [[MUL2]] +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nuw i8 %b, %c + %cmp = icmp eq i8 %mul1, %mul2 + ret i1 %cmp +} + +define i1 @icmp_ult_mul_nsw_nonequal(i8 %a, i8 %c) { +; CHECK-LABEL: @icmp_ult_mul_nsw_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = add i8 [[A:%.*]], 1 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i8 [[A]], [[C:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nsw i8 [[B]], [[C]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[MUL1]], [[MUL2]] +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %b = add i8 %a, 1 + %mul1 = mul nsw i8 %a, %c + %mul2 = mul nsw i8 %b, %c + %cmp = icmp ult i8 %mul1, %mul2 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll index e9e105b91f3c1..5d32158e61715 100644 --- a/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll @@ -18,6 +18,23 @@ define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) { ret i32 %t } +define i32 @shrinkShiftExtractElt_i64_to_i32_0(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i32_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32> +; LE-NEXT: [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i64 1 +; LE-NEXT: ret i32 [[T]] 
+; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i32_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32> +; BE-NEXT: [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i64 0 +; BE-NEXT: ret i32 [[T]] +; + %e = extractelement <3 x i64> %x, i32 0 + %s = lshr i64 %e, 32 + %t = trunc i64 %s to i32 + ret i32 %t +} + define i32 @vscale_shrinkExtractElt_i64_to_i32_0( %x) { ; LE-LABEL: @vscale_shrinkExtractElt_i64_to_i32_0( ; LE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to @@ -34,6 +51,22 @@ define i32 @vscale_shrinkExtractElt_i64_to_i32_0( %x) { ret i32 %t } +define i32 @vscale_shrinkShiftExtractElt_i64_to_i32_0( %x) { +; LE-LABEL: @vscale_shrinkShiftExtractElt_i64_to_i32_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to +; LE-NEXT: [[T:%.*]] = extractelement [[TMP1]], i64 1 +; LE-NEXT: ret i32 [[T]] +; +; BE-LABEL: @vscale_shrinkShiftExtractElt_i64_to_i32_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to +; BE-NEXT: [[T:%.*]] = extractelement [[TMP1]], i64 0 +; BE-NEXT: ret i32 [[T]] +; + %e = extractelement %x, i32 0 + %s = lshr i64 %e, 32 + %t = trunc i64 %s to i32 + ret i32 %t +} define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) { ; LE-LABEL: @shrinkExtractElt_i64_to_i32_1( @@ -83,6 +116,23 @@ define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) { ret i16 %t } +define i16 @shrinkShiftExtractElt_i64_to_i16_0(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i16_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> +; LE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 3 +; LE-NEXT: ret i16 [[T]] +; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i16_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> +; BE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 0 +; BE-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i16 0 + %s = ashr i64 %e, 48 + %t = trunc i64 %s to i16 + ret i16 %t +} + define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) { ; LE-LABEL: 
@shrinkExtractElt_i64_to_i16_1( ; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> @@ -157,6 +207,20 @@ define i30 @shrinkExtractElt_i40_to_i30_1(<3 x i40> %x) { ret i30 %t } +; Do not optimize if the shift amount isn't a whole number of truncated bits. +define i16 @shrinkShiftExtractElt_i64_to_i16_0_badshift(<3 x i64> %x) { +; ANY-LABEL: @shrinkShiftExtractElt_i64_to_i16_0_badshift( +; ANY-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 0 +; ANY-NEXT: [[S:%.*]] = lshr i64 [[E]], 31 +; ANY-NEXT: [[T:%.*]] = trunc i64 [[S]] to i16 +; ANY-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i16 0 + %s = lshr i64 %e, 31 + %t = trunc i64 %s to i16 + ret i16 %t +} + ; Do not canonicalize if that would increase the instruction count. declare void @use(i64) define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) { @@ -172,6 +236,45 @@ define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) { ret i16 %t } +; Do not canonicalize if that would increase the instruction count. +define i16 @shrinkShiftExtractElt_i64_to_i16_2_extra_shift_use(<3 x i64> %x) { +; ANY-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_shift_use( +; ANY-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; ANY-NEXT: [[S:%.*]] = lshr i64 [[E]], 48 +; ANY-NEXT: call void @use(i64 [[S]]) +; ANY-NEXT: [[T:%.*]] = trunc nuw i64 [[S]] to i16 +; ANY-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i64 2 + %s = lshr i64 %e, 48 + call void @use(i64 %s) + %t = trunc i64 %s to i16 + ret i16 %t +} + +; OK to reuse the extract if we remove the shift+trunc. 
+define i16 @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use( +; LE-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; LE-NEXT: call void @use(i64 [[E]]) +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X]] to <12 x i16> +; LE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 11 +; LE-NEXT: ret i16 [[T]] +; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use( +; BE-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; BE-NEXT: call void @use(i64 [[E]]) +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X]] to <12 x i16> +; BE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 8 +; BE-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i64 2 + call void @use(i64 %e) + %s = lshr i64 %e, 48 + %t = trunc i64 %s to i16 + ret i16 %t +} + ; Check to ensure PR45314 remains fixed. define <4 x i64> @PR45314(<4 x i64> %x) { ; LE-LABEL: @PR45314( diff --git a/llvm/test/Transforms/InstCombine/trunc-extractelement.ll b/llvm/test/Transforms/InstCombine/trunc-extractelement.ll index 5e62ca9cd591d..ba2d07009d9c7 100644 --- a/llvm/test/Transforms/InstCombine/trunc-extractelement.ll +++ b/llvm/test/Transforms/InstCombine/trunc-extractelement.ll @@ -18,6 +18,23 @@ define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) { ret i32 %t } +define i32 @shrinkShiftExtractElt_i64_to_i32_0(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i32_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32> +; LE-NEXT: [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i64 1 +; LE-NEXT: ret i32 [[T]] +; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i32_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32> +; BE-NEXT: [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i64 0 +; BE-NEXT: ret i32 [[T]] +; + %e = extractelement <3 x i64> %x, i32 0 + %s = lshr i64 %e, 32 + %t = trunc i64 %s to i32 + ret i32 %t +} + define i32 
@vscale_shrinkExtractElt_i64_to_i32_0( %x) { ; LE-LABEL: @vscale_shrinkExtractElt_i64_to_i32_0( ; LE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to @@ -34,6 +51,22 @@ define i32 @vscale_shrinkExtractElt_i64_to_i32_0( %x) { ret i32 %t } +define i32 @vscale_shrinkShiftExtractElt_i64_to_i32_0( %x) { +; LE-LABEL: @vscale_shrinkShiftExtractElt_i64_to_i32_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to +; LE-NEXT: [[T:%.*]] = extractelement [[TMP1]], i64 1 +; LE-NEXT: ret i32 [[T]] +; +; BE-LABEL: @vscale_shrinkShiftExtractElt_i64_to_i32_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast [[X:%.*]] to +; BE-NEXT: [[T:%.*]] = extractelement [[TMP1]], i64 0 +; BE-NEXT: ret i32 [[T]] +; + %e = extractelement %x, i32 0 + %s = lshr i64 %e, 32 + %t = trunc i64 %s to i32 + ret i32 %t +} define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) { ; LE-LABEL: @shrinkExtractElt_i64_to_i32_1( @@ -83,6 +116,23 @@ define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) { ret i16 %t } +define i16 @shrinkShiftExtractElt_i64_to_i16_0(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i16_0( +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> +; LE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 3 +; LE-NEXT: ret i16 [[T]] +; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i16_0( +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> +; BE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 0 +; BE-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i16 0 + %s = ashr i64 %e, 48 + %t = trunc i64 %s to i16 + ret i16 %t +} + define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) { ; LE-LABEL: @shrinkExtractElt_i64_to_i16_1( ; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16> @@ -157,6 +207,20 @@ define i30 @shrinkExtractElt_i40_to_i30_1(<3 x i40> %x) { ret i30 %t } +; Do not optimize if the shift amount isn't a whole number of truncated bits. 
+define i16 @shrinkShiftExtractElt_i64_to_i16_0_badshift(<3 x i64> %x) { +; ANY-LABEL: @shrinkShiftExtractElt_i64_to_i16_0_badshift( +; ANY-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 0 +; ANY-NEXT: [[S:%.*]] = lshr i64 [[E]], 31 +; ANY-NEXT: [[T:%.*]] = trunc i64 [[S]] to i16 +; ANY-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i16 0 + %s = lshr i64 %e, 31 + %t = trunc i64 %s to i16 + ret i16 %t +} + ; Do not canonicalize if that would increase the instruction count. declare void @use(i64) define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) { @@ -172,6 +236,45 @@ define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) { ret i16 %t } +; Do not canonicalize if that would increase the instruction count. +define i16 @shrinkShiftExtractElt_i64_to_i16_2_extra_shift_use(<3 x i64> %x) { +; ANY-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_shift_use( +; ANY-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; ANY-NEXT: [[S:%.*]] = lshr i64 [[E]], 48 +; ANY-NEXT: call void @use(i64 [[S]]) +; ANY-NEXT: [[T:%.*]] = trunc nuw i64 [[S]] to i16 +; ANY-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i64 2 + %s = lshr i64 %e, 48 + call void @use(i64 %s) + %t = trunc i64 %s to i16 + ret i16 %t +} + +; OK to reuse the extract if we remove the shift+trunc. 
+define i16 @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use(<3 x i64> %x) { +; LE-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use( +; LE-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; LE-NEXT: call void @use(i64 [[E]]) +; LE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X]] to <12 x i16> +; LE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 11 +; LE-NEXT: ret i16 [[T]] +; +; BE-LABEL: @shrinkShiftExtractElt_i64_to_i16_2_extra_extract_use( +; BE-NEXT: [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2 +; BE-NEXT: call void @use(i64 [[E]]) +; BE-NEXT: [[TMP1:%.*]] = bitcast <3 x i64> [[X]] to <12 x i16> +; BE-NEXT: [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i64 8 +; BE-NEXT: ret i16 [[T]] +; + %e = extractelement <3 x i64> %x, i64 2 + call void @use(i64 %e) + %s = lshr i64 %e, 48 + %t = trunc i64 %s to i16 + ret i16 %t +} + ; Check to ensure PR45314 remains fixed. define <4 x i64> @PR45314(<4 x i64> %x) { ; LE-LABEL: @PR45314( diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg new file mode 100644 index 0000000000000..78dd74cd6dc63 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg @@ -0,0 +1,2 @@ +if not "SPIRV" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll new file mode 100644 index 0000000000000..dd3a4d9699fdb --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll @@ -0,0 +1,105 @@ +; RUN: opt -passes=loop-idiom -mtriple=spirv32-- -S < %s | FileCheck %s +; RUN: opt -passes=loop-idiom -mtriple=spirv64-- -S < %s | FileCheck %s + +; Mostly copied from x86 version. 
+ +;To recognize this pattern: +;int popcount(unsigned long long a) { +; int c = 0; +; while (a) { +; c++; +; a &= a - 1; +; } +; return c; +;} +; + +; CHECK-LABEL: @popcount_i64 +; CHECK: entry +; CHECK: llvm.ctpop.i64 +; CHECK: ret +define i32 @popcount_i64(i64 %a) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i64 %a, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.05, 1 + %sub = add i64 %a.addr.04, -1 + %and = and i64 %sub, %a.addr.04 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + ret i32 %c.0.lcssa +} + +; CHECK-LABEL: @popcount_i32 +; CHECK: entry +; CHECK: llvm.ctpop.i32 +; CHECK: ret +define i32 @popcount_i32(i32 %a) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i32 %a, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.04 = phi i32 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.05, 1 + %sub = add i32 %a.addr.04, -1 + %and = and i32 %sub, %a.addr.04 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + ret i32 %c.0.lcssa +} + +; To recognize this pattern: +;int popcount(unsigned long long a, int mydata1, int mydata2) { +; int c = 0; +; while (a) { +; c++; +; a &= a - 1; +; mydata1 *= c; +; mydata2 *= (int)a; +; } +; return c + mydata1 + mydata2; +;} + +; CHECK-LABEL: @popcount2 +; CHECK: entry +; CHECK: llvm.ctpop.i64 +; CHECK: ret +define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp { +entry: + 
%tobool9 = icmp eq i64 %a, 0 + br i1 %tobool9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ] + %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ] + %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.013, 1 + %sub = add i64 %a.addr.010, -1 + %and = and i64 %sub, %a.addr.010 + %mul = mul nsw i32 %inc, %mydata1.addr.011 + %conv = trunc i64 %and to i32 + %mul1 = mul nsw i32 %conv, %mydata2.addr.012 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ] + %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ] + %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa + %add2 = add i32 %add, %c.0.lcssa + ret i32 %add2 +} diff --git a/llvm/test/Transforms/LoopLoadElim/unknown-stride-known-dep.ll b/llvm/test/Transforms/LoopLoadElim/unknown-stride-known-dep.ll new file mode 100644 index 0000000000000..e7b0968c8e826 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/unknown-stride-known-dep.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-load-elim -S %s | FileCheck %s + +; The test was originally written as part of the investigation of #96656. +; The bug has now been marked as invalid, and we keep the test to show +; LLE's operation on known dependence returned by LAA. 
+ +define void @unknown_stride_known_dependence(ptr %x, ptr %y, i1 %cond) { +; CHECK-LABEL: define void @unknown_stride_known_dependence( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[X]], align 4 +; CHECK-NEXT: br i1 [[COND]], label %[[NOLOOP_EXIT:.*]], label %[[LOOP_LVER_CHECK:.*]] +; CHECK: [[LOOP_LVER_CHECK]]: +; CHECK-NEXT: [[SEXT_X:%.*]] = sext i32 [[LOAD]] to i64 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[Y]], i64 8 +; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[Y]], i64 16 +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[LOAD]], 1 +; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[LOOP_PH_LVER_ORIG:.*]], label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[LOOP_LVER_ORIG:.*]] +; CHECK: [[LOOP_LVER_ORIG]]: +; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i64 [ 0, %[[LOOP_PH_LVER_ORIG]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], %[[LOOP_LVER_ORIG]] ] +; CHECK-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i64 [[IV_LVER_ORIG]], [[SEXT_X]] +; CHECK-NEXT: [[GEP_8_MUL_LVER_ORIG:%.*]] = getelementptr double, ptr [[GEP_8]], i64 [[MUL_LVER_ORIG]] +; CHECK-NEXT: [[LOAD_8_LVER_ORIG:%.*]] = load double, ptr [[GEP_8_MUL_LVER_ORIG]], align 8 +; CHECK-NEXT: [[GEP_16_MUL_LVER_ORIG:%.*]] = getelementptr double, ptr [[GEP_16]], i64 [[MUL_LVER_ORIG]] +; CHECK-NEXT: store double [[LOAD_8_LVER_ORIG]], ptr [[GEP_16_MUL_LVER_ORIG]], align 8 +; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add i64 [[IV_LVER_ORIG]], 1 +; CHECK-NEXT: [[ICMP_LVER_ORIG:%.*]] = icmp eq i64 [[IV_LVER_ORIG]], 1 +; CHECK-NEXT: br i1 [[ICMP_LVER_ORIG]], label %[[EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[LOOP_LVER_ORIG]] +; CHECK: [[LOOP_PH]]: +; CHECK-NEXT: [[LOAD_INITIAL:%.*]] = load double, ptr [[GEP_8]], align 8 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[STORE_FORWARDED:%.*]] = phi double [ [[LOAD_INITIAL]], %[[LOOP_PH]] ], [ [[STORE_FORWARDED]], %[[LOOP]] ] +; CHECK-NEXT: 
[[IV:%.*]] = phi i64 [ 0, %[[LOOP_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[IV]], [[SEXT_X]] +; CHECK-NEXT: [[GEP_8_MUL:%.*]] = getelementptr double, ptr [[GEP_8]], i64 [[MUL]] +; CHECK-NEXT: [[LOAD_8:%.*]] = load double, ptr [[GEP_8_MUL]], align 8 +; CHECK-NEXT: [[GEP_16_MUL:%.*]] = getelementptr double, ptr [[GEP_16]], i64 [[MUL]] +; CHECK-NEXT: store double [[STORE_FORWARDED]], ptr [[GEP_16_MUL]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[ICMP]], label %[[EXIT_LOOPEXIT_LOOPEXIT1:.*]], label %[[LOOP]] +; CHECK: [[NOLOOP_EXIT]]: +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64 +; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr double, ptr [[Y]], i64 [[SEXT]] +; CHECK-NEXT: [[LOAD_Y:%.*]] = load double, ptr [[GEP_Y]], align 8 +; CHECK-NEXT: store double [[LOAD_Y]], ptr [[X]], align 8 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT_LOOPEXIT1]]: +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %x, align 4 + br i1 %cond, label %noloop.exit, label %loop.ph + +loop.ph: ; preds = %entry + %sext.x = sext i32 %load to i64 + %gep.8 = getelementptr i8, ptr %y, i64 8 + %gep.16 = getelementptr i8, ptr %y, i64 16 + br label %loop + +loop: ; preds = %loop, %loop.ph + %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ] + %mul = mul i64 %iv, %sext.x + %gep.8.mul = getelementptr double, ptr %gep.8, i64 %mul + %load.8 = load double, ptr %gep.8.mul, align 8 + %gep.16.mul = getelementptr double, ptr %gep.16, i64 %mul + store double %load.8, ptr %gep.16.mul + %iv.next = add i64 %iv, 1 + %icmp = icmp eq i64 %iv, 1 + br i1 %icmp, label %exit, label %loop + +noloop.exit: ; preds = %loop.ph + %sext = sext i32 %load to i64 + %gep.y = 
getelementptr double, ptr %y, i64 %sext + %load.y = load double, ptr %gep.y + store double %load.y, ptr %x + br label %exit + +exit: ; preds = %loop.body + ret void +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll index 272e47b3ee1e7..5eebf13313555 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -4514,7 +4514,7 @@ define void @test8() { ; EPILOG-NEXT: %i4.7 = add nuw nsw i64 %i3, 8 ; EPILOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.7 ; EPILOG: latch.7: -; EPILOG-NEXT: %niter.next.7 = add i64 %niter, 8 +; EPILOG-NEXT: %niter.next.7 = add nuw nsw i64 %niter, 8 ; EPILOG-NEXT: %niter.ncmp.7 = icmp ne i64 %niter.next.7, %unroll_iter ; EPILOG-NEXT: br i1 %niter.ncmp.7, label %innerH, label %exit.unr-lcssa.loopexit ; EPILOG: exit.unr-lcssa.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicated-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/predicated-instruction-cost.ll new file mode 100644 index 0000000000000..0072dd95bd098 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/predicated-instruction-cost.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/110295. 
+define void @predicated_urem_shl_cost(ptr %A, i32 %x, i1 %c) { +; CHECK-LABEL: define void @predicated_urem_shl_cost( +; CHECK-SAME: ptr [[A:%.*]], i32 [[X:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[REM:%.*]] = urem i32 2, [[X]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[L]], [[REM]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %[[LOOP_HEADER]] ], [ [[SHL]], %[[THEN]] ] +; CHECK-NEXT: store i32 [[P]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %A, i32 %iv + %l = load i32, ptr %gep + br i1 %c, label %then, label %loop.latch + +then: + %rem = urem i32 2, %x + %shl = shl i32 %l, %rem + br label %loop.latch + +loop.latch: + %p = phi i32 [ 0, %loop.header ], [ %shl, %then ] + store i32 %p, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 0 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index 0d5871e24c524..383b62b368ef0 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -14,11 +15,59 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fadd_select1( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]] -; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] define float @fcmp_0_fadd_select1(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_0_fadd_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], 
zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt float [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[ADD]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ 
[[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -56,11 +105,59 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fadd_select2( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]] -; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] define double @fcmp_0_fadd_select2(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_0_fadd_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr 
[[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt double [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP8]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[ADD]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop 
[[LOOP5:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -99,11 +196,61 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_val_fadd_select1( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat -; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]] -; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] define float @fcmp_val_fadd_select1(ptr noalias %x, float %y, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_val_fadd_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], float [[Y:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] 
= phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt float [[TMP8]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast 
float [[TMP8]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[ADD]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -142,11 +289,61 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_val_fadd_select2( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat -; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]] -; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] define double @fcmp_val_fadd_select2(ptr noalias %x, double %y, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_val_fadd_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], double [[Y:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double 
[[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; 
CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt double [[TMP8]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP8]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[ADD]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -186,11 +383,64 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_array_elm_fadd_select1( -; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]] -; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]] -; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]] define float @fcmp_array_elm_fadd_select1(ptr noalias %x, ptr noalias %y, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_array_elm_fadd_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], ptr noalias [[Y:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 
[[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = 
phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[ADD]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP12]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -232,11 +482,64 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_array_elm_fadd_select2( -; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]] -; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]] -; 
CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]] define double @fcmp_array_elm_fadd_select2(ptr noalias %x, ptr noalias %y, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_array_elm_fadd_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], ptr noalias [[Y:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt double [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP10]], [[SUM_1]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[ADD]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP9]], 
%[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP12]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -276,11 +579,59 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fsub_select1( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fsub fast <4 x float> %[[V2:.*]], %[[V0]] -; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] define float @fcmp_0_fsub_select1(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_0_fsub_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = 
fcmp fast ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt float [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[SUM_1]], [[TMP8]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[SUB]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; 
CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -316,9 +667,33 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fsub_select1_novectorize( -; CHECK-NOT: <4 x float> define float @fcmp_0_fsub_select1_novectorize(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_0_fsub_select1_novectorize( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_HEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ 0.000000e+00, %[[FOR_HEADER]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[SUM_1]], [[TMP0]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[SUB]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ [[SUM_2]], 
%[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP1]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -356,11 +731,59 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fsub_select2( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fsub fast <4 x double> %[[V2:.*]], %[[V0]] -; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] define double @fcmp_0_fsub_select2(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_0_fsub_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] 
= fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt double [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[SUM_1]], [[TMP8]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[SUB]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: 
[[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -397,9 +820,33 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fsub_select2_notvectorize( -; CHECK-NOT: <4 x doubole> define double @fcmp_0_fsub_select2_notvectorize(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_0_fsub_select2_notvectorize( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_HEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ 0.000000e+00, %[[FOR_HEADER]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt double [[TMP0]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[SUM_1]], [[TMP0]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[SUB]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: 
[[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP1]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -437,11 +884,59 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fmult_select1( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fmul fast <4 x float> %[[V2:.*]], %[[V0]] -; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] define float @fcmp_0_fmult_select1(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_0_fmult_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; 
CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt float [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[MULT:%.*]] = fmul fast float [[SUM_1]], [[TMP8]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[MULT]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: 
[[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi float [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -478,9 +973,33 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fmult_select1_notvectorize( -; CHECK-NOT: <4 x float> define float @fcmp_0_fmult_select1_notvectorize(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define float @fcmp_0_fmult_select1_notvectorize( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_HEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi float [ 0.000000e+00, %[[FOR_HEADER]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00 +; CHECK-NEXT: [[MULT:%.*]] = fmul float [[SUM_1]], [[TMP0]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], float [[MULT]], float [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] 
= phi float [ [[SUM_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[TMP1]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -518,11 +1037,59 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fmult_select2( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = fmul fast <4 x double> %[[V2:.*]], %[[V0]] -; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] define double @fcmp_0_fmult_select2(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_0_fmult_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: 
[[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp fast ogt double [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[MULT:%.*]] = fmul fast double [[SUM_1]], [[TMP8]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[MULT]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: 
[[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -559,9 +1126,33 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_0_fmult_select2_notvectorize( -; CHECK-NOT: <4 x double> define double @fcmp_0_fmult_select2_notvectorize(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define double @fcmp_0_fmult_select2_notvectorize( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_HEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi double [ 0.000000e+00, %[[FOR_HEADER]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt double [[TMP0]], 0.000000e+00 +; CHECK-NEXT: [[MULT:%.*]] = fmul double [[SUM_1]], [[TMP0]] +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], double [[MULT]], double [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: 
[[SUM_2_LCSSA:%.*]] = phi double [ [[SUM_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret double [[TMP1]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -604,18 +1195,77 @@ for.end: ; preds = %for.body, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_multi( -; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[C1]], %[[V0]], %[[C2]], %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer -; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], %[[V0]], %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]] -; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]] -; CHECK: fadd fast <4 x float> %[[S2]], define float @fcmp_multi(ptr nocapture readonly %a, i32 %n) nounwind readonly { +; CHECK-LABEL: define float @fcmp_multi( +; CHECK-SAME: ptr nocapture readonly [[A:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP10]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP9]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[WIDE_LOAD]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[TMP10]] = fadd fast <4 x float> [[PREDPHI1]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP10]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] 
+; CHECK-NEXT: [[SUM_011:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP13]], 1.000000e+00 +; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[CMP8:%.*]] = fcmp olt float [[TMP13]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP8]], label %[[IF_THEN10:.*]], label %[[IF_ELSE14:.*]] +; CHECK: [[IF_THEN10]]: +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP13]], 2.000000e+00 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[IF_ELSE14]]: +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP13]], 3.000000e+00 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[DOTPN:%.*]] = phi float [ [[MUL]], %[[IF_THEN10]] ], [ [[MUL17]], %[[IF_ELSE14]] ], [ [[TMP13]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1]] = fadd fast float [[DOTPN]], [[SUM_011]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ [[SUM_1]], %[[FOR_INC]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_1_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[SUM_0_LCSSA]] +; entry: %cmp10 = icmp sgt i32 %n, 0 br i1 %cmp10, label %for.body.preheader, label %for.end @@ -671,17 +1321,75 @@ for.end: ; preds = %for.inc, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_fadd_fsub( -; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[C1]], %[[V0]], %[[C2]], -; 
CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> -; CHECK-DAG: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer -; CHECK: %[[S1:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[ADD]], <4 x float> %[[SUB]] -; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonly { +; CHECK-LABEL: define float @fcmp_fadd_fsub( +; CHECK-SAME: ptr nocapture readonly [[A:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI1:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = 
fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[PREDPHI1]] = select <4 x i1> [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI1]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[SUM_010:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+00 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP12]], [[SUM_010]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[IF_ELSE]]: +; 
CHECK-NEXT: [[CMP8:%.*]] = fcmp olt float [[TMP12]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP8]], label %[[IF_THEN10:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN10]]: +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[SUM_010]], [[TMP12]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[SUM_1]] = phi float [ [[ADD]], %[[IF_THEN]] ], [ [[SUB]], %[[IF_THEN10]] ], [ [[SUM_010]], %[[IF_ELSE]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ [[SUM_1]], %[[FOR_INC]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_1_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[SUM_0_LCSSA]] +; entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.preheader, label %for.end @@ -736,9 +1444,43 @@ for.end: ; preds = %for.inc, %entry ; return sum; ; } -; CHECK-LABEL: @fcmp_fadd_fmul( -; CHECK-NOT: <4 x float> define float @fcmp_fadd_fmul(ptr nocapture readonly %a, i32 %n) nounwind readonly { +; CHECK-LABEL: define float @fcmp_fadd_fmul( +; CHECK-SAME: ptr nocapture readonly [[A:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[SUM_010:%.*]] = phi float [ 0.000000e+00, 
%[[FOR_BODY_PREHEADER]] ], [ [[SUM_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[SUM_010]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[CMP8:%.*]] = fcmp olt float [[TMP0]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP8]], label %[[IF_THEN10:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN10]]: +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP0]], [[SUM_010]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[SUM_1]] = phi float [ [[ADD]], %[[IF_THEN]] ], [ [[MUL]], %[[IF_THEN10]] ], [ [[SUM_010]], %[[IF_ELSE]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ [[SUM_1]], %[[FOR_INC]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_1_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[SUM_0_LCSSA]] +; entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.preheader, label %for.end @@ -792,8 +1534,30 @@ for.end: ; preds = %for.inc, %entry ; } define float @fcmp_store_back(ptr nocapture %a, i32 %LEN) nounwind readonly { -; CHECK-LABEL: @fcmp_store_back( -; CHECK-NOT: <4 x float> +; CHECK-LABEL: define float @fcmp_store_back( +; CHECK-SAME: ptr nocapture [[A:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[LEN]], 0 
+; CHECK-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_08:%.*]] = phi float [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP0]], [[SUM_08]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[SUM_0_LCSSA]] ; entry: %cmp7 = icmp sgt i32 %LEN, 0 @@ -819,11 +1583,58 @@ for.end: ; preds = %for.body, %entry ret float %sum.0.lcssa } -; CHECK-LABEL: @fcmp_0_add_select2( -; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = add <4 x i64> %[[V2:.*]], -; CHECK: select <4 x i1> %[[V1]], <4 x i64> %[[V3]], <4 x i64> %[[V2]] define i64 @fcmp_0_add_select2(ptr noalias %x, i64 %N) nounwind readonly { +; CHECK-LABEL: define i64 @fcmp_0_add_select2( +; CHECK-SAME: ptr noalias [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], 
label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[VEC_PHI]], +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP4]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt float [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SUM_1]], 2 +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], i64 [[ADD]], i64 [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi i64 [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[TMP9]] +; entry: %cmp.1 = icmp sgt i64 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -848,11 +1659,64 @@ for.end: ; preds = %for.body, %entry ret i64 %1 } -; CHECK-LABEL: @fcmp_0_sub_select1( -; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = sub <4 x i32> %[[V2:.*]], -; CHECK: select <4 x i1> %[[V1]], <4 x i32> %[[V3]], <4 x i32> %[[V2]] define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define i32 @fcmp_0_sub_select1( +; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; 
CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[ZEXT]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = sub i64 0, [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 0, [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[REVERSE]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt float [[TMP10]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[SUM_1]], 2 +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], i32 [[SUB]], i32 [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = sub nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[TMP11]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -878,11 +1742,59 @@ for.end: ; preds = %for.body, %entry ret i32 %1 } -; CHECK-LABEL: @fcmp_0_mult_select1( -; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer -; CHECK: %[[V3:.*]] = mul <4 x i32> %[[V2:.*]], -; CHECK: select <4 x i1> %[[V1]], <4 x i32> %[[V3]], <4 x i32> %[[V2]] define i32 @fcmp_0_mult_select1(ptr noalias %x, i32 %N) nounwind readonly { +; CHECK-LABEL: define i32 @fcmp_0_mult_select1( +; CHECK-SAME: ptr 
noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[FOR_HEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_HEADER]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 
0, %[[FOR_HEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_HEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_2:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_2:%.*]] = fcmp ogt float [[TMP8]], 0.000000e+00 +; CHECK-NEXT: [[MULT:%.*]] = mul nsw i32 [[SUM_1]], 2 +; CHECK-NEXT: [[SUM_2]] = select i1 [[CMP_2]], i32 [[MULT]], i32 [[SUM_1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_2]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_2_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[TMP9]] +; entry: %cmp.1 = icmp sgt i32 %N, 0 br i1 %cmp.1, label %for.header, label %for.end @@ -908,11 +1820,27 @@ for.end: ; preds = %for.body, %entry ret i32 %1 } -@table = constant [13 x i16] [i16 10, i16 35, i16 69, i16 147, i16 280, i16 472, i16 682, i16 1013, i16 1559, i16 2544, i16 4553, i16 6494, i16 10000], align 1 +@table = constant [13 x i16] [i16 10, i16 35, i16 69, i16 147, i16 280, i16 472, i16 682, i16 1013, i16 1559, i16 2544, i16 4553, i16 6494, i16 10000], align 1 -; CHECK-LABEL: @non_reduction_index( -; CHECK-NOT: <4 x i16> define i16 @non_reduction_index(i16 noundef %val) { +; CHECK-LABEL: define 
i16 @non_reduction_index( +; CHECK-SAME: i16 noundef [[VAL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_05:%.*]] = phi i16 [ 12, %[[ENTRY]] ], [ [[SUB:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[K_04:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[I_05]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i16 [[TMP0]], [[VAL]] +; CHECK-NEXT: [[SUB]] = add nsw i16 [[I_05]], -1 +; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i16 [[SUB]], i16 [[K_04]] +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i16 [[SUB]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; entry: br label %for.body @@ -932,11 +1860,27 @@ for.body: ; preds = %entry, %for.body br i1 %cmp.not, label %for.cond.cleanup, label %for.body } -@tablef = constant [13 x half] [half 10.0, half 35.0, half 69.0, half 147.0, half 280.0, half 472.0, half 682.0, half 1013.0, half 1559.0, half 2544.0, half 4556.0, half 6496.0, half 10000.0], align 1 +@tablef = constant [13 x half] [half 10.0, half 35.0, half 69.0, half 147.0, half 280.0, half 472.0, half 682.0, half 1013.0, half 1559.0, half 2544.0, half 4556.0, half 6496.0, half 10000.0], align 1 -; CHECK-LABEL: @non_reduction_index_half( -; CHECK-NOT: <4 x half> define i16 @non_reduction_index_half(half noundef %val) { +; CHECK-LABEL: define i16 @non_reduction_index_half( +; CHECK-SAME: half noundef [[VAL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: ret 
i16 [[SPEC_SELECT_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_05:%.*]] = phi i16 [ 12, %[[ENTRY]] ], [ [[SUB:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[K_04:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[I_05]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ugt half [[TMP0]], [[VAL]] +; CHECK-NEXT: [[SUB]] = add nsw i16 [[I_05]], -1 +; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[FCMP1]], i16 [[SUB]], i16 [[K_04]] +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i16 [[SUB]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; entry: br label %for.body @@ -956,5 +1900,37 @@ for.body: ; preds = %entry, %for.body br i1 %cmp.not, label %for.cond.cleanup, label %for.body } -; Make sure any check-not directives are not triggered by function declarations. -; CHECK: declare +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: 
[[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} +; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll index 13e79a4a47b39..f62c3c7f42ec4 100644 --- a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll +++ b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll @@ -4,18 +4,80 @@ define void @test_ptr_iv_no_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end) { ; CHECK-LABEL: @test_ptr_iv_no_inbounds( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1_START7:%.*]] = ptrtoint ptr [[P1_START:%.*]] to i64 +; CHECK-NEXT: [[P1_END6:%.*]] = ptrtoint ptr [[P1_END:%.*]] to i64 +; CHECK-NEXT: [[P1_START4:%.*]] = ptrtoint ptr [[P1_START]] to i64 +; CHECK-NEXT: [[P1_END3:%.*]] = ptrtoint ptr [[P1_END]] to i64 +; CHECK-NEXT: [[P1_START2:%.*]] = ptrtoint ptr [[P1_START]] to i64 +; CHECK-NEXT: [[P1_END1:%.*]] = ptrtoint ptr [[P1_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P1_END6]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P1_START7]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[P1_END1]] to i2 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[P1_START2]] to i2 +; CHECK-NEXT: [[TMP6:%.*]] = sub i2 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = zext i2 [[TMP6]] to i64 +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[P1_END3]], -4 +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[P1_START4]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P1_START]], 
i64 [[TMP12]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[P2_START:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[P1_START]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P2_START]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P1_START]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[IND_END8:%.*]] = getelementptr i8, ptr [[P2_START]], i64 [[TMP14]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1_START]], i64 [[TMP15]] +; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX10]], 0 +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[P2_START]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[NEXT_GEP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD12]] +; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[TMP17]], align 4, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P1_START]], [[ENTRY:%.*]] ], [ [[P1_START]], [[VECTOR_SCEVCHECK]] ], [ [[P1_START]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END8]], [[MIDDLE_BLOCK]] ], [ [[P2_START]], [[ENTRY]] ], [ [[P2_START]], [[VECTOR_SCEVCHECK]] ], [ [[P2_START]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[P1:%.*]] = phi ptr [ [[P1_START:%.*]], [[ENTRY:%.*]] ], [ [[P1_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[P2:%.*]] = phi ptr [ [[P2_START:%.*]], [[ENTRY]] ], [ [[P2_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[P1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P1_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[P2:%.*]] = phi ptr [ [[BC_RESUME_VAL9]], [[SCALAR_PH]] ], [ [[P2_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[P1_VAL:%.*]] = load float, ptr [[P1]], align 4 ; CHECK-NEXT: [[P2_VAL:%.*]] = load float, ptr [[P2]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = fadd float [[P1_VAL]], [[P2_VAL]] ; CHECK-NEXT: store float [[SUM]], ptr [[P1]], align 4 ; CHECK-NEXT: [[P1_NEXT]] = getelementptr float, ptr [[P1]], i64 1 ; CHECK-NEXT: [[P2_NEXT]] = getelementptr float, ptr [[P2]], i64 1 -; CHECK-NEXT: [[C:%.*]] = icmp ne ptr [[P1_NEXT]], [[P1_END:%.*]] -; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne ptr [[P1_NEXT]], [[P1_END]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -80,14 +142,14 @@ define void @test_ptr_iv_with_inbounds(ptr 
%p1.start, ptr %p2.start, ptr %p1.end ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX8]], 0 ; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[P2_START]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP13]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP13]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[NEXT_GEP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP14]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP14]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD10]] -; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP13]], align 4, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP13]], align 4, !alias.scope [[META9]], !noalias [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -105,7 +167,7 @@ define void @test_ptr_iv_with_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end ; CHECK-NEXT: [[P1_NEXT]] = getelementptr inbounds float, ptr [[P1]], i64 1 ; CHECK-NEXT: [[P2_NEXT]] = getelementptr inbounds float, ptr [[P2]], i64 1 ; CHECK-NEXT: [[C:%.*]] = icmp ne ptr [[P1_NEXT]], [[P1_END]] -; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop 
[[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -155,7 +217,7 @@ define void @store_pointer_induction(ptr %start, ptr %end) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -167,7 +229,7 @@ define void @store_pointer_induction(ptr %start, ptr %end) { ; CHECK-NEXT: store ptr [[IV]], ptr [[IV]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = getelementptr inbounds ptr, ptr [[IV]], i32 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll index 8d407c969b527..6ee2f36dd5b41 100644 --- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; REQUIRES: asserts ; RUN: opt < %s -passes='function(loop-vectorize,instcombine)' -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s @@ -6,17 +7,54 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: 
phi_two_incoming_values ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ] -; CHECK: define void @phi_two_incoming_values( -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{.*}} -; CHECK: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32> -; CHECK-NEXT: [[PREDPHI:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[TMP6]] -; CHECK: store <2 x i32> [[PREDPHI]], ptr {{.*}} +; +define void @phi_two_incoming_values(ptr noalias %a, ptr noalias %b, i64 %n) { +; CHECK-LABEL: define void @phi_two_incoming_values( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], -2 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i1> [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[TMP5]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP3]], align 4 ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[IF_END:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label %[[IF_THEN:.*]], label %[[IF_END]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i32 [[TMP2]], 1 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP2]], %[[FOR_BODY]] ], [ [[TMP5]], %[[IF_THEN]] ] +; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[I]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; -define void @phi_two_incoming_values(ptr %a, ptr %b, i64 %n) { entry: br label %for.body @@ -46,15 +84,63 @@ for.end: ; CHECK-LABEL: phi_three_incoming_values ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] ; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ] -; CHECK: 
define void @phi_three_incoming_values( -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> -; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> [[PREDPHI]], <2 x i32> -; CHECK: store <2 x i32> [[PREDPHI7]], ptr {{.*}} +; +define void @phi_three_incoming_values(ptr noalias %a, ptr noalias %b, i64 %n) { +; CHECK-LABEL: define void @phi_three_incoming_values( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], -2 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[PREDPHI]], <2 x i32> +; CHECK-NEXT: store <2 x i32> [[PREDPHI3]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: 
[[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[IF_END:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[IF_THEN:.*]], label %[[IF_END]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP3]], 19 +; CHECK-NEXT: br i1 [[TMP7]], label %[[IF_END]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP5]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 4, i32 5 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ 9, %[[FOR_BODY]] ], [ 3, %[[IF_THEN]] ], [ [[TMP9]], %[[IF_ELSE]] ] +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP0]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[I]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; -define void @phi_three_incoming_values(ptr %a, ptr %b, i64 %n) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll 
b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index 8cf4e77a0d499..0d9918b74a2ff 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -596,10 +596,10 @@ exit: ; preds = %for.body define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) -; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]]) +; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4 +; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4 ; entry: br label %for.body @@ -625,10 +625,10 @@ exit: define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) -; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4 +; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4 ; entry: br label %for.body @@ -655,10 +655,10 @@ exit: define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) { ; 
CHECK-LABEL: define void @reduc_add_mul_store_different_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) -; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]]) +; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4 +; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4 ; entry: br label %for.body @@ -684,10 +684,10 @@ exit: define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) -; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4 +; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 26974c2307065..2a55f6d756826 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = 
compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph diff --git a/llvm/test/Transforms/LoopVersioning/pr96656.ll b/llvm/test/Transforms/LoopVersioning/single-iteration.ll similarity index 59% rename from llvm/test/Transforms/LoopVersioning/pr96656.ll rename to llvm/test/Transforms/LoopVersioning/single-iteration.ll index 0264fe40a9430..ffb9c7d4cd5ab 100644 --- a/llvm/test/Transforms/LoopVersioning/pr96656.ll +++ b/llvm/test/Transforms/LoopVersioning/single-iteration.ll @@ -1,16 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=loop-versioning -S %s | FileCheck %s -define void @lver.check.unnecessary(ptr %arg, ptr %arg1, i1 %arg2) { -; CHECK-LABEL: define void @lver.check.unnecessary( -; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]], i1 [[ARG2:%.*]]) { +; Callers should not call LoopVersioning on single-iteration loops, as it +; is very likely not profitable. +; LoopVersioning faithfully versions single-iteration loops when the stride +; is unknown. 
+ +define double @single_iteration_unknown_stride(i32 %x, ptr %y, i1 %cond) { +; CHECK-LABEL: define double @single_iteration_unknown_stride( +; CHECK-SAME: i32 [[X:%.*]], ptr [[Y:%.*]], i1 [[COND:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ARG]], align 4 -; CHECK-NEXT: br i1 [[ARG2]], label %[[NOLOOP_EXIT:.*]], label %[[LOOP_BODY_LVER_CHECK:.*]] +; CHECK-NEXT: br i1 [[COND]], label %[[NOLOOP_EXIT:.*]], label %[[LOOP_BODY_LVER_CHECK:.*]] ; CHECK: [[LOOP_BODY_LVER_CHECK]]: -; CHECK-NEXT: [[SEXT7:%.*]] = sext i32 [[LOAD]] to i64 -; CHECK-NEXT: [[GEP8:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8 -; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[LOAD]], 1 +; CHECK-NEXT: [[SEXT7:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[GEP8:%.*]] = getelementptr i8, ptr [[Y]], i64 8 +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[X]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[LOOP_BODY_PH_LVER_ORIG:.*]], label %[[LOOP_BODY_PH:.*]] ; CHECK: [[LOOP_BODY_PH_LVER_ORIG]]: ; CHECK-NEXT: br label %[[LOOP_BODY_LVER_ORIG:.*]] @@ -19,7 +23,7 @@ define void @lver.check.unnecessary(ptr %arg, ptr %arg1, i1 %arg2) { ; CHECK-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i64 [[PHI_LVER_ORIG]], [[SEXT7]] ; CHECK-NEXT: [[GEP10_LVER_ORIG:%.*]] = getelementptr double, ptr [[GEP8]], i64 [[MUL_LVER_ORIG]] ; CHECK-NEXT: [[LOAD11_LVER_ORIG:%.*]] = load double, ptr [[GEP10_LVER_ORIG]], align 8 -; CHECK-NEXT: store double [[LOAD11_LVER_ORIG]], ptr [[ARG1]], align 8 +; CHECK-NEXT: store double [[LOAD11_LVER_ORIG]], ptr [[Y]], align 8 ; CHECK-NEXT: [[ADD_LVER_ORIG]] = add i64 [[PHI_LVER_ORIG]], 1 ; CHECK-NEXT: [[ICMP_LVER_ORIG:%.*]] = icmp eq i64 [[PHI_LVER_ORIG]], 0 ; CHECK-NEXT: br i1 [[ICMP_LVER_ORIG]], label %[[LOOP_EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY_LVER_ORIG]] @@ -30,49 +34,52 @@ define void @lver.check.unnecessary(ptr %arg, ptr %arg1, i1 %arg2) { ; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[PHI]], [[SEXT7]] ; CHECK-NEXT: [[GEP10:%.*]] = getelementptr double, ptr 
[[GEP8]], i64 [[MUL]] ; CHECK-NEXT: [[LOAD11:%.*]] = load double, ptr [[GEP10]], align 8 -; CHECK-NEXT: store double [[LOAD11]], ptr [[ARG1]], align 8 +; CHECK-NEXT: store double [[LOAD11]], ptr [[Y]], align 8 ; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[PHI]], 0 ; CHECK-NEXT: br i1 [[ICMP]], label %[[LOOP_EXIT_LOOPEXIT1:.*]], label %[[LOOP_BODY]] ; CHECK: [[NOLOOP_EXIT]]: -; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[ARG1]], i64 [[SEXT]] +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[Y]], i64 [[SEXT]] ; CHECK-NEXT: [[LOAD5:%.*]] = load double, ptr [[GEP]], align 8 -; CHECK-NEXT: store double [[LOAD5]], ptr [[ARG]], align 8 -; CHECK-NEXT: ret void +; CHECK-NEXT: ret double [[LOAD5]] ; CHECK: [[LOOP_EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[LOOP_EXIT:.*]] ; CHECK: [[LOOP_EXIT_LOOPEXIT1]]: ; CHECK-NEXT: br label %[[LOOP_EXIT]] ; CHECK: [[LOOP_EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: [[SEXT2:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr double, ptr [[Y]], i64 [[SEXT2]] +; CHECK-NEXT: [[LOAD6:%.*]] = load double, ptr [[GEP2]], align 8 +; CHECK-NEXT: ret double [[LOAD6]] ; entry: - %load = load i32, ptr %arg, align 4 - br i1 %arg2, label %noloop.exit, label %loop.ph + br i1 %cond, label %noloop.exit, label %loop.ph loop.ph: ; preds = %entry - %sext7 = sext i32 %load to i64 - %gep8 = getelementptr i8, ptr %arg1, i64 8 + %sext7 = sext i32 %x to i64 + %gep8 = getelementptr i8, ptr %y, i64 8 br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %phi = phi i64 [ 0, %loop.ph ], [ %add, %loop.body ] - %mul = mul i64 %phi, %sext7 + %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop.body ] + %mul = mul i64 %iv, %sext7 %gep10 = getelementptr double, ptr %gep8, i64 %mul %load11 = load double, ptr %gep10, align 8 - store double %load11, ptr %arg1, align 8 - %add = add i64 
%phi, 1 - %icmp = icmp eq i64 %phi, 0 + store double %load11, ptr %y, align 8 + %iv.next = add i64 %iv, 1 + %icmp = icmp eq i64 %iv, 0 br i1 %icmp, label %loop.exit, label %loop.body noloop.exit: ; preds = %entry - %sext = sext i32 %load to i64 - %gep = getelementptr double, ptr %arg1, i64 %sext + %sext = sext i32 %x to i64 + %gep = getelementptr double, ptr %y, i64 %sext %load5 = load double, ptr %gep, align 8 - store double %load5, ptr %arg, align 8 - ret void + ret double %load5 loop.exit: ; preds = %loop.body - ret void + %sext2 = sext i32 %x to i64 + %gep2 = getelementptr double, ptr %y, i64 %sext2 + %load6 = load double, ptr %gep2, align 8 + ret double %load6 } diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index 0114b9c244358..39b90adc74ef3 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -837,7 +837,7 @@ define void @memcpy_memcpy_escape_after1(ptr noalias %P, ptr noalias %Q) { ; CHECK-NEXT: [[MEMTMP:%.*]] = alloca [32 x i8], align 16 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[MEMTMP]], ptr align 16 [[P:%.*]], i32 32, i1 false) ; CHECK-NEXT: call void @do_something() -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[Q:%.*]], ptr align 16 [[MEMTMP]], i32 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[Q:%.*]], ptr align 16 [[P]], i32 32, i1 false) ; CHECK-NEXT: call void @capture(ptr [[MEMTMP]]) ; CHECK-NEXT: ret void ; @@ -851,10 +851,8 @@ define void @memcpy_memcpy_escape_after1(ptr noalias %P, ptr noalias %Q) { define void @memcpy_memcpy_escape_after2(ptr noalias %P, ptr noalias %Q) { ; CHECK-LABEL: @memcpy_memcpy_escape_after2( -; CHECK-NEXT: [[MEMTMP:%.*]] = alloca [32 x i8], align 16 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[MEMTMP]], ptr align 16 [[P:%.*]], i32 32, i1 false) ; CHECK-NEXT: call void @do_something() -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr 
align 16 [[Q:%.*]], ptr align 16 [[MEMTMP]], i32 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[Q:%.*]], ptr align 16 [[P:%.*]], i32 32, i1 false) ; CHECK-NEXT: call void @capture(ptr [[P]]) ; CHECK-NEXT: ret void ; @@ -868,10 +866,8 @@ define void @memcpy_memcpy_escape_after2(ptr noalias %P, ptr noalias %Q) { define void @memcpy_byval_escape_after(ptr noalias %P) { ; CHECK-LABEL: @memcpy_byval_escape_after( -; CHECK-NEXT: [[A:%.*]] = alloca [8 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[P:%.*]], i64 8, i1 false) ; CHECK-NEXT: call void @do_something() -; CHECK-NEXT: call void @test4a(ptr byval(i8) align 1 [[A]]) +; CHECK-NEXT: call void @test4a(ptr byval(i8) align 1 [[P:%.*]]) ; CHECK-NEXT: call void @capture(ptr [[P]]) ; CHECK-NEXT: ret void ; @@ -885,10 +881,8 @@ define void @memcpy_byval_escape_after(ptr noalias %P) { define void @memcpy_immut_escape_after(ptr align 4 noalias %val) { ; CHECK-LABEL: @memcpy_immut_escape_after( -; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) ; CHECK-NEXT: call void @do_something() -; CHECK-NEXT: call void @f(ptr noalias nocapture readonly align 4 [[VAL1]]) +; CHECK-NEXT: call void @f(ptr noalias nocapture readonly align 4 [[VAL:%.*]]) ; CHECK-NEXT: call void @capture(ptr [[VAL]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll index 887f59bbda94d..995cd7cfbc880 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 
-pass-remarks-output=%t %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-20 -pass-remarks-output=%t %s | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s ; YAML: --- !Passed ; YAML: Pass: slp-vectorizer ; YAML: Name: StoresVectorized -; YAML: Function: test +; YAML: Function: test1 ; YAML: Args: ; YAML: - String: 'Stores SLP vectorized with cost ' ; YAML: - Cost: '6' ; YAML: - String: ' and with tree size ' ; YAML: - TreeSize: '5' -define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) { -; CHECK-LABEL: @test( +define void @test1(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) { +; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> ; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> @@ -40,3 +40,42 @@ entry: } declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; YAML: --- !Passed +; YAML: Pass: slp-vectorizer +; YAML: Name: StoresVectorized +; YAML: Function: test2 +; YAML: Args: +; YAML: - String: 'Stores SLP vectorized with cost ' +; YAML: - Cost: '16' +; YAML: - String: ' and with tree size ' +; YAML: - TreeSize: '5' + +define void @test2(<8 x float> %load6, <8 x float> %load7, <8 x float> %load8, <8 x float> %load17, <8 x float> %fmuladd7, <8 x float> %fmuladd16, ptr %out_ptr) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <8 x float> [[LOAD6:%.*]], <8 x float> [[LOAD7:%.*]], <8 x i32> +; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <8 x float> [[LOAD7]], <8 x float> [[LOAD8:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] 
= call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[VEXT165_I]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> [[VEXT309_I]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> poison, i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP2]], <8 x float> [[LOAD17:%.*]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[FMULADD7:%.*]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP5]], <8 x float> [[FMULADD16:%.*]], i64 8) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP4]], <16 x float> [[TMP6]]) +; CHECK-NEXT: store <16 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %vext165.i = shufflevector <8 x float> %load6, <8 x float> %load7, <8 x i32> + %vext309.i = shufflevector <8 x float> %load7, <8 x float> %load8, <8 x i32> + %fmuladd8 = tail call noundef <8 x float> @llvm.fmuladd.v8f32(<8 x float> %vext165.i, <8 x float> %load17, <8 x float> %fmuladd7) + %fmuladd17 = tail call noundef <8 x float> @llvm.fmuladd.v8f32(<8 x float> %vext309.i, <8 x float> %load17, <8 x float> %fmuladd16) + %add.ptr.i.i = getelementptr inbounds i8, ptr %out_ptr, i64 32 + store <8 x float> %fmuladd8, ptr %out_ptr, align 4 + store <8 x float> %fmuladd17, ptr %add.ptr.i.i, align 4 + ret void +} + +declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-num-elems.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-num-elems.ll new file mode 
100644 index 0000000000000..893dc5b3ea44d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-num-elems.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64--linux-gnu < %s -mcpu=sapphirerapids | FileCheck %s + +define void @test(ptr %src, ptr %dst, ptr %c, i64 %arrayidx14.15p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], ptr [[C:%.*]], i64 [[ARRAYIDX14_15P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ARRAYIDX12_5:%.*]] = getelementptr i8, ptr [[C]], i64 192 +; CHECK-NEXT: [[ARRAYIDX14_5:%.*]] = getelementptr i8, ptr [[C]], i64 320 +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr i8, ptr [[C]], i64 72 +; CHECK-NEXT: [[ARRAYIDX12_6:%.*]] = getelementptr i8, ptr [[C]], i64 200 +; CHECK-NEXT: [[ARRAYIDX14_6:%.*]] = getelementptr i8, ptr [[C]], i64 328 +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr i8, ptr [[C]], i64 80 +; CHECK-NEXT: [[ARRAYIDX12_7:%.*]] = getelementptr i8, ptr [[C]], i64 208 +; CHECK-NEXT: [[ARRAYIDX14_7:%.*]] = getelementptr i8, ptr [[C]], i64 336 +; CHECK-NEXT: [[ARRAYIDX12_8:%.*]] = getelementptr i8, ptr [[C]], i64 216 +; CHECK-NEXT: [[ARRAYIDX14_8:%.*]] = getelementptr i8, ptr [[C]], i64 344 +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr i8, ptr [[C]], i64 96 +; CHECK-NEXT: [[ARRAYIDX12_9:%.*]] = getelementptr i8, ptr [[C]], i64 224 +; CHECK-NEXT: [[ARRAYIDX14_9:%.*]] = getelementptr i8, ptr [[C]], i64 352 +; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr i8, ptr [[C]], i64 104 +; CHECK-NEXT: [[ARRAYIDX12_10:%.*]] = getelementptr i8, ptr [[C]], i64 232 +; CHECK-NEXT: [[ARRAYIDX14_10:%.*]] = getelementptr i8, ptr [[C]], i64 360 +; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr i8, ptr [[C]], i64 112 +; CHECK-NEXT: [[ARRAYIDX12_11:%.*]] = getelementptr i8, ptr [[C]], i64 240 +; CHECK-NEXT: [[ARRAYIDX14_11:%.*]] = getelementptr i8, 
ptr [[C]], i64 368 +; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr i8, ptr [[C]], i64 120 +; CHECK-NEXT: [[ARRAYIDX12_12:%.*]] = getelementptr i8, ptr [[C]], i64 248 +; CHECK-NEXT: [[ARRAYIDX14_12:%.*]] = getelementptr i8, ptr [[C]], i64 376 +; CHECK-NEXT: [[ARRAYIDX14_13:%.*]] = getelementptr i8, ptr [[C]], i64 384 +; CHECK-NEXT: [[ARRAYIDX12_5P:%.*]] = load i64, ptr [[ARRAYIDX12_5]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_5P:%.*]] = load i64, ptr [[ARRAYIDX14_5]], align 8 +; CHECK-NEXT: [[ARRAYIDX_6P:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_6P:%.*]] = load i64, ptr [[ARRAYIDX12_6]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_6P:%.*]] = load i64, ptr [[ARRAYIDX14_6]], align 8 +; CHECK-NEXT: [[ARRAYIDX_7P:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_7P:%.*]] = load i64, ptr [[ARRAYIDX12_7]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_7P:%.*]] = load i64, ptr [[ARRAYIDX14_7]], align 8 +; CHECK-NEXT: [[ARRAYIDX_8P:%.*]] = load i64, ptr [[C]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_8P:%.*]] = load i64, ptr [[ARRAYIDX12_8]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_8P:%.*]] = load i64, ptr [[ARRAYIDX14_8]], align 8 +; CHECK-NEXT: [[ARRAYIDX_9P:%.*]] = load i64, ptr [[ARRAYIDX_9]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_9P:%.*]] = load i64, ptr [[ARRAYIDX12_9]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_9P:%.*]] = load i64, ptr [[ARRAYIDX14_9]], align 8 +; CHECK-NEXT: [[ARRAYIDX_10P:%.*]] = load i64, ptr [[ARRAYIDX_10]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_10P:%.*]] = load i64, ptr [[ARRAYIDX12_10]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_10P:%.*]] = load i64, ptr [[ARRAYIDX14_10]], align 8 +; CHECK-NEXT: [[ARRAYIDX_11P:%.*]] = load i64, ptr [[ARRAYIDX_11]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_11P:%.*]] = load i64, ptr [[ARRAYIDX12_11]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_11P:%.*]] = load i64, ptr [[ARRAYIDX14_11]], align 8 +; CHECK-NEXT: [[ARRAYIDX_12P:%.*]] = load i64, ptr [[ARRAYIDX_12]], align 8 +; CHECK-NEXT: [[ARRAYIDX12_12P:%.*]] = 
load i64, ptr [[ARRAYIDX12_12]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_12P:%.*]] = load i64, ptr [[ARRAYIDX14_12]], align 8 +; CHECK-NEXT: [[ARRAYIDX_13P:%.*]] = load i64, ptr [[SRC]], align 8 +; CHECK-NEXT: [[ARRAYIDX14_13P:%.*]] = load i64, ptr [[ARRAYIDX14_13]], align 8 +; CHECK-NEXT: br label %[[LAND_LHS_TRUE137:.*]] +; CHECK: [[LAND_LHS_TRUE137]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_15P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[LAND_LHS_TRUE137]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, %[[LAND_LHS_TRUE137]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ [[TMP0]], %[[LAND_LHS_TRUE137]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ [[TMP1]], %[[LAND_LHS_TRUE137]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ [[TMP2]], %[[LAND_LHS_TRUE137]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ [[TMP3]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_13P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP4]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_13P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ [[TMP5]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_13P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP6]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_12P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP7]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_12P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_12P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP9]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_11P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi i64 [ [[TMP10]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_11P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ [[TMP11]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_11P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP12]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_10P]], 
%[[ENTRY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi i64 [ [[TMP13]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_10P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi i64 [ [[TMP14]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_10P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i64 [ [[TMP15]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_9P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i64 [ [[TMP16]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_9P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ [[TMP17]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_9P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi i64 [ [[TMP18]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_8P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi i64 [ [[TMP19]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_8P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ [[TMP20]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_8P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = phi i64 [ [[TMP21]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_7P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi i64 [ [[TMP22]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_7P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP23]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_7P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi i64 [ [[TMP24]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_6P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ [[TMP25]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_6P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi i64 [ [[TMP26]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX_6P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ [[TMP27]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX14_5P]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ [[TMP28]], %[[LAND_LHS_TRUE137]] ], [ [[ARRAYIDX12_5P]], %[[ENTRY]] ] +; CHECK-NEXT: store i64 [[TMP31]], ptr [[DST]], align 8 +; CHECK-NEXT: store i64 [[TMP30]], ptr [[SRC]], align 8 +; CHECK-NEXT: store i64 [[TMP29]], ptr [[DST]], align 8 +; CHECK-NEXT: br label %[[LAND_LHS_TRUE137]] 
+; +entry: + %arrayidx12.5 = getelementptr i8, ptr %c, i64 192 + %arrayidx14.5 = getelementptr i8, ptr %c, i64 320 + %arrayidx.6 = getelementptr i8, ptr %c, i64 72 + %arrayidx12.6 = getelementptr i8, ptr %c, i64 200 + %arrayidx14.6 = getelementptr i8, ptr %c, i64 328 + %arrayidx.7 = getelementptr i8, ptr %c, i64 80 + %arrayidx12.7 = getelementptr i8, ptr %c, i64 208 + %arrayidx14.7 = getelementptr i8, ptr %c, i64 336 + %arrayidx12.8 = getelementptr i8, ptr %c, i64 216 + %arrayidx14.8 = getelementptr i8, ptr %c, i64 344 + %arrayidx.9 = getelementptr i8, ptr %c, i64 96 + %arrayidx12.9 = getelementptr i8, ptr %c, i64 224 + %arrayidx14.9 = getelementptr i8, ptr %c, i64 352 + %arrayidx.10 = getelementptr i8, ptr %c, i64 104 + %arrayidx12.10 = getelementptr i8, ptr %c, i64 232 + %arrayidx14.10 = getelementptr i8, ptr %c, i64 360 + %arrayidx.11 = getelementptr i8, ptr %c, i64 112 + %arrayidx12.11 = getelementptr i8, ptr %c, i64 240 + %arrayidx14.11 = getelementptr i8, ptr %c, i64 368 + %arrayidx.12 = getelementptr i8, ptr %c, i64 120 + %arrayidx12.12 = getelementptr i8, ptr %c, i64 248 + %arrayidx14.12 = getelementptr i8, ptr %c, i64 376 + %arrayidx14.13 = getelementptr i8, ptr %c, i64 384 + %arrayidx12.5p = load i64, ptr %arrayidx12.5, align 8 + %arrayidx14.5p = load i64, ptr %arrayidx14.5, align 8 + %arrayidx.6p = load i64, ptr %arrayidx.6, align 8 + %arrayidx12.6p = load i64, ptr %arrayidx12.6, align 8 + %arrayidx14.6p = load i64, ptr %arrayidx14.6, align 8 + %arrayidx.7p = load i64, ptr %arrayidx.7, align 8 + %arrayidx12.7p = load i64, ptr %arrayidx12.7, align 8 + %arrayidx14.7p = load i64, ptr %arrayidx14.7, align 8 + %arrayidx.8p = load i64, ptr %c, align 8 + %arrayidx12.8p = load i64, ptr %arrayidx12.8, align 8 + %arrayidx14.8p = load i64, ptr %arrayidx14.8, align 8 + %arrayidx.9p = load i64, ptr %arrayidx.9, align 8 + %arrayidx12.9p = load i64, ptr %arrayidx12.9, align 8 + %arrayidx14.9p = load i64, ptr %arrayidx14.9, align 8 + %arrayidx.10p = load i64, ptr 
%arrayidx.10, align 8 + %arrayidx12.10p = load i64, ptr %arrayidx12.10, align 8 + %arrayidx14.10p = load i64, ptr %arrayidx14.10, align 8 + %arrayidx.11p = load i64, ptr %arrayidx.11, align 8 + %arrayidx12.11p = load i64, ptr %arrayidx12.11, align 8 + %arrayidx14.11p = load i64, ptr %arrayidx14.11, align 8 + %arrayidx.12p = load i64, ptr %arrayidx.12, align 8 + %arrayidx12.12p = load i64, ptr %arrayidx12.12, align 8 + %arrayidx14.12p = load i64, ptr %arrayidx14.12, align 8 + %arrayidx.13p = load i64, ptr %src, align 8 + %arrayidx14.13p = load i64, ptr %arrayidx14.13, align 8 + br label %land.lhs.true137 + +land.lhs.true137: + %0 = phi i64 [ 0, %land.lhs.true137 ], [ %arrayidx14.15p, %entry ] + %1 = phi i64 [ 0, %land.lhs.true137 ], [ 1, %entry ] + %2 = phi i64 [ 1, %land.lhs.true137 ], [ 0, %entry ] + %3 = phi i64 [ %0, %land.lhs.true137 ], [ 0, %entry ] + %4 = phi i64 [ %1, %land.lhs.true137 ], [ 0, %entry ] + %5 = phi i64 [ %2, %land.lhs.true137 ], [ 0, %entry ] + %6 = phi i64 [ %3, %land.lhs.true137 ], [ %arrayidx14.13p, %entry ] + %7 = phi i64 [ %4, %land.lhs.true137 ], [ %arrayidx.13p, %entry ] + %8 = phi i64 [ %5, %land.lhs.true137 ], [ %arrayidx.13p, %entry ] + %9 = phi i64 [ %6, %land.lhs.true137 ], [ %arrayidx14.12p, %entry ] + %10 = phi i64 [ %7, %land.lhs.true137 ], [ %arrayidx12.12p, %entry ] + %11 = phi i64 [ %8, %land.lhs.true137 ], [ %arrayidx.12p, %entry ] + %12 = phi i64 [ %9, %land.lhs.true137 ], [ %arrayidx14.11p, %entry ] + %13 = phi i64 [ %10, %land.lhs.true137 ], [ %arrayidx12.11p, %entry ] + %14 = phi i64 [ %11, %land.lhs.true137 ], [ %arrayidx.11p, %entry ] + %15 = phi i64 [ %12, %land.lhs.true137 ], [ %arrayidx14.10p, %entry ] + %16 = phi i64 [ %13, %land.lhs.true137 ], [ %arrayidx12.10p, %entry ] + %17 = phi i64 [ %14, %land.lhs.true137 ], [ %arrayidx.10p, %entry ] + %18 = phi i64 [ %15, %land.lhs.true137 ], [ %arrayidx14.9p, %entry ] + %19 = phi i64 [ %16, %land.lhs.true137 ], [ %arrayidx12.9p, %entry ] + %20 = phi i64 [ %17, 
%land.lhs.true137 ], [ %arrayidx.9p, %entry ] + %21 = phi i64 [ %18, %land.lhs.true137 ], [ %arrayidx14.8p, %entry ] + %22 = phi i64 [ %19, %land.lhs.true137 ], [ %arrayidx12.8p, %entry ] + %23 = phi i64 [ %20, %land.lhs.true137 ], [ %arrayidx.8p, %entry ] + %24 = phi i64 [ %21, %land.lhs.true137 ], [ %arrayidx14.7p, %entry ] + %25 = phi i64 [ %22, %land.lhs.true137 ], [ %arrayidx12.7p, %entry ] + %26 = phi i64 [ %23, %land.lhs.true137 ], [ %arrayidx.7p, %entry ] + %27 = phi i64 [ %24, %land.lhs.true137 ], [ %arrayidx14.6p, %entry ] + %28 = phi i64 [ %25, %land.lhs.true137 ], [ %arrayidx12.6p, %entry ] + %29 = phi i64 [ %26, %land.lhs.true137 ], [ %arrayidx.6p, %entry ] + %30 = phi i64 [ %27, %land.lhs.true137 ], [ %arrayidx14.5p, %entry ] + %31 = phi i64 [ %28, %land.lhs.true137 ], [ %arrayidx12.5p, %entry ] + store i64 %31, ptr %dst, align 8 + store i64 %30, ptr %src, align 8 + store i64 %29, ptr %dst, align 8 + br label %land.lhs.true137 +} diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll index d6da9fd8ae20c..5addd0e3ad8ee 100644 --- a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll +++ b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll @@ -201,11 +201,8 @@ define i64 @load_before_store_noescape_byval(ptr byval([2 x i32]) %a, i64 %i, i3 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 0, i64 [[I:%.*]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[V]], [[B:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 [[B]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[V]] +; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A]], align 8 ; CHECK-NEXT: ret i64 
[[V2]] ; @@ -235,11 +232,8 @@ define i64 @load_before_store_noescape_malloc(i64 %i, i32 %b) { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 0, i64 [[I:%.*]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[V]], [[B:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 [[B]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[V]] +; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; @@ -267,11 +261,8 @@ define i64 @load_before_store_noescape_writable(ptr noalias writable dereference ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 0, i64 1 ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[V]], [[B:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 [[B]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[V]] +; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index ca50187e5e5ee..1cd4a47c75739 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -68,6 +68,13 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "loongarch64"), "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); + // Check that SPARC targets add -i128:128. 
+ EXPECT_EQ( + UpgradeDataLayoutString("E-m:e-p:32:32-i64:64-f128:64-n32-S64", "sparc"), + "E-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64"); + EXPECT_EQ(UpgradeDataLayoutString("E-m:e-i64:64-n32:64-S128", "sparcv9"), + "E-m:e-i64:64-i128:128-n32:64-S128"); + // Check that SPIR && SPIRV targets add -G1 if it's not present. EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir"), "e-p:32:32-G1"); EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir64"), "e-p:32:32-G1"); diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 2eaf369caf084..9cd54352735dc 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -9,8 +9,9 @@ #include "llvm/SandboxIR/Pass.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Module.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" #include "llvm/SandboxIR/PassManager.h" -#include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/SandboxIR/RegionTest.cpp b/llvm/unittests/SandboxIR/RegionTest.cpp index dc4dad8fed71c..f1bb535d9c50e 100644 --- a/llvm/unittests/SandboxIR/RegionTest.cpp +++ b/llvm/unittests/SandboxIR/RegionTest.cpp @@ -8,7 +8,9 @@ #include "llvm/SandboxIR/Region.h" #include "llvm/AsmParser/Parser.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock-matchers.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 964b81fead67e..66a5191b1154b 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/SandboxIR/SandboxIR.h" #include "llvm/AsmParser/Parser.h" 
#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -14,7 +13,12 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" +#include "llvm/SandboxIR/BasicBlock.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/Module.h" #include "llvm/SandboxIR/Utils.h" +#include "llvm/SandboxIR/Value.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock-matchers.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index da5416395ec42..5823f4e14a854 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -11,7 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock-matchers.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index 40aa32fb08ed0..9bf02c97948eb 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -14,7 +14,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/SandboxIR/UtilsTest.cpp b/llvm/unittests/SandboxIR/UtilsTest.cpp index ded3edf1206a4..033f217c3bcf7 100644 --- a/llvm/unittests/SandboxIR/UtilsTest.cpp +++ b/llvm/unittests/SandboxIR/UtilsTest.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Utils.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include 
"llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" @@ -54,3 +60,77 @@ define void @foo(ptr %arg0) { EXPECT_EQ(sandboxir::Utils::memoryLocationGetOrNone(Ld), MemoryLocation::getOrNone(LLVMLd)); } + +TEST_F(UtilsTest, GetPointerDiffInBytes) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { + %gep0 = getelementptr inbounds float, ptr %ptr, i64 0 + %gep1 = getelementptr inbounds float, ptr %ptr, i64 1 + %gep2 = getelementptr inbounds float, ptr %ptr, i64 2 + %gep3 = getelementptr inbounds float, ptr %ptr, i64 3 + + %ld0 = load float, ptr %gep0 + %ld1 = load float, ptr %gep1 + %ld2 = load float, ptr %gep2 + %ld3 = load float, ptr %gep3 + + %v2ld0 = load <2 x float>, ptr %gep0 + %v2ld1 = load <2 x float>, ptr %gep1 + %v2ld2 = load <2 x float>, ptr %gep2 + %v2ld3 = load <2 x float>, ptr %gep3 + + %v3ld0 = load <3 x float>, ptr %gep0 + %v3ld1 = load <3 x float>, ptr %gep1 + %v3ld2 = load <3 x float>, ptr %gep2 + %v3ld3 = load <3 x float>, ptr %gep3 + ret void +} +)IR"); + llvm::Function &LLVMF = *M->getFunction("foo"); + DominatorTree DT(LLVMF); + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI(TLII); + DataLayout DL(M->getDataLayout()); + AssumptionCache AC(LLVMF); + BasicAAResult BAA(DL, LLVMF, TLI, AC, &DT); + AAResults AA(TLI); + AA.addAAResult(BAA); + LoopInfo LI(DT); + ScalarEvolution SE(LLVMF, TLI, AC, DT, LI); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = std::next(BB.begin(), 4); + auto *L0 = cast(&*It++); + auto *L1 = cast(&*It++); + auto *L2 = cast(&*It++); + [[maybe_unused]] auto 
*L3 = cast(&*It++); + + auto *V2L0 = cast(&*It++); + auto *V2L1 = cast(&*It++); + auto *V2L2 = cast(&*It++); + auto *V2L3 = cast(&*It++); + + [[maybe_unused]] auto *V3L0 = cast(&*It++); + auto *V3L1 = cast(&*It++); + [[maybe_unused]] auto *V3L2 = cast(&*It++); + [[maybe_unused]] auto *V3L3 = cast(&*It++); + + // getPointerDiffInBytes + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L0, L1, SE, DL), 4); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L0, L2, SE, DL), 8); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L1, L0, SE, DL), -4); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L0, V2L0, SE, DL), 0); + + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L0, V2L1, SE, DL), 4); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(L0, V3L1, SE, DL), 4); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(V2L0, V2L2, SE, DL), 8); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(V2L0, V2L3, SE, DL), 12); + EXPECT_EQ(*sandboxir::Utils::getPointerDiffInBytes(V2L3, V2L0, SE, DL), -12); + + // atLowerAddress + EXPECT_TRUE(sandboxir::Utils::atLowerAddress(L0, L1, SE, DL)); + EXPECT_FALSE(sandboxir::Utils::atLowerAddress(L1, L0, SE, DL)); + EXPECT_FALSE(sandboxir::Utils::atLowerAddress(L3, V3L3, SE, DL)); +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index f6bfd097f20a4..329d3617a31fa 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -8,7 +8,9 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h" #include "llvm/AsmParser/Parser.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock-matchers.h" #include 
"gtest/gtest.h" diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp index 054da8c2a5d12..0b2411151a965 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp @@ -8,7 +8,9 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h" #include "llvm/AsmParser/Parser.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp index a136be41ae363..89255a108ed6c 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp @@ -8,7 +8,7 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" #include "llvm/AsmParser/Parser.h" -#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index 1b95082ad0715..bed5aa86846bf 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -38,7 +38,7 @@ class CallingConvEmitter { private: void EmitCallingConv(const Record *CC, raw_ostream &O); - void EmitAction(const Record *Action, unsigned Indent, raw_ostream &O); + void EmitAction(const Record *Action, indent Indent, raw_ostream &O); void EmitArgRegisterLists(raw_ostream &O); }; } // End anonymous namespace @@ -116,26 +116,24 @@ void CallingConvEmitter::EmitCallingConv(const Record *CC, raw_ostream &O) { }); O << "\n"; 
- EmitAction(Action, 2, O); + EmitAction(Action, indent(2), O); } O << "\n return true; // CC didn't match.\n"; O << "}\n"; } -void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, +void CallingConvEmitter::EmitAction(const Record *Action, indent Indent, raw_ostream &O) { - std::string IndentStr = std::string(Indent, ' '); - if (Action->isSubClassOf("CCPredicateAction")) { - O << IndentStr << "if ("; + O << Indent << "if ("; if (Action->isSubClassOf("CCIfType")) { const ListInit *VTs = Action->getValueAsListInit("VTs"); for (unsigned i = 0, e = VTs->size(); i != e; ++i) { const Record *VT = VTs->getElementAsRecord(i); if (i != 0) - O << " ||\n " << IndentStr; + O << " ||\n " << Indent; O << "LocVT == " << getEnumName(getValueType(VT)); } @@ -148,29 +146,29 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, O << ") {\n"; EmitAction(Action->getValueAsDef("SubAction"), Indent + 2, O); - O << IndentStr << "}\n"; + O << Indent << "}\n"; } else { if (Action->isSubClassOf("CCDelegateTo")) { const Record *CC = Action->getValueAsDef("CC"); - O << IndentStr << "if (!" << CC->getName() + O << Indent << "if (!" 
<< CC->getName() << "(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))\n" - << IndentStr << " return false;\n"; + << Indent + 2 << "return false;\n"; DelegateToMap[CurrentAction].insert(CC->getName().str()); } else if (Action->isSubClassOf("CCAssignToReg") || Action->isSubClassOf("CCAssignToRegAndStack")) { const ListInit *RegList = Action->getValueAsListInit("RegList"); if (RegList->size() == 1) { std::string Name = getQualifiedName(RegList->getElementAsRecord(0)); - O << IndentStr << "if (MCRegister Reg = State.AllocateReg(" << Name + O << Indent << "if (MCRegister Reg = State.AllocateReg(" << Name << ")) {\n"; if (SwiftAction) AssignedSwiftRegsMap[CurrentAction].insert(Name); else AssignedRegsMap[CurrentAction].insert(Name); } else { - O << IndentStr << "static const MCPhysReg RegList" << ++Counter + O << Indent << "static const MCPhysReg RegList" << ++Counter << "[] = {\n"; - O << IndentStr << " "; + O << Indent << " "; ListSeparator LS; for (unsigned i = 0, e = RegList->size(); i != e; ++i) { std::string Name = getQualifiedName(RegList->getElementAsRecord(i)); @@ -180,21 +178,21 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, AssignedRegsMap[CurrentAction].insert(Name); O << LS << Name; } - O << "\n" << IndentStr << "};\n"; - O << IndentStr << "if (MCRegister Reg = State.AllocateReg(RegList" + O << "\n" << Indent << "};\n"; + O << Indent << "if (MCRegister Reg = State.AllocateReg(RegList" << Counter << ")) {\n"; } - O << IndentStr << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " + O << Indent << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " << "Reg, LocVT, LocInfo));\n"; if (Action->isSubClassOf("CCAssignToRegAndStack")) { int Size = Action->getValueAsInt("Size"); int Align = Action->getValueAsInt("Align"); - O << IndentStr << " (void)State.AllocateStack("; + O << Indent << " (void)State.AllocateStack("; if (Size) O << Size << ", "; else O << "\n" - << IndentStr + << Indent << " 
State.getMachineFunction().getDataLayout()." "getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext()))," " "; @@ -202,14 +200,14 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, O << "Align(" << Align << ")"; else O << "\n" - << IndentStr + << Indent << " State.getMachineFunction().getDataLayout()." "getABITypeAlign(EVT(LocVT).getTypeForEVT(State.getContext()" "))"; O << ");\n"; } - O << IndentStr << " return false;\n"; - O << IndentStr << "}\n"; + O << Indent << " return false;\n"; + O << Indent << "}\n"; } else if (Action->isSubClassOf("CCAssignToRegWithShadow")) { const ListInit *RegList = Action->getValueAsListInit("RegList"); const ListInit *ShadowRegList = @@ -219,7 +217,7 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, "Invalid length of list of shadowed registers"); if (RegList->size() == 1) { - O << IndentStr << "if (MCRegister Reg = State.AllocateReg("; + O << Indent << "if (MCRegister Reg = State.AllocateReg("; O << getQualifiedName(RegList->getElementAsRecord(0)); O << ", " << getQualifiedName(ShadowRegList->getElementAsRecord(0)); O << ")) {\n"; @@ -227,41 +225,40 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, unsigned RegListNumber = ++Counter; unsigned ShadowRegListNumber = ++Counter; - O << IndentStr << "static const MCPhysReg RegList" << RegListNumber + O << Indent << "static const MCPhysReg RegList" << RegListNumber << "[] = {\n"; - O << IndentStr << " "; + O << Indent << " "; ListSeparator LS; for (unsigned i = 0, e = RegList->size(); i != e; ++i) O << LS << getQualifiedName(RegList->getElementAsRecord(i)); - O << "\n" << IndentStr << "};\n"; + O << "\n" << Indent << "};\n"; - O << IndentStr << "static const MCPhysReg RegList" - << ShadowRegListNumber << "[] = {\n"; - O << IndentStr << " "; + O << Indent << "static const MCPhysReg RegList" << ShadowRegListNumber + << "[] = {\n"; + O << Indent << " "; ListSeparator LSS; for (unsigned i = 0, e = 
ShadowRegList->size(); i != e; ++i) O << LSS << getQualifiedName(ShadowRegList->getElementAsRecord(i)); - O << "\n" << IndentStr << "};\n"; + O << "\n" << Indent << "};\n"; - O << IndentStr << "if (MCRegister Reg = State.AllocateReg(RegList" + O << Indent << "if (MCRegister Reg = State.AllocateReg(RegList" << RegListNumber << ", " << "RegList" << ShadowRegListNumber << ")) {\n"; } - O << IndentStr << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " + O << Indent << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " << "Reg, LocVT, LocInfo));\n"; - O << IndentStr << " return false;\n"; - O << IndentStr << "}\n"; + O << Indent << " return false;\n"; + O << Indent << "}\n"; } else if (Action->isSubClassOf("CCAssignToStack")) { int Size = Action->getValueAsInt("Size"); int Align = Action->getValueAsInt("Align"); - O << IndentStr << "int64_t Offset" << ++Counter - << " = State.AllocateStack("; + O << Indent << "int64_t Offset" << ++Counter << " = State.AllocateStack("; if (Size) O << Size << ", "; else O << "\n" - << IndentStr + << Indent << " State.getMachineFunction().getDataLayout()." "getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext()))," " "; @@ -269,14 +266,14 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, O << "Align(" << Align << ")"; else O << "\n" - << IndentStr + << Indent << " State.getMachineFunction().getDataLayout()." 
"getABITypeAlign(EVT(LocVT).getTypeForEVT(State.getContext()" "))"; O << ");\n" - << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" + << Indent << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" << Counter << ", LocVT, LocInfo));\n"; - O << IndentStr << "return false;\n"; + O << Indent << "return false;\n"; } else if (Action->isSubClassOf("CCAssignToStackWithShadow")) { int Size = Action->getValueAsInt("Size"); int Align = Action->getValueAsInt("Align"); @@ -285,76 +282,73 @@ void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, unsigned ShadowRegListNumber = ++Counter; - O << IndentStr << "static const MCPhysReg ShadowRegList" + O << Indent << "static const MCPhysReg ShadowRegList" << ShadowRegListNumber << "[] = {\n"; - O << IndentStr << " "; + O << Indent << " "; ListSeparator LS; for (unsigned i = 0, e = ShadowRegList->size(); i != e; ++i) O << LS << getQualifiedName(ShadowRegList->getElementAsRecord(i)); - O << "\n" << IndentStr << "};\n"; + O << "\n" << Indent << "};\n"; - O << IndentStr << "int64_t Offset" << ++Counter - << " = State.AllocateStack(" << Size << ", Align(" << Align << "), " + O << Indent << "int64_t Offset" << ++Counter << " = State.AllocateStack(" + << Size << ", Align(" << Align << "), " << "ShadowRegList" << ShadowRegListNumber << ");\n"; - O << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" + O << Indent << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" << Counter << ", LocVT, LocInfo));\n"; - O << IndentStr << "return false;\n"; + O << Indent << "return false;\n"; } else if (Action->isSubClassOf("CCPromoteToType")) { const Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); - O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; + O << Indent << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { - O << IndentStr << "LocInfo = CCValAssign::FPExt;\n"; + O << Indent << 
"LocInfo = CCValAssign::FPExt;\n"; } else { - O << IndentStr << "if (ArgFlags.isSExt())\n" - << IndentStr << " LocInfo = CCValAssign::SExt;\n" - << IndentStr << "else if (ArgFlags.isZExt())\n" - << IndentStr << " LocInfo = CCValAssign::ZExt;\n" - << IndentStr << "else\n" - << IndentStr << " LocInfo = CCValAssign::AExt;\n"; + O << Indent << "if (ArgFlags.isSExt())\n" + << Indent << " LocInfo = CCValAssign::SExt;\n" + << Indent << "else if (ArgFlags.isZExt())\n" + << Indent << " LocInfo = CCValAssign::ZExt;\n" + << Indent << "else\n" + << Indent << " LocInfo = CCValAssign::AExt;\n"; } } else if (Action->isSubClassOf("CCPromoteToUpperBitsInType")) { const Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); - O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; + O << Indent << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { PrintFatalError(Action->getLoc(), "CCPromoteToUpperBitsInType does not handle floating " "point"); } else { - O << IndentStr << "if (ArgFlags.isSExt())\n" - << IndentStr << " LocInfo = CCValAssign::SExtUpper;\n" - << IndentStr << "else if (ArgFlags.isZExt())\n" - << IndentStr << " LocInfo = CCValAssign::ZExtUpper;\n" - << IndentStr << "else\n" - << IndentStr << " LocInfo = CCValAssign::AExtUpper;\n"; + O << Indent << "if (ArgFlags.isSExt())\n" + << Indent << " LocInfo = CCValAssign::SExtUpper;\n" + << Indent << "else if (ArgFlags.isZExt())\n" + << Indent << " LocInfo = CCValAssign::ZExtUpper;\n" + << Indent << "else\n" + << Indent << " LocInfo = CCValAssign::AExtUpper;\n"; } } else if (Action->isSubClassOf("CCBitConvertToType")) { const Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) - << ";\n"; - O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; + O << Indent << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; + O << Indent << "LocInfo = CCValAssign::BCvt;\n"; } else if 
(Action->isSubClassOf("CCTruncToType")) { const Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) - << ";\n"; - O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; + O << Indent << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; + O << Indent << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { const Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) - << ";\n"; - O << IndentStr << "LocInfo = CCValAssign::Indirect;\n"; + O << Indent << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; + O << Indent << "LocInfo = CCValAssign::Indirect;\n"; } else if (Action->isSubClassOf("CCPassByVal")) { int Size = Action->getValueAsInt("Size"); int Align = Action->getValueAsInt("Align"); - O << IndentStr << "State.HandleByVal(ValNo, ValVT, LocVT, LocInfo, " - << Size << ", Align(" << Align << "), ArgFlags);\n"; - O << IndentStr << "return false;\n"; + O << Indent << "State.HandleByVal(ValNo, ValVT, LocVT, LocInfo, " << Size + << ", Align(" << Align << "), ArgFlags);\n"; + O << Indent << "return false;\n"; } else if (Action->isSubClassOf("CCCustom")) { - O << IndentStr << "if (" << Action->getValueAsString("FuncName") + O << Indent << "if (" << Action->getValueAsString("FuncName") << "(ValNo, ValVT, " << "LocVT, LocInfo, ArgFlags, State))\n"; - O << IndentStr << " return false;\n"; + O << Indent << " return false;\n"; } else { errs() << *Action; PrintFatalError(Action->getLoc(), "Unknown CCAction!"); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 2702e0ae33c77..5de5dd894f84e 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -888,11 +888,9 @@ void RuleMatcher::defineOperand(StringRef SymbolicName, 
OperandMatcher &OM) { RM.getGISelFlags()); } -void RuleMatcher::definePhysRegOperand(Record *Reg, OperandMatcher &OM) { - if (!PhysRegOperands.contains(Reg)) { +void RuleMatcher::definePhysRegOperand(const Record *Reg, OperandMatcher &OM) { + if (!PhysRegOperands.contains(Reg)) PhysRegOperands[Reg] = &OM; - return; - } } InstructionMatcher & @@ -904,7 +902,8 @@ RuleMatcher::getInstructionMatcher(StringRef SymbolicName) const { ("Failed to lookup instruction " + SymbolicName).str().c_str()); } -const OperandMatcher &RuleMatcher::getPhysRegOperandMatcher(Record *Reg) const { +const OperandMatcher & +RuleMatcher::getPhysRegOperandMatcher(const Record *Reg) const { const auto &I = PhysRegOperands.find(Reg); if (I == PhysRegOperands.end()) { @@ -1717,7 +1716,8 @@ OperandMatcher &InstructionMatcher::getOperand(unsigned OpIdx) { llvm_unreachable("Failed to lookup operand"); } -OperandMatcher &InstructionMatcher::addPhysRegInput(Record *Reg, unsigned OpIdx, +OperandMatcher &InstructionMatcher::addPhysRegInput(const Record *Reg, + unsigned OpIdx, unsigned TempOpIdx) { assert(SymbolicName.empty()); OperandMatcher *OM = new OperandMatcher(*this, OpIdx, "", TempOpIdx); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index aa4eae87573a3..315606417fc9e 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -494,7 +494,7 @@ class RuleMatcher : public Matcher { /// A map of anonymous physical register operands defined by the matchers that /// may be referenced by the renderers. 
- DenseMap PhysRegOperands; + DenseMap PhysRegOperands; /// ID for the next instruction variable defined with /// implicitlyDefineInsnVar() @@ -651,7 +651,7 @@ class RuleMatcher : public Matcher { void defineOperand(StringRef SymbolicName, OperandMatcher &OM); - void definePhysRegOperand(Record *Reg, OperandMatcher &OM); + void definePhysRegOperand(const Record *Reg, OperandMatcher &OM); Error defineComplexSubOperand(StringRef SymbolicName, const Record *ComplexPattern, @@ -669,7 +669,7 @@ class RuleMatcher : public Matcher { InstructionMatcher &getInstructionMatcher(StringRef SymbolicName) const; OperandMatcher &getOperandMatcher(StringRef Name); const OperandMatcher &getOperandMatcher(StringRef Name) const; - const OperandMatcher &getPhysRegOperandMatcher(Record *) const; + const OperandMatcher &getPhysRegOperandMatcher(const Record *) const; void optimize() override; void emit(MatchTable &Table) override; @@ -1759,7 +1759,7 @@ class InstructionMatcher final : public PredicateListMatcher { /// PhysRegInputs - List list has an entry for each explicitly specified /// physreg input to the pattern. The first elt is the Register node, the /// second is the recorded slot number the input pattern match saved it in. 
- SmallVector, 2> PhysRegInputs; + SmallVector, 2> PhysRegInputs; bool canAddNumOperandsCheck() const { // Add if it's allowed, and: @@ -1799,10 +1799,10 @@ class InstructionMatcher final : public PredicateListMatcher { unsigned AllocatedTemporariesBaseID, bool IsVariadic = false); OperandMatcher &getOperand(unsigned OpIdx); - OperandMatcher &addPhysRegInput(Record *Reg, unsigned OpIdx, + OperandMatcher &addPhysRegInput(const Record *Reg, unsigned OpIdx, unsigned TempOpIdx); - ArrayRef> getPhysRegInputs() const { + ArrayRef> getPhysRegInputs() const { return PhysRegInputs; } @@ -1969,10 +1969,10 @@ class CopyRenderer : public OperandRenderer { class CopyPhysRegRenderer : public OperandRenderer { protected: unsigned NewInsnID; - Record *PhysReg; + const Record *PhysReg; public: - CopyPhysRegRenderer(unsigned NewInsnID, Record *Reg) + CopyPhysRegRenderer(unsigned NewInsnID, const Record *Reg) : OperandRenderer(OR_CopyPhysReg), NewInsnID(NewInsnID), PhysReg(Reg) { assert(PhysReg); } @@ -1981,7 +1981,7 @@ class CopyPhysRegRenderer : public OperandRenderer { return R->getKind() == OR_CopyPhysReg; } - Record *getPhysReg() const { return PhysReg; } + const Record *getPhysReg() const { return PhysReg; } void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override; }; diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp index 8790dc6028ef4..b7926e21ca661 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp @@ -103,7 +103,7 @@ void GlobalISelMatchTableExecutorEmitter::emitSubtargetFeatureBitsetImpl( } void GlobalISelMatchTableExecutorEmitter::emitComplexPredicates( - raw_ostream &OS, ArrayRef ComplexOperandMatchers) { + raw_ostream &OS, ArrayRef ComplexOperandMatchers) { // Emit complex predicate table and an enum to 
reference them with. OS << "// ComplexPattern predicates.\n" << "enum {\n" @@ -174,7 +174,8 @@ void GlobalISelMatchTableExecutorEmitter::emitMatchTable( void GlobalISelMatchTableExecutorEmitter::emitExecutorImpl( raw_ostream &OS, const MatchTable &Table, ArrayRef TypeObjects, - ArrayRef Rules, ArrayRef ComplexOperandMatchers, + ArrayRef Rules, + ArrayRef ComplexOperandMatchers, ArrayRef CustomOperandRenderers, StringRef IfDefName) { OS << "#ifdef " << IfDefName << "\n"; emitTypeObjects(OS, TypeObjects); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h index 6634c525480d3..862f1e83c169f 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h @@ -42,7 +42,7 @@ class GlobalISelMatchTableExecutorEmitter { /// Emits an enum + an array that stores references to /// \p ComplexOperandMatchers. void emitComplexPredicates(raw_ostream &OS, - ArrayRef ComplexOperandMatchers); + ArrayRef ComplexOperandMatchers); /// Emits an enum + an array that stores references to /// \p CustomOperandRenderers. 
@@ -206,7 +206,7 @@ class GlobalISelMatchTableExecutorEmitter { void emitExecutorImpl(raw_ostream &OS, const gi::MatchTable &Table, ArrayRef TypeObjects, ArrayRef Rules, - ArrayRef ComplexOperandMatchers, + ArrayRef ComplexOperandMatchers, ArrayRef CustomOperandRenderers, StringRef IfDefName); void emitPredicateBitset(raw_ostream &OS, StringRef IfDefName); diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index c345662c008e5..c53f705a38db8 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -314,7 +314,7 @@ static Expected getInstResultType(const TreePatternNode &Dst, class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { public: - explicit GlobalISelEmitter(RecordKeeper &RK); + explicit GlobalISelEmitter(const RecordKeeper &RK); void emitAdditionalImpl(raw_ostream &OS) override; @@ -335,18 +335,18 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { private: std::string ClassName; - RecordKeeper &RK; + const RecordKeeper &RK; const CodeGenDAGPatterns CGP; const CodeGenTarget &Target; CodeGenRegBank &CGRegs; - std::vector AllPatFrags; + ArrayRef AllPatFrags; /// Keep track of the equivalence between SDNodes and Instruction by mapping /// SDNodes to the GINodeEquiv mapping. We need to map to the GINodeEquiv to /// check for attributes on the relation such as CheckMMOIsNonAtomic. /// This is defined using 'GINodeEquiv' in the target description. - DenseMap NodeEquivs; + DenseMap NodeEquivs; /// Keep track of the equivalence between ComplexPattern's and /// GIComplexOperandMatcher. 
Map entries are specified by subclassing @@ -379,8 +379,8 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { void gatherTypeIDValues(); void gatherNodeEquivs(); - Record *findNodeEquiv(const Record *N) const; - const CodeGenInstruction *getEquivNode(Record &Equiv, + const Record *findNodeEquiv(const Record *N) const; + const CodeGenInstruction *getEquivNode(const Record &Equiv, const TreePatternNode &N) const; Error importRulePredicates(RuleMatcher &M, @@ -472,7 +472,7 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { InstructionMatcher &InsnMatcher, bool &HasAddedMatcher); }; -StringRef getPatFragPredicateEnumName(Record *R) { return R->getName(); } +StringRef getPatFragPredicateEnumName(const Record *R) { return R->getName(); } void GlobalISelEmitter::gatherOpcodeValues() { InstructionOpcodeMatcher::initOpcodeValuesMap(Target); @@ -484,32 +484,35 @@ void GlobalISelEmitter::gatherTypeIDValues() { void GlobalISelEmitter::gatherNodeEquivs() { assert(NodeEquivs.empty()); - for (Record *Equiv : RK.getAllDerivedDefinitions("GINodeEquiv")) + for (const Record *Equiv : RK.getAllDerivedDefinitions("GINodeEquiv")) NodeEquivs[Equiv->getValueAsDef("Node")] = Equiv; assert(ComplexPatternEquivs.empty()); - for (Record *Equiv : RK.getAllDerivedDefinitions("GIComplexPatternEquiv")) { - Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent"); + for (const Record *Equiv : + RK.getAllDerivedDefinitions("GIComplexPatternEquiv")) { + const Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent"); if (!SelDAGEquiv) continue; ComplexPatternEquivs[SelDAGEquiv] = Equiv; } assert(SDNodeXFormEquivs.empty()); - for (Record *Equiv : RK.getAllDerivedDefinitions("GISDNodeXFormEquiv")) { - Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent"); + for (const Record *Equiv : + RK.getAllDerivedDefinitions("GISDNodeXFormEquiv")) { + const Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent"); if (!SelDAGEquiv) 
continue; SDNodeXFormEquivs[SelDAGEquiv] = Equiv; } } -Record *GlobalISelEmitter::findNodeEquiv(const Record *N) const { +const Record *GlobalISelEmitter::findNodeEquiv(const Record *N) const { return NodeEquivs.lookup(N); } const CodeGenInstruction * -GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode &N) const { +GlobalISelEmitter::getEquivNode(const Record &Equiv, + const TreePatternNode &N) const { if (N.getNumChildren() >= 1) { // setcc operation maps to two different G_* instructions based on the type. if (!Equiv.isValueUnset("IfFloatingPoint") && @@ -536,7 +539,7 @@ GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode &N) const { return &Target.getInstruction(Equiv.getValueAsDef("I")); } -GlobalISelEmitter::GlobalISelEmitter(RecordKeeper &RK) +GlobalISelEmitter::GlobalISelEmitter(const RecordKeeper &RK) : GlobalISelMatchTableExecutorEmitter(), RK(RK), CGP(RK), Target(CGP.getTargetInfo()), CGRegs(Target.getRegBank()) { ClassName = Target.getName().str() + "InstructionSelector"; @@ -721,7 +724,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( const TreePatternNode &Src, unsigned &TempOpIdx) { const auto SavedFlags = Rule.setGISelFlags(Src.getGISelFlagsRecord()); - Record *SrcGIEquivOrNull = nullptr; + const Record *SrcGIEquivOrNull = nullptr; const CodeGenInstruction *SrcGIOrNull = nullptr; // Start with the defined operands (i.e., the results of the root operator). @@ -942,7 +945,7 @@ Error GlobalISelEmitter::importComplexPatternOperandMatcher( // Get the name to use for a pattern operand. For an anonymous physical register // input, this should use the register name. 
static StringRef getSrcChildName(const TreePatternNode &SrcChild, - Record *&PhysReg) { + const Record *&PhysReg) { StringRef SrcChildName = SrcChild.getName(); if (SrcChildName.empty() && SrcChild.isLeaf()) { if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { @@ -962,7 +965,7 @@ Error GlobalISelEmitter::importChildMatcher( const TreePatternNode &SrcChild, bool OperandIsAPointer, bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx) { - Record *PhysReg = nullptr; + const Record *PhysReg = nullptr; std::string SrcChildName = std::string(getSrcChildName(SrcChild, PhysReg)); if (!SrcChild.isLeaf() && SrcChild.getOperator()->isSubClassOf("ComplexPattern")) { @@ -1196,7 +1199,8 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( auto &Child = DstChild.getChild(0); auto I = SDNodeXFormEquivs.find(DstChild.getOperator()); if (I != SDNodeXFormEquivs.end()) { - Record *XFormOpc = DstChild.getOperator()->getValueAsDef("Opcode"); + const Record *XFormOpc = + DstChild.getOperator()->getValueAsDef("Opcode"); if (XFormOpc->getName() == "timm") { // If this is a TargetConstant, there won't be a corresponding // instruction to transform. 
Instead, this will refer directly to an @@ -2290,65 +2294,65 @@ void GlobalISelEmitter::emitAdditionalImpl(raw_ostream &OS) { } void GlobalISelEmitter::emitMIPredicateFns(raw_ostream &OS) { - std::vector MatchedRecords; + std::vector MatchedRecords; std::copy_if(AllPatFrags.begin(), AllPatFrags.end(), - std::back_inserter(MatchedRecords), [&](Record *R) { + std::back_inserter(MatchedRecords), [](const Record *R) { return !R->getValueAsString("GISelPredicateCode").empty(); }); - emitMIPredicateFnsImpl( + emitMIPredicateFnsImpl( OS, " const MachineFunction &MF = *MI.getParent()->getParent();\n" " const MachineRegisterInfo &MRI = MF.getRegInfo();\n" " const auto &Operands = State.RecordedOperands;\n" " (void)Operands;\n" " (void)MRI;", - ArrayRef(MatchedRecords), &getPatFragPredicateEnumName, - [&](Record *R) { return R->getValueAsString("GISelPredicateCode"); }, + ArrayRef(MatchedRecords), &getPatFragPredicateEnumName, + [](const Record *R) { return R->getValueAsString("GISelPredicateCode"); }, "PatFrag predicates."); } void GlobalISelEmitter::emitI64ImmPredicateFns(raw_ostream &OS) { - std::vector MatchedRecords; + std::vector MatchedRecords; std::copy_if(AllPatFrags.begin(), AllPatFrags.end(), - std::back_inserter(MatchedRecords), [&](Record *R) { + std::back_inserter(MatchedRecords), [](const Record *R) { bool Unset; return !R->getValueAsString("ImmediateCode").empty() && !R->getValueAsBitOrUnset("IsAPFloat", Unset) && !R->getValueAsBit("IsAPInt"); }); - emitImmPredicateFnsImpl( - OS, "I64", "int64_t", ArrayRef(MatchedRecords), + emitImmPredicateFnsImpl( + OS, "I64", "int64_t", ArrayRef(MatchedRecords), &getPatFragPredicateEnumName, - [&](Record *R) { return R->getValueAsString("ImmediateCode"); }, + [](const Record *R) { return R->getValueAsString("ImmediateCode"); }, "PatFrag predicates."); } void GlobalISelEmitter::emitAPFloatImmPredicateFns(raw_ostream &OS) { - std::vector MatchedRecords; + std::vector MatchedRecords; std::copy_if(AllPatFrags.begin(), 
AllPatFrags.end(), - std::back_inserter(MatchedRecords), [&](Record *R) { + std::back_inserter(MatchedRecords), [](const Record *R) { bool Unset; return !R->getValueAsString("ImmediateCode").empty() && R->getValueAsBitOrUnset("IsAPFloat", Unset); }); - emitImmPredicateFnsImpl( - OS, "APFloat", "const APFloat &", ArrayRef(MatchedRecords), - &getPatFragPredicateEnumName, - [&](Record *R) { return R->getValueAsString("ImmediateCode"); }, + emitImmPredicateFnsImpl( + OS, "APFloat", "const APFloat &", + ArrayRef(MatchedRecords), &getPatFragPredicateEnumName, + [](const Record *R) { return R->getValueAsString("ImmediateCode"); }, "PatFrag predicates."); } void GlobalISelEmitter::emitAPIntImmPredicateFns(raw_ostream &OS) { - std::vector MatchedRecords; + std::vector MatchedRecords; std::copy_if(AllPatFrags.begin(), AllPatFrags.end(), - std::back_inserter(MatchedRecords), [&](Record *R) { + std::back_inserter(MatchedRecords), [](const Record *R) { return !R->getValueAsString("ImmediateCode").empty() && R->getValueAsBit("IsAPInt"); }); - emitImmPredicateFnsImpl( - OS, "APInt", "const APInt &", ArrayRef(MatchedRecords), + emitImmPredicateFnsImpl( + OS, "APInt", "const APInt &", ArrayRef(MatchedRecords), &getPatFragPredicateEnumName, - [&](Record *R) { return R->getValueAsString("ImmediateCode"); }, + [](const Record *R) { return R->getValueAsString("ImmediateCode"); }, "PatFrag predicates."); } @@ -2461,7 +2465,7 @@ void GlobalISelEmitter::run(raw_ostream &OS) { return A->getName() < B->getName(); }; - std::vector ComplexPredicates = + std::vector ComplexPredicates = RK.getAllDerivedDefinitions("GIComplexOperandMatcher"); llvm::sort(ComplexPredicates, OrderByName); diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn index f3d3984ccd91c..c9489e13f9047 100644 --- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn @@ -7,15 +7,17 @@ 
static_library("SandboxIR") { ] sources = [ "Argument.cpp", + "BasicBlock.cpp", "Constant.cpp", "Context.cpp", + "Instruction.cpp", "Module.cpp", "Pass.cpp", "PassManager.cpp", "Region.cpp", - "SandboxIR.cpp", "Tracker.cpp", "Type.cpp", + "Use.cpp", "User.cpp", "Value.cpp", ] diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index f83efbd355802..d122e8a21671b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -164,6 +164,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULowerKernelAttributes.cpp", "AMDGPULowerModuleLDSPass.cpp", "AMDGPUMCInstLower.cpp", + "AMDGPUMCResourceInfo.cpp", "AMDGPUMIRFormatter.cpp", "AMDGPUMachineFunction.cpp", "AMDGPUMachineModuleInfo.cpp", @@ -181,6 +182,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPURegBankSelect.cpp", "AMDGPURegisterBankInfo.cpp", "AMDGPURemoveIncompatibleFunctions.cpp", + "AMDGPUReserveWWMRegs.cpp", "AMDGPUResourceUsageAnalysis.cpp", "AMDGPURewriteOutArguments.cpp", "AMDGPURewriteUndefForPHI.cpp", diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index 2dbc9d281dc69..41240621d4cf5 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -755,8 +755,8 @@ for Flavor in $Flavors ; do # case there are build paths in the debug info. Do the same sub- # stitution on both files in case the string occurrs naturally. if ! 
cmp -s \ - <(env LC_CTYPE=C sed -e 's,Phase1,Phase2,g' -e 's,Phase2,Phase3,g' $p2) \ - <(env LC_CTYPE=C sed -e 's,Phase1,Phase2,g' -e 's,Phase2,Phase3,g' $p3) \ + <(env LC_ALL=C sed -e 's,Phase1,Phase2,g' -e 's,Phase2,Phase3,g' $p2) \ + <(env LC_ALL=C sed -e 's,Phase1,Phase2,g' -e 's,Phase2,Phase3,g' $p3) \ 16 16; then echo "file `basename $p2` differs between phase 2 and phase 3" fi diff --git a/mlir/docs/Dialects/SPIR-V.md b/mlir/docs/Dialects/SPIR-V.md index 51ba3482a379a..c1d50f0fbe407 100644 --- a/mlir/docs/Dialects/SPIR-V.md +++ b/mlir/docs/Dialects/SPIR-V.md @@ -388,7 +388,7 @@ This corresponds to SPIR-V [struct type][StructType]. Its syntax is ``` struct-member-decoration ::= integer-literal? spirv-decoration* struct-type ::= `!spirv.struct<` spirv-type (`[` struct-member-decoration `]`)? - (`, ` spirv-type (`[` struct-member-decoration `]`)? + (`, ` spirv-type (`[` struct-member-decoration `]`)? `>` ``` For Example, diff --git a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h index 97573b6e45301..cff7957989871 100644 --- a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h +++ b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h @@ -75,7 +75,7 @@ class Identifier { template explicit Identifier(T value) : value(llvm::PointerLikeTypeTraits::getAsVoidPointer(value)) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS idType = llvm::getTypeName(); #endif } @@ -84,7 +84,7 @@ class Identifier { /// the type of the identifier used to create it. template T getValue() const { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS assert(llvm::getTypeName() == idType && "Identifier was initialized with a different type than the one used " "to retrieve it."); @@ -108,7 +108,7 @@ class Identifier { /// The value of the identifier. 
void *value = nullptr; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS /// TypeID of the identifiers in space. This should be used in asserts only. llvm::StringRef idType; #endif diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index dbec741cf1b1f..8773fc5881461 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -1096,6 +1096,7 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", ]; let hasVerifier = 1; + let hasCanonicalizer = 1; } #endif // AFFINE_OPS diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index 1036e93a03924..b08e803345f76 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -394,7 +394,7 @@ def LoopCoalescing : Pass<"affine-loop-coalescing", "func::FuncOp"> { let summary = "Coalesce nested loops with independent bounds into a single " "loop"; let constructor = "mlir::affine::createLoopCoalescingPass()"; - let dependentDialects = ["arith::ArithDialect"]; + let dependentDialects = ["affine::AffineDialect","arith::ArithDialect"]; } def SimplifyAffineStructures : Pass<"affine-simplify-structures", "func::FuncOp"> { diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td index 860f893367203..ccb1678aef919 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td @@ -99,6 +99,8 @@ def GPU_AddressSpaceEnum : GPU_I32Enum< def GPU_AddressSpaceAttr : GPU_I32EnumAttr<"address_space", GPU_AddressSpaceEnum>; +def GPU_AddressSpaceAttrArray : TypedArrayAttrBase; + //===----------------------------------------------------------------------===// // GPU Types. 
//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 6098eb34d04d5..9d89068c72969 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1355,7 +1355,8 @@ def GPU_ShuffleOp : GPU_Op< ]; } -def GPU_BarrierOp : GPU_Op<"barrier"> { +def GPU_BarrierOp : GPU_Op<"barrier">, + Arguments<(ins OptionalAttr :$address_spaces)> { let summary = "Synchronizes all work items of a workgroup."; let description = [{ The "barrier" op synchronizes all work items of a workgroup. It is used @@ -1371,11 +1372,25 @@ def GPU_BarrierOp : GPU_Op<"barrier"> { accessing the same memory can be avoided by synchronizing work items in-between these accesses. + The address space of visible memory accesses can be modified by adding a + list of address spaces required to be visible. By default all address spaces + are included. + + ```mlir + // only workgroup address spaces accesses required to be visible + gpu.barrier memfence [#gpu.address_space] + // no memory accesses required to be visible + gpu.barrier memfence [] + // all memory accesses required to be visible + gpu.barrier + ``` + Either none or all work items of a workgroup need to execute this op in convergence. }]; - let assemblyFormat = "attr-dict"; + let assemblyFormat = "(`memfence` $address_spaces^)? 
attr-dict"; let hasCanonicalizer = 1; + let builders = [OpBuilder<(ins)>]; } def GPU_GPUModuleOp : GPU_Op<"module", [ diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td index 9b29affb97c43..53d1ae10dc87d 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td @@ -56,6 +56,7 @@ def SCFParallelLoopFusion : Pass<"scf-parallel-loop-fusion"> { def TestSCFParallelLoopCollapsing : Pass<"test-scf-parallel-loop-collapsing"> { let summary = "Test parallel loops collapsing transformation"; let constructor = "mlir::createTestSCFParallelLoopCollapsingPass()"; + let dependentDialects = ["affine::AffineDialect"]; let description = [{ This pass is purely for testing the scf::collapseParallelLoops transformation. The transformation does not have opinions on how a diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index a154d7fa5fb6e..620fd7c63146d 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -293,7 +293,7 @@ class SparseTensorType { /// Returns the number of dimensions which have dynamic sizes. /// The return type is `int64_t` to maintain consistency with /// `ShapedType::Trait::getNumDynamicDims`. 
- int64_t getNumDynamicDims() const { return rtp.getNumDynamicDims(); } + size_t getNumDynamicDims() const { return rtp.getNumDynamicDims(); } ArrayRef getLvlTypes() const { return enc.getLvlTypes(); } LevelType getLvlType(Level l) const { diff --git a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h index e51aac02936b5..43193e4cd4cf6 100644 --- a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h +++ b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h @@ -196,7 +196,7 @@ class TransformState { /// should be emitted when the value is used. using InvalidatedHandleMap = DenseMap>; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS /// Debug only: A timestamp is associated with each transform IR value, so /// that invalid iterator usage can be detected more reliably. using TransformIRTimestampMapping = DenseMap; @@ -211,7 +211,7 @@ class TransformState { ValueMapping values; ValueMapping reverseValues; -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS TransformIRTimestampMapping timestamps; void incrementTimestamp(Value value) { ++timestamps[value]; } #endif // LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -248,7 +248,7 @@ class TransformState { auto getPayloadOps(Value value) const { ArrayRef view = getPayloadOpsView(value); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Memorize the current timestamp and make sure that it has not changed // when incrementing or dereferencing the iterator returned by this // function. The timestamp is incremented when the "direct" mapping is @@ -259,7 +259,7 @@ class TransformState { // When ops are replaced/erased, they are replaced with nullptr (until // the data structure is compacted). Do not enumerate these ops. 
return llvm::make_filter_range(view, [=](Operation *op) { -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS [[maybe_unused]] bool sameTimestamp = currentTimestamp == this->getMapping(value).timestamps.lookup(value); assert(sameTimestamp && "iterator was invalidated during iteration"); @@ -277,7 +277,7 @@ class TransformState { auto getPayloadValues(Value handleValue) const { ArrayRef view = getPayloadValuesView(handleValue); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Memorize the current timestamp and make sure that it has not changed // when incrementing or dereferencing the iterator returned by this // function. The timestamp is incremented when the "values" mapping is diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td index db38e2e1bce22..c9dcd546cf67c 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td @@ -166,7 +166,7 @@ def ShapedTypeInterface : TypeInterface<"ShapedType"> { /// If this is a ranked type, return the number of dimensions with dynamic /// size. Otherwise, abort. - int64_t getNumDynamicDims() const { + size_t getNumDynamicDims() const { return llvm::count_if($_type.getShape(), ::mlir::ShapedType::isDynamic); } diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td index c738a8a3becc1..b2b41b16beec2 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.td +++ b/mlir/include/mlir/IR/BuiltinTypes.td @@ -1253,7 +1253,7 @@ def Builtin_Vector : Builtin_Type<"Vector", "vector", } /// Get the number of scalable dimensions. 
- int64_t getNumScalableDims() const { + size_t getNumScalableDims() const { return llvm::count(getScalableDims(), true); } diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index 853fb318c76e7..de5f6797235e3 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -178,6 +178,7 @@ class AnyAttrOf allowedAttrs, string summary = "", summary)> { let returnType = cppType; let convertFromStorage = fromStorage; + list allowedAttributes = allowedAttrs; } def LocationAttr : Attr($_self)">, @@ -743,6 +744,8 @@ class ConfinedAttr constraints> : Attr< let isOptional = attr.isOptional; let baseAttr = attr; + + list attrConstraints = constraints; } // An AttrConstraint that holds if all attr constraints specified in diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 739a34e0aa610..f9e8e397f93f2 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -116,12 +116,31 @@ struct GPUBarrierConversion final : ConvertOpToLLVMPattern { lookupOrCreateSPIRVFn(moduleOp, funcName, flagTy, voidTy, /*isMemNone=*/false, /*isConvergent=*/true); - // Value used by SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE`. - // See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`. - constexpr int64_t localMemFenceFlag = 1; + // Value used by SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE` and + // `CLK_GLOBAL_MEM_FENCE`. See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`. 
+ constexpr int32_t localMemFenceFlag = 1; + constexpr int32_t globalMemFenceFlag = 2; + int32_t memFenceFlag = 0; + std::optional addressSpaces = adaptor.getAddressSpaces(); + if (addressSpaces) { + for (Attribute attr : addressSpaces.value()) { + auto addressSpace = cast(attr).getValue(); + switch (addressSpace) { + case gpu::AddressSpace::Global: + memFenceFlag = memFenceFlag | globalMemFenceFlag; + break; + case gpu::AddressSpace::Workgroup: + memFenceFlag = memFenceFlag | localMemFenceFlag; + break; + case gpu::AddressSpace::Private: + break; + } + } + } else { + memFenceFlag = localMemFenceFlag | globalMemFenceFlag; + } Location loc = op->getLoc(); - Value flag = - rewriter.create(loc, flagTy, localMemFenceFlag); + Value flag = rewriter.create(loc, flagTy, memFenceFlag); rewriter.replaceOp(op, createSPIRVBuiltinCall(loc, rewriter, func, flag)); return success(); } diff --git a/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td index f513bb1a0a826..0fcda38631a9b 100644 --- a/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td +++ b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td @@ -17,6 +17,6 @@ include "mlir/IR/PatternBase.td" include "mlir/Dialect/GPU/IR/GPUOps.td" include "mlir/Dialect/LLVMIR/NVVMOps.td" -def : Pat<(GPU_BarrierOp), (NVVM_Barrier0Op)>; +def : Pat<(GPU_BarrierOp : $op $memory_fence), (NVVM_Barrier0Op)>; #endif // MLIR_CONVERSION_GPUTONVVM_TD diff --git a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td index 8d2f30a9a1683..d3bb774813437 100644 --- a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td +++ b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td @@ -17,6 +17,6 @@ include "mlir/IR/PatternBase.td" include "mlir/Dialect/GPU/IR/GPUOps.td" include "mlir/Dialect/LLVMIR/ROCDLOps.td" -def : Pat<(GPU_BarrierOp), (ROCDL_BarrierOp)>; +def : Pat<(GPU_BarrierOp : $op $memory_fence), (ROCDL_BarrierOp)>; #endif // MLIR_CONVERSION_GPUTOROCDL_TD diff --git 
a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index b89888e6aa83f..1ccff7324f514 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -4537,6 +4537,133 @@ LogicalResult AffineDelinearizeIndexOp::verify() { return success(); } +namespace { + +// Drops delinearization indices that correspond to unit-extent basis +struct DropUnitExtentBasis + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineDelinearizeIndexOp delinearizeOp, + PatternRewriter &rewriter) const override { + SmallVector replacements(delinearizeOp->getNumResults(), nullptr); + std::optional zero = std::nullopt; + Location loc = delinearizeOp->getLoc(); + auto getZero = [&]() -> Value { + if (!zero) + zero = rewriter.create(loc, 0); + return zero.value(); + }; + + // Replace all indices corresponding to unit-extent basis with 0. + // Remaining basis can be used to get a new `affine.delinearize_index` op. + SmallVector newOperands; + for (auto [index, basis] : llvm::enumerate(delinearizeOp.getBasis())) { + if (matchPattern(basis, m_One())) + replacements[index] = getZero(); + else + newOperands.push_back(basis); + } + + if (newOperands.size() == delinearizeOp.getBasis().size()) + return failure(); + + if (!newOperands.empty()) { + auto newDelinearizeOp = rewriter.create( + loc, delinearizeOp.getLinearIndex(), newOperands); + int newIndex = 0; + // Map back the new delinearized indices to the values they replace. 
+ for (auto &replacement : replacements) { + if (replacement) + continue; + replacement = newDelinearizeOp->getResult(newIndex++); + } + } + + rewriter.replaceOp(delinearizeOp, replacements); + return success(); + } +}; + +/// Drop delinearization pattern related to loops in the following way +/// +/// ``` +/// (%iv) = (%c0) to (%ub) step (%c1) { +/// %0 = affine.delinearize_index %iv into (%ub) : index +/// (%0) +/// } +/// ``` +/// +/// can be canonicalized to +/// +/// ``` +/// (%iv) = (%c0) to (%ub) step (%c1) { +/// (%iv) +/// } +/// ``` +struct DropDelinearizeOfSingleLoop + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineDelinearizeIndexOp delinearizeOp, + PatternRewriter &rewriter) const override { + auto basis = delinearizeOp.getBasis(); + if (basis.size() != 1) + return failure(); + + // Check that the `linear_index` is an induction variable. + auto inductionVar = cast(delinearizeOp.getLinearIndex()); + if (!inductionVar) + return failure(); + + // Check that the parent is a `LoopLikeOpInterface`. + auto loopLikeOp = cast( + inductionVar.getParentRegion()->getParentOp()); + if (!loopLikeOp) + return failure(); + + // Check that loop is unit-rank and that the `linear_index` is the induction + // variable. + auto inductionVars = loopLikeOp.getLoopInductionVars(); + if (!inductionVars || inductionVars->size() != 1 || + inductionVars->front() != inductionVar) { + return rewriter.notifyMatchFailure( + delinearizeOp, "`linear_index` is not loop induction variable"); + } + + // Check that the upper-bound is the basis. + auto upperBounds = loopLikeOp.getLoopUpperBounds(); + if (!upperBounds || upperBounds->size() != 1 || + upperBounds->front() != getAsOpFoldResult(basis.front())) { + return rewriter.notifyMatchFailure(delinearizeOp, + "`basis` is not upper bound"); + } + + // Check that the lower bound is zero. 
+ auto lowerBounds = loopLikeOp.getLoopLowerBounds(); + if (!lowerBounds || lowerBounds->size() != 1 || + !isZeroIndex(lowerBounds->front())) { + return rewriter.notifyMatchFailure(delinearizeOp, + "loop lower bound is not zero"); + } + + // Check that the step is one. + auto steps = loopLikeOp.getLoopSteps(); + if (!steps || steps->size() != 1 || !isConstantIntValue(steps->front(), 1)) + return rewriter.notifyMatchFailure(delinearizeOp, "loop step is not one"); + + rewriter.replaceOp(delinearizeOp, inductionVar); + return success(); + } +}; + +} // namespace + +void affine::AffineDelinearizeIndexOp::getCanonicalizationPatterns( + RewritePatternSet &patterns, MLIRContext *context) { + patterns.insert(context); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index 04a8ff30ee946..f1841b860ff81 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -249,8 +249,7 @@ AllocTensorOp::getBufferType(Value value, const BufferizationOptions &options, LogicalResult AllocTensorOp::verify() { if (getCopy() && !getDynamicSizes().empty()) return emitError("dynamic sizes not needed when copying a tensor"); - if (!getCopy() && getType().getNumDynamicDims() != - static_cast(getDynamicSizes().size())) + if (!getCopy() && getType().getNumDynamicDims() != getDynamicSizes().size()) return emitError("expected ") << getType().getNumDynamicDims() << " dynamic sizes"; if (getCopy() && getCopy().getType() != getType()) diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index f822c11aeec00..156d6b8fe1595 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ 
-1351,6 +1351,9 @@ void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(eraseRedundantGpuBarrierOps); } +void BarrierOp::build(mlir::OpBuilder &odsBuilder, + mlir::OperationState &odsState) {} + //===----------------------------------------------------------------------===// // GPUFuncOp //===----------------------------------------------------------------------===// @@ -2045,8 +2048,7 @@ void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult AllocOp::verify() { auto memRefType = llvm::cast(getMemref().getType()); - if (static_cast(getDynamicSizes().size()) != - memRefType.getNumDynamicDims()) + if (getDynamicSizes().size() != memRefType.getNumDynamicDims()) return emitOpError("dimension operand count does not equal memref " "dynamic dimension count"); diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 75b9729e63648..d579a27359dfa 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -205,8 +205,7 @@ static LogicalResult verifyAllocLikeOp(AllocLikeOp op) { if (!memRefType) return op.emitOpError("result must be a memref"); - if (static_cast(op.getDynamicSizes().size()) != - memRefType.getNumDynamicDims()) + if (op.getDynamicSizes().size() != memRefType.getNumDynamicDims()) return op.emitOpError("dimension operand count does not equal memref " "dynamic dimension count"); @@ -283,8 +282,7 @@ struct SimplifyAllocConst : public OpRewritePattern { // Create new memref type (which will have fewer dynamic dimensions). MemRefType newMemRefType = MemRefType::Builder(memrefType).setShape(newShapeConstants); - assert(static_cast(dynamicSizes.size()) == - newMemRefType.getNumDynamicDims()); + assert(dynamicSizes.size() == newMemRefType.getNumDynamicDims()); // Create and insert the alloc op for the new memref. 
auto newAlloc = rewriter.create( diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td index 93ea6e4e43698..28c45e6846380 100644 --- a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td +++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td @@ -11,12 +11,9 @@ include "mlir/Dialect/Arith/IR/ArithOps.td" include "mlir/Dialect/Polynomial/IR/Polynomial.td" -include "mlir/IR/EnumAttr.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" -defvar DefOverflow = ConstantEnumCase; - def Equal : Constraint>; // Get a -1 integer attribute of the same type as the polynomial SSA value's @@ -44,40 +41,4 @@ def NTTAfterINTT : Pat< [(Equal $r1, $r2)] >; -// NTTs are expensive, and addition in coefficient or NTT domain should be -// equivalently expensive, so reducing the number of NTTs is optimal. -// ntt(a) + ntt(b) -> ntt(a + b) -def NTTOfAdd : Pat< - (Arith_AddIOp - (Polynomial_NTTOp $p1, $r1), - (Polynomial_NTTOp $p2, $r2), - $overflow), - (Polynomial_NTTOp (Polynomial_AddOp $p1, $p2), $r1), - [(Equal $r1, $r2)] ->; -// intt(a) + intt(b) -> intt(a + b) -def INTTOfAdd : Pat< - (Polynomial_AddOp - (Polynomial_INTTOp $t1, $r1), - (Polynomial_INTTOp $t2, $r2)), - (Polynomial_INTTOp (Arith_AddIOp $t1, $t2, DefOverflow), $r1), - [(Equal $r1, $r2)] ->; -// repeated for sub -def NTTOfSub : Pat< - (Arith_SubIOp - (Polynomial_NTTOp $p1, $r1), - (Polynomial_NTTOp $p2, $r2), - $overflow), - (Polynomial_NTTOp (Polynomial_SubOp $p1, $p2), $r1), - [(Equal $r1, $r2)] ->; -def INTTOfSub : Pat< - (Polynomial_SubOp - (Polynomial_INTTOp $t1, $r1), - (Polynomial_INTTOp $t2, $r2)), - (Polynomial_INTTOp (Arith_SubIOp $t1, $t2, DefOverflow), $r1), - [(Equal $r1, $r2)] ->; - #endif // POLYNOMIAL_CANONICALIZATION diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp index 2ba13bb7dab56..460ef17167e80 100644 --- 
a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp +++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp @@ -289,10 +289,10 @@ void SubOp::getCanonicalizationPatterns(RewritePatternSet &results, void NTTOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } void INTTOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp index 6ba7020e86fa6..358a3b38a4cd3 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp @@ -8,6 +8,7 @@ #include "mlir/Dialect/SCF/Transforms/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Transforms/RegionUtils.h" diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 7cfd772a72b17..50cfd29e6bf90 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1481,21 +1481,29 @@ checkAssumptionForFusingConsumer(tensor::InsertSliceOp candidateSliceOp) { /// failure otherwise. static FailureOr getConsumerFromUses(Value val, Block *containingOpBlock) { - // Step 1. Check that the value has exactly one use. - if (!llvm::hasSingleElement(val.getUses())) - return failure(); - // Step 2. Get uses. - OpOperand &operand = (*val.getUses().begin()); - Operation *consumerOp = operand.getOwner(); - // TODO: We have to init result of consumer before scf.for, use - // DestinationStyleOpInterface to get result shape from init for now. - // Add support for other op such as op has InferTypeOpInterface. 
- if (!isa(consumerOp) || - !isa(consumerOp)) - return failure(); - if (containingOpBlock != consumerOp->getBlock()) - return failure(); - return &operand; + // Check that the value has exactly one use which isn't a scf.yield or a + // tensor.parallel_insert_slice op. + OpOperand *operand = nullptr; + for (OpOperand &opOperand : val.getUses()) { + Operation *consumerOp = opOperand.getOwner(); + if (isa(consumerOp)) + continue; + if (operand) + return failure(); + // TODO: We have to init result of consumer before scf.for, use + // DestinationStyleOpInterface to get result shape from init for now. + // Add support for other op such as op has InferTypeOpInterface. + if (!isa(consumerOp) || + !isa(consumerOp)) + return failure(); + if (containingOpBlock != consumerOp->getBlock()) + return failure(); + operand = &opOperand; + } + + if (operand) + return operand; + return failure(); } /// Find the perfectly nested loops outside of given loop(included) sorted from diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index a794a121d6267..43fcc595af0f7 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -671,9 +672,26 @@ LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp, return success(); } +Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter, Location loc, + OpFoldResult lb, OpFoldResult ub, + OpFoldResult step) { + Range normalizedLoopBounds; + normalizedLoopBounds.offset = rewriter.getIndexAttr(0); + normalizedLoopBounds.stride = rewriter.getIndexAttr(1); + AffineExpr s0, s1, s2; + bindSymbols(rewriter.getContext(), s0, s1, s2); + AffineExpr e = (s1 - s0).ceilDiv(s2); + normalizedLoopBounds.size = + 
affine::makeComposedFoldedAffineApply(rewriter, loc, e, {lb, ub, step}); + return normalizedLoopBounds; +} + Range mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step) { + if (getType(lb).isIndex()) { + return emitNormalizedLoopBoundsForIndexType(rewriter, loc, lb, ub, step); + } // For non-index types, generate `arith` instructions // Check if the loop is already known to have a constant zero lower bound or // a constant one step. @@ -714,9 +732,38 @@ Range mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, return {newLowerBound, newUpperBound, newStep}; } +static void denormalizeInductionVariableForIndexType(RewriterBase &rewriter, + Location loc, + Value normalizedIv, + OpFoldResult origLb, + OpFoldResult origStep) { + AffineExpr d0, s0, s1; + bindSymbols(rewriter.getContext(), s0, s1); + bindDims(rewriter.getContext(), d0); + AffineExpr e = d0 * s1 + s0; + OpFoldResult denormalizedIv = affine::makeComposedFoldedAffineApply( + rewriter, loc, e, ArrayRef{normalizedIv, origLb, origStep}); + Value denormalizedIvVal = + getValueOrCreateConstantIndexOp(rewriter, loc, denormalizedIv); + SmallPtrSet preservedUses; + // If an `affine.apply` operation is generated for denormalization, the use + // of `origLb` in those ops must not be replaced. These aren't generated
+ if (!isConstantIntValue(origLb, 0) || !isConstantIntValue(origStep, 1)) { + if (Operation *preservedUse = denormalizedIvVal.getDefiningOp()) { + preservedUses.insert(preservedUse); + } + } + rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIvVal, preservedUses); +} + void mlir::denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep) { + if (getType(origLb).isIndex()) { + return denormalizeInductionVariableForIndexType(rewriter, loc, normalizedIv, + origLb, origStep); + } Value denormalizedIv; SmallPtrSet preserve; bool isStepOne = isConstantIntValue(origStep, 1); @@ -739,10 +786,29 @@ void mlir::denormalizeInductionVariable(RewriterBase &rewriter, Location loc, rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve); } +static OpFoldResult getProductOfIndexes(RewriterBase &rewriter, Location loc, + ArrayRef values) { + assert(!values.empty() && "unexpected empty array"); + AffineExpr s0, s1; + bindSymbols(rewriter.getContext(), s0, s1); + AffineExpr mul = s0 * s1; + OpFoldResult products = rewriter.getIndexAttr(1); + for (auto v : values) { + products = affine::makeComposedFoldedAffineApply( + rewriter, loc, mul, ArrayRef{products, v}); + } + return products; +} + /// Helper function to multiply a sequence of values.
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef values) { assert(!values.empty() && "unexpected empty list"); + if (getType(values.front()).isIndex()) { + SmallVector ofrs = getAsOpFoldResult(values); + OpFoldResult product = getProductOfIndexes(rewriter, loc, ofrs); + return getValueOrCreateConstantIndexOp(rewriter, loc, product); + } std::optional productOf; for (auto v : values) { auto vOne = getConstantIntValue(v); @@ -757,7 +823,7 @@ static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, if (!productOf) { productOf = rewriter .create( - loc, rewriter.getOneAttr(values.front().getType())) + loc, rewriter.getOneAttr(getType(values.front()))) .getResult(); } return productOf.value(); @@ -774,6 +840,16 @@ static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, static std::pair, SmallPtrSet> delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef ubs) { + + if (linearizedIv.getType().isIndex()) { + Operation *delinearizedOp = + rewriter.create(loc, linearizedIv, + ubs); + auto resultVals = llvm::map_to_vector( + delinearizedOp->getResults(), [](OpResult r) -> Value { return r; }); + return {resultVals, SmallPtrSet{delinearizedOp}}; + } + SmallVector delinearizedIvs(ubs.size()); SmallPtrSet preservedUsers; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index cc372ed1be621..60db71d96547f 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -881,25 +881,27 @@ struct TensorReshapeRewriter : public OpRewritePattern { PatternRewriter &rewriter) const override { Location loc = op.getLoc(); Value srcTensor = op.getSource(); - const auto srcTp = getSparseTensorType(srcTensor); - const auto dstTp = getSparseTensorType(op.getResult()); + const auto srcTp = 
tryGetSparseTensorType(srcTensor); + const auto dstTp = tryGetSparseTensorType(op.getResult()); + if (!srcTp || !dstTp) + return failure(); - if (!srcTp.hasEncoding() || !dstTp.hasEncoding() || - !dstTp.hasStaticDimShape()) + if (!srcTp->hasEncoding() || !dstTp->hasEncoding() || + !dstTp->hasStaticDimShape()) return failure(); SmallVector srcSizes; - sizesForTensor(rewriter, srcSizes, loc, srcTp, srcTensor); + sizesForTensor(rewriter, srcSizes, loc, *srcTp, srcTensor); SmallVector dstSizes; - for (Dimension d : dstTp.getDimShape()) + for (Dimension d : dstTp->getDimShape()) dstSizes.push_back(constantIndex(rewriter, loc, d)); Value nnz = rewriter.create(loc, srcTensor); // Only need an unordered COO buffer if input and output are not sorted // in the same way. Type bufferTp = getBufferType( - dstTp.withoutDimToLvl(), - !srcTp.isAllOrdered() || !srcTp.isIdentity() || !dstTp.isIdentity()); + dstTp->withoutDimToLvl(), + !srcTp->isAllOrdered() || !srcTp->isIdentity() || !dstTp->isIdentity()); SmallVector dynSizes; Value buffer = rewriter .create(loc, bufferTp, dynSizes, Value(), @@ -917,12 +919,12 @@ struct TensorReshapeRewriter : public OpRewritePattern { // followed by an optional // %t = sparse_tensor.cast %tmp // depending on whether the input/output are sorted in the same way. 
- const auto encSrc = srcTp.getEncoding(); + const auto encSrc = srcTp->getEncoding(); ForeachOp foreachOp = rewriter.create( loc, srcTensor, buffer, [&](OpBuilder &builder, Location loc, ValueRange srcLcvs, Value v, ValueRange reduc) { - const Dimension srcRank = srcTp.getDimRank(); + const Dimension srcRank = srcTp->getDimRank(); SmallVector srcDcvs; srcDcvs.reserve(srcRank); for (Dimension d = 0; d < srcRank; d++) { @@ -945,7 +947,7 @@ struct TensorReshapeRewriter : public OpRewritePattern { collapsedSizes, collapsedDcvs); ReassociationIndices expandIdx; - for (Dimension i = 0; i < dstTp.getDimRank(); i++) + for (Dimension i = 0; i < dstTp->getDimRank(); i++) expandIdx.push_back(i); SmallVector expandReass = {expandIdx}; SmallVector dstDcvs; @@ -958,8 +960,8 @@ struct TensorReshapeRewriter : public OpRewritePattern { }); Value t = rewriter.create(loc, foreachOp.getResult(0), true); - if (bufferTp != dstTp) { - auto dstRTT = dstTp.getRankedTensorType(); + if (bufferTp != *dstTp) { + auto dstRTT = dstTp->getRankedTensorType(); Value converted = rewriter.create(loc, dstRTT, t).getResult(); rewriter.create(loc, t); t = converted; @@ -1139,13 +1141,13 @@ struct SparseTensorDimOpRewriter : public OpRewritePattern { LogicalResult matchAndRewrite(tensor::DimOp op, PatternRewriter &rewriter) const override { std::optional dim = op.getConstantIndex(); - auto stt = getSparseTensorType(op.getSource()); - if (!dim || !stt.hasEncoding()) + auto stt = tryGetSparseTensorType(op.getSource()); + if (!dim || !stt || !stt->hasEncoding()) return failure(); - if (stt.isPermutation()) { + if (stt->isPermutation()) { rewriter.replaceOpWithNewOp(op, op.getSource(), - toLvl(stt.getEncoding(), *dim)); + toLvl(stt->getEncoding(), *dim)); return success(); } @@ -1157,16 +1159,16 @@ struct SparseTensorDimOpRewriter : public OpRewritePattern { // computed simply by lvl_size * block_size. 
Location loc = op.getLoc(); SmallVector maxLvlCrds; - for (Level l = 0; l < stt.getLvlRank(); l++) { + for (Level l = 0; l < stt->getLvlRank(); l++) { Value lvlSz = rewriter.create(loc, op.getSource(), l); Value maxLvlCrd = rewriter.create( loc, lvlSz, constantOne(rewriter, loc, rewriter.getIndexType())); maxLvlCrds.push_back(maxLvlCrd); } - AffineExpr lvl2DimExp = stt.getLvlToDim().getResult(*dim); + AffineExpr lvl2DimExp = stt->getLvlToDim().getResult(*dim); Value maxDimCrd = rewriter.create( - op.getLoc(), AffineMap::get(stt.getLvlRank(), 0, lvl2DimExp), + op.getLoc(), AffineMap::get(stt->getLvlRank(), 0, lvl2DimExp), maxLvlCrds); Value dimSz = rewriter.create( diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 1ac96756e22b5..defac8308b909 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -179,8 +179,7 @@ static RankedTensorType foldDynamicToStaticDimSizes(RankedTensorType type, ValueRange dynamicSizes, SmallVector &foldedDynamicSizes) { SmallVector staticShape(type.getShape()); - assert(type.getNumDynamicDims() == - static_cast(dynamicSizes.size()) && + assert(type.getNumDynamicDims() == dynamicSizes.size() && "incorrect number of dynamic sizes"); // Compute new static and dynamic sizes. 
@@ -894,8 +893,7 @@ void EmptyOp::build(OpBuilder &builder, OperationState &result, } LogicalResult EmptyOp::verify() { - if (getType().getNumDynamicDims() != - static_cast(getDynamicSizes().size())) + if (getType().getNumDynamicDims() != getDynamicSizes().size()) return emitOpError("incorrect number of dynamic sizes, has ") << getDynamicSizes().size() << ", expected " << getType().getNumDynamicDims(); @@ -3672,8 +3670,7 @@ void SplatOp::getAsmResultNames( } LogicalResult SplatOp::verify() { - if (getType().getNumDynamicDims() != - static_cast(getDynamicSizes().size())) + if (getType().getNumDynamicDims() != getDynamicSizes().size()) return emitOpError("incorrect number of dynamic sizes, has ") << getDynamicSizes().size() << ", expected " << getType().getNumDynamicDims(); diff --git a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp index fdd968238667e..91702ce7cc42b 100644 --- a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp +++ b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp @@ -330,7 +330,7 @@ void transform::TransformState::forgetMapping(Value opHandle, for (Operation *op : mappings.direct[opHandle]) dropMappingEntry(mappings.reverse, op, opHandle); mappings.direct.erase(opHandle); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Payload IR is removed from the mapping. This invalidates the respective // iterators. mappings.incrementTimestamp(opHandle); @@ -342,7 +342,7 @@ void transform::TransformState::forgetMapping(Value opHandle, for (Value resultHandle : resultHandles) { Mappings &localMappings = getMapping(resultHandle); dropMappingEntry(localMappings.values, resultHandle, opResult); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Payload IR is removed from the mapping. This invalidates the respective // iterators. 
mappings.incrementTimestamp(resultHandle); @@ -358,7 +358,7 @@ void transform::TransformState::forgetValueMapping( for (Value payloadValue : mappings.reverseValues[valueHandle]) dropMappingEntry(mappings.reverseValues, payloadValue, valueHandle); mappings.values.erase(valueHandle); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Payload IR is removed from the mapping. This invalidates the respective // iterators. mappings.incrementTimestamp(valueHandle); @@ -372,7 +372,7 @@ void transform::TransformState::forgetValueMapping( dropMappingEntry(localMappings.direct, opHandle, payloadOp); dropMappingEntry(localMappings.reverse, payloadOp, opHandle); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Payload IR is removed from the mapping. This invalidates the respective // iterators. localMappings.incrementTimestamp(opHandle); @@ -452,7 +452,7 @@ transform::TransformState::replacePayloadValue(Value value, Value replacement) { // between the handles and the IR objects if (!replacement) { dropMappingEntry(mappings.values, handle, value); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS // Payload IR is removed from the mapping. This invalidates the respective // iterators. mappings.incrementTimestamp(handle); @@ -804,7 +804,7 @@ checkRepeatedConsumptionInOperand(ArrayRef payload, void transform::TransformState::compactOpHandles() { for (Value handle : opHandlesToCompact) { Mappings &mappings = getMapping(handle, /*allowOutOfScope=*/true); -#if LLVM_ENABLE_ABI_BREAKING_CHECKS +#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS if (llvm::find(mappings.direct[handle], nullptr) != mappings.direct[handle].end()) // Payload IR is removed from the mapping. 
This invalidates the respective diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index d0fd8e420d38e..7f95f5ace8c00 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -545,6 +545,10 @@ class SymbolAlias { bool isType : 1; /// A flag indicating whether this alias may be deferred or not. bool isDeferrable : 1; + +public: + /// Used to avoid printing incomplete aliases for recursive types. + bool isPrinted = false; }; /// This class represents a utility that initializes the set of attribute and @@ -1222,6 +1226,8 @@ LogicalResult AliasState::getAlias(Type ty, raw_ostream &os) const { const auto *it = attrTypeToAlias.find(ty.getAsOpaquePointer()); if (it == attrTypeToAlias.end()) return failure(); + if (!it->second.isPrinted) + return failure(); it->second.print(os); return success(); @@ -1238,12 +1244,9 @@ void AliasState::printAliases(AsmPrinter::Impl &p, NewLineCounter &newLine, p.getStream() << " = "; if (alias.isTypeAlias()) { - // TODO: Support nested aliases in mutable types. Type type = Type::getFromOpaquePointer(opaqueSymbol); - if (type.hasTrait()) - p.getStream() << type; - else - p.printTypeImpl(type); + p.printTypeImpl(type); + alias.isPrinted = true; } else { // TODO: Support nested aliases in mutable attributes. 
Attribute attr = Attribute::getFromOpaquePointer(opaqueSymbol); diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 72d85d796dd4a..46b7b0a473c69 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -130,7 +130,7 @@ convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::Intrinsic::ID id = - llvm::Function::lookupIntrinsicID(op.getIntrinAttr()); + llvm::Intrinsic::lookupIntrinsicID(op.getIntrinAttr()); if (!id) return mlir::emitError(op.getLoc(), "could not find LLVM intrinsic: ") << op.getIntrinAttr(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0cba8d80681f1..d788fe1f6165e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3359,8 +3359,7 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS); if (allocaAS != defaultAS && arg.getType()->isPointerTy()) - v = builder.CreatePointerBitCastOrAddrSpaceCast( - v, arg.getType()->getPointerTo(defaultAS)); + v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS)); builder.CreateStore(&arg, v); diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 69036e947ebdb..4693edadfb5ee 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -71,10 +71,16 @@ namespace { /// This class wraps a IRMapping to provide recursive lookup /// functionality, i.e. 
we will traverse if the mapped value also has a mapping. struct ConversionValueMapping { - /// Lookup a mapped value within the map. If a mapping for the provided value - /// does not exist then return the provided value. If `desiredType` is - /// non-null, returns the most recently mapped value with that type. If an - /// operand of that type does not exist, defaults to normal behavior. + /// Lookup the most recently mapped value with the desired type in the + /// mapping. + /// + /// Special cases: + /// - If the desired type is "null", simply return the most recently mapped + /// value. + /// - If there is no mapping to the desired type, also return the most + /// recently mapped value. + /// - If there is no mapping for the given value at all, return the given + /// value. Value lookupOrDefault(Value from, Type desiredType = nullptr) const; /// Lookup a mapped value within the map, or return null if a mapping does not @@ -115,19 +121,11 @@ struct ConversionValueMapping { Value ConversionValueMapping::lookupOrDefault(Value from, Type desiredType) const { - // If there was no desired type, simply find the leaf value. - if (!desiredType) { - // If this value had a valid mapping, unmap that value as well in the case - // that it was also replaced. - while (auto mappedValue = mapping.lookupOrNull(from)) - from = mappedValue; - return from; - } - - // Otherwise, try to find the deepest value that has the desired type. + // Try to find the deepest value that has the desired type. If there is no + // such value, simply return the deepest value. 
Value desiredValue; do { - if (from.getType() == desiredType) + if (!desiredType || from.getType() == desiredType) desiredValue = from; Value mappedValue = mapping.lookupOrNull(from); @@ -1136,7 +1134,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( MaterializationKind::Target, computeInsertPoint(newOperand), operandLoc, /*inputs=*/newOperand, /*outputType=*/desiredType, currentTypeConverter); - mapping.map(mapping.lookupOrDefault(newOperand), castValue); + mapping.map(newOperand, castValue); newOperand = castValue; } remapped.push_back(newOperand); diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir index 910105ddf6958..4767565ea0550 100644 --- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir +++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir @@ -213,14 +213,29 @@ gpu.module @barriers { // CHECK-LABEL: gpu_barrier func.func @gpu_barrier() { - // CHECK: [[FLAGS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) { + // CHECK: [[GLOBAL_AND_LOCAL_FLAG:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL_AND_LOCAL_FLAG]]) { // CHECK-SAME-DAG: no_unwind // CHECK-SAME-DAG: convergent // CHECK-SAME-DAG: will_return // CHECK-NOT: memory_effects = #llvm.memory_effects // CHECK-SAME: } : (i32) -> () gpu.barrier + // CHECK: [[GLOBAL_AND_LOCAL_FLAG2:%.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL_AND_LOCAL_FLAG2]]) + gpu.barrier memfence [#gpu.address_space, #gpu.address_space] + // CHECK: [[LOCAL_FLAG:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[LOCAL_FLAG]]) + gpu.barrier memfence [#gpu.address_space] + // CHECK: [[GLOBAL_FLAG:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL_FLAG]]) + gpu.barrier memfence [#gpu.address_space] + // CHECK: 
[[NONE_FLAG:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG]]) + gpu.barrier memfence [] + // CHECK: [[NONE_FLAG2:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG2]]) + gpu.barrier memfence [#gpu.address_space] return } } diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 730808dbbdf18..ff0e987bcef6c 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1466,3 +1466,51 @@ func.func @prefetch_canonicalize(%arg0: memref<512xf32>) -> () { } return } + +// ----- + +func.func @drop_unit_basis_in_delinearize(%arg0 : index, %arg1 : index, %arg2 : index) -> + (index, index, index, index, index, index) { + %c1 = arith.constant 1 : index + %0:6 = affine.delinearize_index %arg0 into (%c1, %arg1, %c1, %c1, %arg2, %c1) + : index, index, index, index, index, index + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : index, index, index, index, index, index +} +// CHECK-LABEL: func @drop_unit_basis_in_delinearize( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[DELINEARIZE:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], %[[ARG2]]) +// CHECK: return %[[C0]], %[[DELINEARIZE]]#0, %[[C0]], %[[C0]], %[[DELINEARIZE]]#1, %[[C0]] + +// ----- + +func.func @drop_all_unit_bases(%arg0 : index) -> (index, index) { + %c1 = arith.constant 1 : index + %0:2 = affine.delinearize_index %arg0 into (%c1, %c1) : index, index + return %0#0, %0#1 : index, index +} +// CHECK-LABEL: func @drop_all_unit_bases( +// CHECK-SAME: %[[ARG0:.+]]: index) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-NOT: affine.delinearize_index +// CHECK: return %[[C0]], %[[C0]] + +// ----- + +func.func 
@drop_single_loop_delinearize(%arg0 : index, %arg1 : index) -> index { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %2 = scf.for %iv = %c0 to %arg1 step %c1 iter_args(%arg2 = %c0) -> index { + %0 = affine.delinearize_index %iv into (%arg1) : index + %1 = "some_use"(%arg2, %0) : (index, index) -> (index) + scf.yield %1 : index + } + return %2 : index +} +// CHECK-LABEL: func @drop_single_loop_delinearize( +// CHECK-SAME: %[[ARG0:.+]]: index) +// CHECK: scf.for %[[IV:[a-zA-Z0-9]+]] = +// CHECK-NOT: affine.delinearize_index +// CHECK: "some_use"(%{{.+}}, %[[IV]]) diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir index 45dd299295f64..f6e7b21bc66ab 100644 --- a/mlir/test/Dialect/Affine/loop-coalescing.mlir +++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir @@ -1,14 +1,15 @@ -// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse %s | FileCheck %s +// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse --mlir-print-local-scope %s | FileCheck %s // CHECK-LABEL: @one_3d_nest func.func @one_3d_nest() { // Capture original bounds. Note that for zero-based step-one loops, the // upper bound is also the number of iterations. 
- // CHECK: %[[orig_lb:.*]] = arith.constant 0 - // CHECK: %[[orig_step:.*]] = arith.constant 1 - // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 - // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 - // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0 + // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1 + // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[range:.*]] = arith.constant 7056 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index @@ -16,9 +17,6 @@ func.func @one_3d_nest() { %c42 = arith.constant 42 : index %c56 = arith.constant 56 : index // The range of the new scf. - // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] - // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] - // Updated loop bounds. // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] scf.for %i = %c0 to %c42 step %c1 { @@ -26,13 +24,11 @@ func.func @one_3d_nest() { // CHECK-NOT: scf.for // Reconstruct original IVs from the linearized one. 
- // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] - // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]] + // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]]) scf.for %j = %c0 to %c56 step %c1 { scf.for %k = %c0 to %c3 step %c1 { - // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2) "use"(%i, %j, %k) : (index, index, index) -> () } } @@ -40,6 +36,8 @@ func.func @one_3d_nest() { return } +// ----- + // Check that there is no chasing the replacement of value uses by ensuring // multiple uses of loop induction variables get rewritten to the same values. @@ -52,13 +50,10 @@ func.func @multi_use() { scf.for %i = %c1 to %c10 step %c1 { scf.for %j = %c1 to %c10 step %c1 { scf.for %k = %c1 to %c10 step %c1 { - // CHECK: %[[k_unshifted:.*]] = arith.remsi %[[iv]], %[[k_extent:.*]] - // CHECK: %[[ij:.*]] = arith.divsi %[[iv]], %[[k_extent]] - // CHECK: %[[j_unshifted:.*]] = arith.remsi %[[ij]], %[[j_extent:.*]] - // CHECK: %[[i_unshifted:.*]] = arith.divsi %[[ij]], %[[j_extent]] - // CHECK: %[[k:.*]] = arith.addi %[[k_unshifted]] - // CHECK: %[[j:.*]] = arith.addi %[[j_unshifted]] - // CHECK: %[[i:.*]] = arith.addi %[[i_unshifted]] + // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[iv]] + // CHECK: %[[k:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#2) + // CHECK: %[[j:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#1) + // CHECK: %[[i:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#0) // CHECK: "use1"(%[[i]], %[[j]], %[[k]]) "use1"(%i,%j,%k) : (index,index,index) -> () @@ -72,12 +67,20 @@ func.func @multi_use() { return } +// ----- + func.func @unnormalized_loops() { - // CHECK: 
%[[orig_step_i:.*]] = arith.constant 2 + // Normalized lower bound and step for the outer scf. + // CHECK-DAG: %[[lb_i:.*]] = arith.constant 0 + // CHECK-DAG: %[[step_i:.*]] = arith.constant 1 + // CHECK-DAG: %[[orig_step_j_and_numiter_i:.*]] = arith.constant 3 + + // Number of iterations in the inner loop, the pattern is the same as above, + // only capture the final result. + // CHECK-DAG: %[[numiter_j:.*]] = arith.constant 4 + + // CHECK-DAG: %[[range:.*]] = arith.constant 12 - // CHECK: %[[orig_step_j_and_numiter_i:.*]] = arith.constant 3 - // CHECK: %[[orig_lb_i:.*]] = arith.constant 5 - // CHECK: %[[orig_lb_j:.*]] = arith.constant 7 %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %c5 = arith.constant 5 : index @@ -85,28 +88,18 @@ func.func @unnormalized_loops() { %c10 = arith.constant 10 : index %c17 = arith.constant 17 : index - // Normalized lower bound and step for the outer scf. - // CHECK: %[[lb_i:.*]] = arith.constant 0 - // CHECK: %[[step_i:.*]] = arith.constant 1 - - // Number of iterations in the inner loop, the pattern is the same as above, - // only capture the final result. - // CHECK: %[[numiter_j:.*]] = arith.constant 4 // New bounds of the outer scf. - // CHECK: %[[range:.*]] = arith.muli %[[orig_step_j_and_numiter_i:.*]], %[[numiter_j]] // CHECK: scf.for %[[i:.*]] = %[[lb_i]] to %[[range]] step %[[step_i]] scf.for %i = %c5 to %c10 step %c2 { // The inner loop has been removed. // CHECK-NOT: scf.for scf.for %j = %c7 to %c17 step %c3 { // The IVs are rewritten. 
- // CHECK: %[[normalized_j:.*]] = arith.remsi %[[i]], %[[numiter_j]] - // CHECK: %[[normalized_i:.*]] = arith.divsi %[[i]], %[[numiter_j]] - // CHECK: %[[scaled_j:.*]] = arith.muli %[[normalized_j]], %[[orig_step_j_and_numiter_i]] - // CHECK: %[[orig_j:.*]] = arith.addi %[[scaled_j]], %[[orig_lb_j]] - // CHECK: %[[scaled_i:.*]] = arith.muli %[[normalized_i]], %[[orig_step_i]] - // CHECK: %[[orig_i:.*]] = arith.addi %[[scaled_i]], %[[orig_lb_i]] + // CHECK: %[[delinearize:.+]]:2 = affine.delinearize_index %[[i]] + // CHECK-SAME: into (%[[orig_step_j_and_numiter_i]], %[[numiter_j]]) + // CHECK: %[[orig_j:.*]] = affine.apply affine_map<(d0) -> (d0 * 3 + 7)>(%[[delinearize]]#1) + // CHECK: %[[orig_i:.*]] = affine.apply affine_map<(d0) -> (d0 * 2 + 5)>(%[[delinearize]]#0) // CHECK: "use"(%[[orig_i]], %[[orig_j]]) "use"(%i, %j) : (index, index) -> () } @@ -114,20 +107,21 @@ func.func @unnormalized_loops() { return } +// ----- + func.func @noramalized_loops_with_yielded_iter_args() { - // CHECK: %[[orig_lb:.*]] = arith.constant 0 - // CHECK: %[[orig_step:.*]] = arith.constant 1 - // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 - // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 - // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0 + // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1 + // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK-DAG: %[[range:.*]] = arith.constant 7056 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c42 = arith.constant 42 : index %c56 = arith.constant 56 : index // The range of the new scf. - // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] - // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] // Updated loop bounds. 
// CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) { @@ -136,13 +130,10 @@ func.func @noramalized_loops_with_yielded_iter_args() { // CHECK-NOT: scf.for // Reconstruct original IVs from the linearized one. - // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] - // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]] into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]]) %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){ %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) { - // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2) "use"(%i, %j, %k) : (index, index, index) -> () // CHECK: scf.yield %[[VAL_1]] : index scf.yield %arg2 : index @@ -154,20 +145,21 @@ func.func @noramalized_loops_with_yielded_iter_args() { return } +// ----- + func.func @noramalized_loops_with_shuffled_yielded_iter_args() { - // CHECK: %[[orig_lb:.*]] = arith.constant 0 - // CHECK: %[[orig_step:.*]] = arith.constant 1 - // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 - // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 - // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0 + // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1 + // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c42 = arith.constant 42 : index %c56 = arith.constant 56 : index // The range of the new scf. 
- // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] - // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] + // CHECK-DAG:%[[range:.*]] = arith.constant 7056 // Updated loop bounds. // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]], %[[VAL_2:.*]] = %[[orig_lb]]) -> (index, index) { @@ -176,13 +168,11 @@ func.func @noramalized_loops_with_shuffled_yielded_iter_args() { // CHECK-NOT: scf.for // Reconstruct original IVs from the linearized one. - // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] - // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]] + // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]]) %1:2 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (index, index){ %0:2 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (index, index) { - // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2) "use"(%i, %j, %k) : (index, index, index) -> () // CHECK: scf.yield %[[VAL_2]], %[[VAL_1]] : index, index scf.yield %arg5, %arg4 : index, index @@ -194,20 +184,21 @@ func.func @noramalized_loops_with_shuffled_yielded_iter_args() { return } +// ----- + func.func @noramalized_loops_with_yielded_non_iter_args() { - // CHECK: %[[orig_lb:.*]] = arith.constant 0 - // CHECK: %[[orig_step:.*]] = arith.constant 1 - // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 - // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 - // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0 + // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1 + // CHECK-DAG: %[[orig_ub_k:.*]] = 
arith.constant 3 + // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c42 = arith.constant 42 : index %c56 = arith.constant 56 : index // The range of the new scf. - // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] - // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] + // CHECK-DAG: %[[range:.*]] = arith.constant 7056 // Updated loop bounds. // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) { @@ -216,13 +207,11 @@ func.func @noramalized_loops_with_yielded_non_iter_args() { // CHECK-NOT: scf.for // Reconstruct original IVs from the linearized one. - // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] - // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] - // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]] + // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]]) %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){ %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) { - // CHECK: %[[res:.*]] = "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + // CHECK: %[[res:.*]] = "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2) %res = "use"(%i, %j, %k) : (index, index, index) -> (index) // CHECK: scf.yield %[[res]] : index scf.yield %res : index @@ -234,6 +223,8 @@ func.func @noramalized_loops_with_yielded_non_iter_args() { return } +// ----- + // Check with parametric loop bounds and steps, capture the bounds here. 
// CHECK-LABEL: @parametric // CHECK-SAME: %[[orig_lb1:[A-Za-z0-9]+]]: @@ -246,25 +237,28 @@ func.func @parametric(%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) { // Compute the number of iterations for each of the loops and the total // number of iterations. - // CHECK: %[[range1:.*]] = arith.subi %[[orig_ub1]], %[[orig_lb1]] - // CHECK: %[[numiter1:.*]] = arith.ceildivsi %[[range1]], %[[orig_step1]] - // CHECK: %[[range2:.*]] = arith.subi %[[orig_ub2]], %[[orig_lb2]] - // CHECK: %[[numiter2:.*]] = arith.ceildivsi %[[range2]], %[[orig_step2]] - // CHECK: %[[range:.*]] = arith.muli %[[numiter1]], %[[numiter2]] : index + // CHECK: %[[normalized_i:.*]] = affine.apply + // CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[orig_lb1]], %[[orig_ub1]], %[[orig_step1]]] + // CHECK: %[[c0:.+]] = arith.constant 0 + // CHECK: %[[c1:.+]] = arith.constant 1 + // CHECK: %[[normalized_j:.*]] = affine.apply + // CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[orig_lb2]], %[[orig_ub2]], %[[orig_step2]]] + // CHECK: %[[range:.+]] = affine.apply + // CHECK-SAME: affine_map<()[s0, s1, s2, s3, s4, s5] -> (((-s0 + s1) ceildiv s2) * ((-s3 + s4) ceildiv s5))>() + // CHECK-SAME: [%[[orig_lb1]], %[[orig_ub1]], %[[orig_step1]], %[[orig_lb2]], %[[orig_ub2]], %[[orig_step2]]] // Check that the outer loop is updated. - // CHECK: scf.for %[[i:.*]] = %c0{{.*}} to %[[range]] step %c1 + // CHECK: scf.for %[[i:.*]] = %[[c0]] to %[[range]] step %[[c1]] scf.for %i = %lb1 to %ub1 step %step1 { // Check that the inner loop is removed. // CHECK-NOT: scf.for scf.for %j = %lb2 to %ub2 step %step2 { // Remapping of the induction variables. 
- // CHECK: %[[normalized_j:.*]] = arith.remsi %[[i]], %[[numiter2]] : index - // CHECK: %[[normalized_i:.*]] = arith.divsi %[[i]], %[[numiter2]] : index - // CHECK: %[[scaled_j:.*]] = arith.muli %[[normalized_j]], %[[orig_step2]] - // CHECK: %[[orig_j:.*]] = arith.addi %[[scaled_j]], %[[orig_lb2]] - // CHECK: %[[scaled_i:.*]] = arith.muli %[[normalized_i]], %[[orig_step1]] - // CHECK: %[[orig_i:.*]] = arith.addi %[[scaled_i]], %[[orig_lb1]] + // CHECK: %[[delinearize:.+]]:2 = affine.delinearize_index %[[i]] into (%[[normalized_i]], %[[normalized_j]]) + // CHECK: %[[orig_j:.*]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + // CHECK-SAME: (%[[delinearize]]#1)[%[[orig_lb2]], %[[orig_step2]]] + // CHECK: %[[orig_i:.*]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + // CHECK-SAME: (%[[delinearize]]#0)[%[[orig_lb1]], %[[orig_step1]]] // CHECK: "foo"(%[[orig_i]], %[[orig_j]]) "foo"(%i, %j) : (index, index) -> () @@ -273,19 +267,21 @@ func.func @parametric(%lb1 : index, %ub1 : index, %step1 : index, return } +// ----- + // CHECK-LABEL: @two_bands func.func @two_bands() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index - // CHECK: %[[outer_range:.*]] = arith.muli + // CHECK: %[[outer_range:.*]] = arith.constant 100 // CHECK: scf.for %{{.*}} = %{{.*}} to %[[outer_range]] scf.for %i = %c0 to %c10 step %c1 { // Check that the "j" loop was removed and that the inner loops were // coalesced as well. The preparation step for coalescing will inject the // subtraction operation unlike the IV remapping. // CHECK-NOT: scf.for - // CHECK: arith.subi + // CHECK: affine.delinearize_index scf.for %j = %c0 to %c10 step %c1 { // The inner pair of loops is coalesced separately. // CHECK: scf.for @@ -303,12 +299,6 @@ func.func @two_bands() { // ----- // Check coalescing of affine.for loops when all the loops have constant upper bound. 
-// CHECK-DAG: #[[SIXTEEN:.*]] = affine_map<() -> (16)> -// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)> -// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[EIGHT:.*]] = affine_map<() -> (8)> -// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> -// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> func.func @coalesce_affine_for() { affine.for %i = 0 to 16 { affine.for %j = 0 to 64 { @@ -319,16 +309,16 @@ func.func @coalesce_affine_for() { } return } -// CHECK-DAG: %[[T0:.*]] = affine.apply #[[SIXTEEN]]() -// CHECK-DAG: %[[T1:.*]] = affine.apply #[[SIXTY_FOUR]]() -// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]] -// CHECK-DAG: %[[T3:.*]] = affine.apply #[[EIGHT]]() -// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]] +// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<() -> (16)>() +// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<() -> (64)>() +// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T1]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<() -> (8)>() +// CHECK-DAG: %[[T4:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T2]])[%[[T3]]] // CHECK: affine.for %[[IV:.*]] = 0 to %[[T4]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T3]]] -// CHECK-DAG: %[[T6:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T3]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T6]])[%[[T1]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T6]])[%[[T1]]] +// CHECK-DAG: %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T6]])[%[[T1]]] +// CHECK-DAG: %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T6]])[%[[T1]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // 
CHECK-NEXT: return @@ -336,10 +326,6 @@ func.func @coalesce_affine_for() { // ----- // Check coalescing of affine.for loops when all the loops have non constant upper bounds. -// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> -// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> -// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> func.func @coalesce_affine_for(%arg0: memref) { %c0 = arith.constant 0 : index %M = memref.dim %arg0, %c0 : memref @@ -355,14 +341,14 @@ func.func @coalesce_affine_for(%arg0: memref) { return } // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] -// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]] -// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T0]]] +// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T1]])[%[[T0]]] // CHECK: affine.for %[[IV:.*]] = 0 to %[[T2]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T0]]] -// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T0]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T0]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T0]]] +// CHECK-DAG: %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T0]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T0]]] +// CHECK-DAG: %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T9]])[%[[T0]]] +// CHECK-DAG: %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T9]])[%[[T0]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return @@ -370,11 +356,6 @@ 
func.func @coalesce_affine_for(%arg0: memref) { // ----- // Check coalescing of affine.for loops when some of the loop has constant upper bounds while others have nin constant upper bounds. -// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> -// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)> -// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> -// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> func.func @coalesce_affine_for(%arg0: memref) { %c0 = arith.constant 0 : index %M = memref.dim %arg0, %c0 : memref @@ -389,15 +370,15 @@ func.func @coalesce_affine_for(%arg0: memref) { return } // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] -// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]] -// CHECK-DAG: %[[T2:.*]] = affine.apply #[[SIXTY_FOUR]]() -// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T2]]] +// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<() -> (64)>() +// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T1]])[%[[T2]]] // CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T2]]] -// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T2]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T0]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T0]]] +// CHECK-DAG: %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T2]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T2]]] +// CHECK-DAG: %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T5]])[%[[T0]]] +// CHECK-DAG: 
%[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T5]])[%[[T0]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return @@ -405,11 +386,6 @@ func.func @coalesce_affine_for(%arg0: memref) { // ----- // Check coalescing of affine.for loops when upper bound contains multi result upper bound map. -// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)> -// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> -// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> -// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> #myMap = affine_map<()[s1] -> (s1, -s1)> func.func @coalesce_affine_for(%arg0: memref) { %c0 = arith.constant 0 : index @@ -426,23 +402,21 @@ func.func @coalesce_affine_for(%arg0: memref) { return } // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[DIM]]] -// CHECK-DAG: %[[T1:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] -// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]] -// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T1]]] +// CHECK-DAG: %[[T0:.*]] = affine.min affine_map<()[s0] -> (s0, -s0)>()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T1]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T2]])[%[[T1]]] // CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T1]]] -// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T1]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T1]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T1]]] +// CHECK-DAG: %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = 
affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T1]]] +// CHECK-DAG: %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T5]])[%[[T1]]] +// CHECK-DAG: %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T5]])[%[[T1]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return // ----- -// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 110)> -// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (696, d0 * 110 + 110)> #map0 = affine_map<(d0) -> (d0 * 110)> #map1 = affine_map<(d0) -> (696, d0 * 110 + 110)> func.func @test_loops_do_not_get_coalesced() { @@ -454,7 +428,7 @@ func.func @test_loops_do_not_get_coalesced() { return } // CHECK: affine.for %[[IV0:.*]] = 0 to 7 -// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]]) +// CHECK-NEXT: affine.for %[[IV1:.*]] = affine_map<(d0) -> (d0 * 110)>(%[[IV0]]) to min affine_map<(d0) -> (696, d0 * 110 + 110)>(%[[IV0]]) // CHECK-NEXT: "use"(%[[IV0]], %[[IV1]]) // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index b9c0a0e79e8f2..2bba66f786f18 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -141,6 +141,12 @@ module attributes {gpu.container_module} { %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32 "gpu.barrier"() : () -> () + gpu.barrier + gpu.barrier memfence [#gpu.address_space] + gpu.barrier memfence [#gpu.address_space] + gpu.barrier memfence [#gpu.address_space, #gpu.address_space] + gpu.barrier memfence [#gpu.address_space] + gpu.barrier memfence [] "some_op"(%bIdX, %tIdX) : (index, index) -> () %42 = memref.load %arg1[%bIdX] : memref diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir index 47bb5ddf4afc3..120a525f3bdae 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir @@ -209,12 +209,26 
@@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: @pad( -func.func @pad(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { - // This is attached to an error that is silenceable and is not reported by this transform - // {{when applied to this op}} +// With all padded being static, there's nothing to pad. However, with the +// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// preserved. + +// CHECK-LABEL: @zero_pad_static( +func.func @zero_pad_static(%arg0: tensor<24x12xf32>, + %arg1: tensor<12x25xf32>, + %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { + +// CHECK-SAME: %[[ARG_0:.*]]: tensor<24x12xf32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<12x25xf32>, +// CHECK-SAME: %[[ARG_2:.*]]: tensor<24x25xf32>) -> tensor<24x25xf32> { + +// CHECK: %[[PAD_ARG_0:.*]] = tensor.pad %[[ARG_0]] nofold low[0, 0] high[0, 0] +// CHECK: %[[PAD_ARG_1:.*]] = tensor.pad %[[ARG_1]] nofold low[0, 0] high[0, 0] +// CHECK-NOT: tensor.pad + +// CHECK: %[[MATMUL:.*]] = linalg.matmul +// CHECK-SAME: ins(%[[PAD_ARG_0:.*]], %[[PAD_ARG_1:.*]] : tensor<24x12xf32>, tensor<12x25xf32>) +// CHECK-SAME: outs(%[[ARG_2]] %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> func.return %0 : tensor<24x25xf32> } @@ -222,8 +236,6 @@ func.func @pad(%arg0: tensor<24x12xf32>, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // This error is silenceable and is not reported by this transform - // {{transform.structured.pad failed to apply}} %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], @@ -235,6 +247,72 @@ module attributes 
{transform.with_named_sequence} { // ----- +// With all padded dims being static, there's nothing to pad. However, with the +// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// preserved. Same as above, but some dims are now dynamic. + +// CHECK-LABEL: @zero_pad_dynamic( +func.func @zero_pad_dynamic(%arg0: tensor, + %arg1: tensor<12x?xf32>, + %arg2: tensor) -> tensor { + +// CHECK-SAME: %[[ARG_0:.*]]: tensor, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<12x?xf32>, +// CHECK-SAME: %[[ARG_2:.*]]: tensor) -> tensor { + +// CHECK: %[[PAD_ARG_0:.*]] = tensor.pad %[[ARG_0]] nofold low[0, 0] high[0, 0] +// CHECK: %[[PAD_ARG_1:.*]] = tensor.pad %[[ARG_1]] nofold low[0, 0] high[0, 0] +// CHECK: %[[PAD_ARG_2:.*]] = tensor.pad %[[ARG_2]] nofold low[0, 0] high[0, 0] + +// CHECK: %[[MATMUL:.*]] = linalg.matmul +// CHECK-SAME: ins(%[[PAD_ARG_0:.*]], %[[PAD_ARG_1:.*]] : tensor, tensor<12x?xf32>) +// CHECK-SAME: outs(%[[PAD_ARG_2]] + %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor<12x?xf32>) outs(%arg2 : tensor) -> tensor + func.return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %padded, %pad, %copy_back = transform.structured.pad %0 { + padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], + // Note - only the static dim is padded + padding_dimensions=[2], + pack_paddings=[1, 1, 1] + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// Impossible to get a bound for padding - fails + +func.func @negative_no_ub_estimate(%arg0: tensor, + %arg1: tensor<12x?xf32>, + %arg2: tensor) -> tensor { + + // expected-note @below {{target op}} + %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor<12x?xf32>) outs(%arg2 : tensor) -> tensor + func.return %0 : tensor +} + 
+module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{ailed to pad op}} + %padded, %pad, %copy_back = transform.structured.pad %0 { + padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], + // Note - attempting to pad non-static dim + padding_dimensions=[1], + pack_paddings=[1, 1, 1] + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + // Check that the padding can be applied even when the output argument of the // linalg op is not produced by an empty op or an extract_slice op. diff --git a/mlir/test/Dialect/Polynomial/canonicalization.mlir b/mlir/test/Dialect/Polynomial/canonicalization.mlir index b79938627e415..c0ee514daab64 100644 --- a/mlir/test/Dialect/Polynomial/canonicalization.mlir +++ b/mlir/test/Dialect/Polynomial/canonicalization.mlir @@ -45,73 +45,3 @@ func.func @test_canonicalize_sub(%poly0 : !sub_ty, %poly1 : !sub_ty) -> !sub_ty return %0 : !sub_ty } -// CHECK-LABEL: test_canonicalize_fold_add_through_ntt -// CHECK: polynomial.add -// CHECK-NOT: polynomial.ntt -// CHECK-NOT: polynomial.intt -func.func @test_canonicalize_fold_add_through_ntt( - %poly0 : !ntt_poly_ty, - %poly1 : !ntt_poly_ty) -> !ntt_poly_ty { - %0 = polynomial.ntt %poly0 {root=#root} : !ntt_poly_ty -> !tensor_ty - %1 = polynomial.ntt %poly1 {root=#root} : !ntt_poly_ty -> !tensor_ty - %a_plus_b = arith.addi %0, %1 : !tensor_ty - %out = polynomial.intt %a_plus_b {root=#root} : !tensor_ty -> !ntt_poly_ty - return %out : !ntt_poly_ty -} - -// CHECK-LABEL: test_canonicalize_fold_add_through_intt -// CHECK: arith.addi -// CHECK-NOT: polynomial.intt -// CHECK-NOT: polynomial.iintt -func.func @test_canonicalize_fold_add_through_intt( - %tensor0 : !tensor_ty, - %tensor1 : !tensor_ty) -> !tensor_ty { 
- %0 = polynomial.intt %tensor0 {root=#root} : !tensor_ty -> !ntt_poly_ty - %1 = polynomial.intt %tensor1 {root=#root} : !tensor_ty -> !ntt_poly_ty - %a_plus_b = polynomial.add %0, %1 : !ntt_poly_ty - %out = polynomial.ntt %a_plus_b {root=#root} : !ntt_poly_ty -> !tensor_ty - return %out : !tensor_ty -} - -// CHECK-LABEL: test_canonicalize_fold_sub_through_ntt -// CHECK: polynomial.mul_scalar -// CHECK: polynomial.add -// CHECK-NOT: polynomial.ntt -// CHECK-NOT: polynomial.intt -func.func @test_canonicalize_fold_sub_through_ntt( - %poly0 : !ntt_poly_ty, - %poly1 : !ntt_poly_ty) -> !ntt_poly_ty { - %0 = polynomial.ntt %poly0 {root=#root} : !ntt_poly_ty -> !tensor_ty - %1 = polynomial.ntt %poly1 {root=#root} : !ntt_poly_ty -> !tensor_ty - %a_plus_b = arith.subi %0, %1 : !tensor_ty - %out = polynomial.intt %a_plus_b {root=#root} : !tensor_ty -> !ntt_poly_ty - return %out : !ntt_poly_ty -} - -// CHECK-LABEL: test_canonicalize_fold_sub_through_intt -// CHECK: arith.subi -// CHECK-NOT: polynomial.intt -// CHECK-NOT: polynomial.iintt -func.func @test_canonicalize_fold_sub_through_intt( - %tensor0 : !tensor_ty, - %tensor1 : !tensor_ty) -> !tensor_ty { - %0 = polynomial.intt %tensor0 {root=#root} : !tensor_ty -> !ntt_poly_ty - %1 = polynomial.intt %tensor1 {root=#root} : !tensor_ty -> !ntt_poly_ty - %a_plus_b = polynomial.sub %0, %1 : !ntt_poly_ty - %out = polynomial.ntt %a_plus_b {root=#root} : !ntt_poly_ty -> !tensor_ty - return %out : !tensor_ty -} - - -// CHECK-LABEL: test_canonicalize_do_not_fold_different_roots -// CHECK: arith.addi -func.func @test_canonicalize_do_not_fold_different_roots( - %poly0 : !ntt_poly_ty, - %poly1 : !ntt_poly_ty) -> !ntt_poly_ty { - %0 = polynomial.ntt %poly0 {root=#polynomial.primitive_root} : !ntt_poly_ty -> !tensor_ty - %1 = polynomial.ntt %poly1 {root=#polynomial.primitive_root} : !ntt_poly_ty -> !tensor_ty - %a_plus_b = arith.addi %0, %1 : !tensor_ty - %out = polynomial.intt %a_plus_b {root=#root} : !tensor_ty -> !ntt_poly_ty - return 
%out : !ntt_poly_ty -} - diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir index 6fcd727621bae..03ddee1c7a98a 100644 --- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir +++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse | FileCheck %s +// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse --mlir-print-local-scope | FileCheck %s func.func @coalesce_inner() { %c0 = arith.constant 0 : index @@ -33,19 +33,15 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-DAG: #[[MAP:.+]] = affine_map<() -> (64)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 mod s0)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> func.func @coalesce_outer(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64xf32, 1>, %arg3: memref<64x64xf32, 1>) attributes {} { - // CHECK: %[[T0:.+]] = affine.apply #[[MAP]]() - // CHECK: %[[UB:.+]] = affine.apply #[[MAP1]](%[[T0]])[%[[T0]]] + // CHECK: %[[T0:.+]] = affine.apply affine_map<() -> (64)>() + // CHECK: %[[UB:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]] // CHECK: affine.for %[[IV1:.+]] = 0 to %[[UB:.+]] { // CHECK-NOT: affine.for %[[IV2:.+]] affine.for %arg4 = 0 to 64 { affine.for %arg5 = 0 to 64 { - // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%{{.+}}] - // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP3]](%[[IV1]])[%{{.+}}] + // CHECK: %[[IDX0:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV1]])[%{{.+}}] + // CHECK: %[[IDX1:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV1]])[%{{.+}}] // CHECK-NEXT: %{{.+}} = affine.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1> %0 = affine.load %arg1[%arg4, %arg5] : 
memref<64x64xf32, 1> %1 = affine.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1> @@ -76,9 +72,8 @@ func.func @coalesce_and_unroll(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64x scf.for %arg4 = %c0 to %c64 step %c1 { // CHECK-NOT: scf.for scf.for %arg5 = %c0 to %c64 step %c1 { - // CHECK: %[[IDX0:.+]] = arith.remsi %[[IV1]] - // CHECK: %[[IDX1:.+]] = arith.divsi %[[IV1]] - // CHECK-NEXT: %{{.+}} = memref.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1> + // CHECK: %[[IDX:.+]]:2 = affine.delinearize_index + // CHECK-NEXT: %{{.+}} = memref.load %{{.+}}[%[[IDX]]#0, %[[IDX]]#1] : memref<64x64xf32, 1> %0 = memref.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1> %1 = memref.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1> %2 = arith.addf %0, %1 : f32 @@ -138,27 +133,22 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index -// CHECK: %[[NEWUB0_DIFF:.+]] = arith.subi %[[UB0]], %[[LB0]] -// CHECK-DAG: %[[NEWUB0:.+]] = arith.ceildivsi %[[NEWUB0_DIFF]], %[[STEP0]] -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 -// CHECK: %[[NEWUB1_DIFF:.+]] = arith.subi %[[UB1]], %[[LB1]] -// CHECK-DAG: %[[NEWUB1:.+]] = arith.ceildivsi %[[NEWUB1_DIFF]], %[[STEP1]] -// CHECK: %[[NEWUB2_DIFF:.+]] = arith.subi %[[UB2]], %[[LB2]] -// CHECK-DAG: %[[NEWUB2:.+]] = arith.ceildivsi %[[NEWUB2_DIFF]], %[[STEP2]] -// CHECK: %[[PROD1:.+]] = arith.muli %[[NEWUB0]], %[[NEWUB1]] -// CHECK: %[[NEWUB:.+]] = arith.muli %[[PROD1]], %[[NEWUB2]] +// CHECK: %[[NITERS0:.+]] = affine.apply +// CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB0]], %[[UB0]], %[[STEP0]]] +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[NITERS1:.+]] = affine.apply +// CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB1]], %[[UB1]], %[[STEP1]]] 
+// CHECK: %[[NITERS2:.+]] = affine.apply +// CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB2]], %[[UB2]], %[[STEP2]]] +// CHECK: %[[NEWUB:.+]] = affine.apply affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> +// CHECK-SAME: ((((-s0 + s1) ceildiv s2) * ((-s3 + s4) ceildiv s5)) * ((-s6 + s7) ceildiv s8)) +// CHECK-SAME: [%[[LB0]], %[[UB0]], %[[STEP0]], %[[LB1]], %[[UB1]], %[[STEP1]], %[[LB2]], %[[UB2]], %[[STEP2]]] // CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]]) -// CHECK: %[[IV2:.+]] = arith.remsi %[[IV]], %[[NEWUB2]] -// CHECK: %[[PREVIOUS:.+]] = arith.divsi %[[IV]], %[[NEWUB2]] -// CHECK: %[[IV1:.+]] = arith.remsi %[[PREVIOUS]], %[[NEWUB1]] -// CHECK: %[[IV0:.+]] = arith.divsi %[[PREVIOUS]], %[[NEWUB1]] -// CHECK: %[[K_STEP:.+]] = arith.muli %[[IV2]], %[[STEP2]] -// CHECK: %[[K:.+]] = arith.addi %[[K_STEP]], %[[LB2]] -// CHECK: %[[J_STEP:.+]] = arith.muli %[[IV1]], %[[STEP1]] -// CHECK: %[[J:.+]] = arith.addi %[[J_STEP]], %[[LB1]] -// CHECK: %[[I_STEP:.+]] = arith.muli %[[IV0]], %[[STEP0]] -// CHECK: %[[I:.+]] = arith.addi %[[I_STEP]], %[[LB0]] +// CHECK: %[[DELINEARIZE:.+]]:3 = affine.delinearize_index %[[IV]] into (%[[NITERS0]], %[[NITERS1]], %[[NITERS2]]) +// CHECK-DAG: %[[K:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#2)[%[[LB2]], %[[STEP2]]] +// CHECK-DAG: %[[J:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#1)[%[[LB1]], %[[STEP1]]] +// CHECK-DAG: %[[I:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#0)[%[[LB0]], %[[STEP0]]] // CHECK: %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]]) // CHECK: scf.yield %[[USE]] // CHECK: return %[[RESULT]] @@ -201,8 +191,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index // CHECK: scf.for -// CHECK: 
arith.remsi -// CHECK: arith.divsi +// CHECK: affine.delinearize_index // CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]] // CHECK-NOT: scf.for // CHECK: transform.named_sequence @@ -245,8 +234,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index // CHECK: scf.for -// CHECK: arith.remsi -// CHECK: arith.divsi +// CHECK: affine.delinearize_index // CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]] // CHECK-NOT: scf.for // CHECK: transform.named_sequence @@ -289,13 +277,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index // CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB0]] to %[[UB0]] step %[[STEP0]] -// CHECK: arith.subi -// CHECK: arith.ceildivsi -// CHECK: arith.subi -// CHECK: arith.ceildivsi +// CHECK-NOT: affine.delinearize_index // CHECK: scf.for -// CHECK: arith.remsi -// CHECK: arith.divsi +// CHECK: affine.delinearize_index // CHECK-NOT: scf.for // CHECK: transform.named_sequence @@ -329,6 +313,9 @@ module attributes {transform.with_named_sequence} { %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.apply_patterns to %2 { + transform.apply_patterns.canonicalization + } : !transform.op<"scf.for"> transform.yield } } @@ -337,11 +324,10 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:.+]]: index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[UB:.+]] = arith.muli %[[ARG1]], %[[ARG2]] +// CHECK: %[[UB:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%[[ARG1]], %[[ARG2]]] // CHECK: 
scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[C1]] -// CHECK: %[[IV1:.+]] = arith.remsi %[[IV]], %[[ARG2]] -// CHECK: %[[IV2:.+]] = arith.divsi %[[IV]], %[[ARG2]] -// CHECK: "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[IV2]], %[[C0]], %[[IV1]]) +// CHECK: %[[DELINEARIZE:.+]]:2 = affine.delinearize_index %[[IV]] into (%[[ARG1]], %[[ARG2]]) +// CHECK: "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[DELINEARIZE]]#0, %[[C0]], %[[DELINEARIZE]]#1) // ----- @@ -367,6 +353,9 @@ module attributes {transform.with_named_sequence} { %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.apply_patterns to %2 { + transform.apply_patterns.canonicalization + } : !transform.op<"scf.for"> transform.yield } } diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index af78458f10932..df03d871ba3a3 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -826,3 +826,19 @@ func.func @sparse_new_coo_permute_no(%arg0: !llvm.ptr) -> tensor return %0 : tensor } + +// CHECK-LABEL: func.func @test_tensor_dim_unranked +// CHECK: tensor.dim +func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor<*xf32> + return %0 : index +} + +// CHECK-LABEL: func.func @test_tensor_reshape_unranked +// CHECK: tensor.reshape +func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor<*xf32>, tensor<1xi32>) -> tensor + return %dst : tensor +} diff --git a/mlir/test/IR/recursive-type.mlir b/mlir/test/IR/recursive-type.mlir index 121ba095573ba..42aecb41d998d 100644 --- 
a/mlir/test/IR/recursive-type.mlir +++ b/mlir/test/IR/recursive-type.mlir @@ -2,7 +2,10 @@ // CHECK: !testrec = !test.test_rec> // CHECK: ![[$NAME:.*]] = !test.test_rec_alias> +// CHECK: ![[$NAME5:.*]] = !test.test_rec_alias>>> // CHECK: ![[$NAME2:.*]] = !test.test_rec_alias, i32>> +// CHECK: ![[$NAME4:.*]] = !test.test_rec_alias +// CHECK: ![[$NAME3:.*]] = !test.test_rec_alias // CHECK-LABEL: @roundtrip func.func @roundtrip() { @@ -24,6 +27,14 @@ func.func @roundtrip() { // CHECK: () -> ![[$NAME2]] "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias, i32>> "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias, i32>> + + // Mutual recursion. + // CHECK: () -> ![[$NAME3]] + // CHECK: () -> ![[$NAME4]] + // CHECK: () -> ![[$NAME5]] + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias>>> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias>>> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias>>> return } diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index fdefdcc453ae7..f5f703d95e2d5 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -437,3 +437,74 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : // CHECK: } // CHECK: return %[[LOOP_RESULT1]]#1 : + +// ----- + +// This test case checks fusion of consumer even if the producer has multiple uses. +// The multiple uses of the producer essentially means that besides the consumer +// op in concern, the only other uses of the producer are allowed in :- +// 1. scf.yield +// 2. 
tensor.parallel_insert_slice + +module { + module { + func.func @fuse_consumer_for_multi_use_producer(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> (tensor<256x256xf32>, tensor<256x256xf32>) { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<256x256xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> + %2:2 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %1, %arg5 = %arg2) -> (tensor<256x256xf32>, tensor<256x256xf32>) { + %3 = scf.for %arg6 = %c0 to %c256 step %c64 iter_args(%arg7 = %arg4) -> (tensor<256x256xf32>) { + %extracted_slice = tensor.extract_slice %arg7[%arg3, %arg6] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> + %extracted_slice_0 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32> + %extracted_slice_1 = tensor.extract_slice %arg1[0, %arg6] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32> + %5 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_1 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32> + %inserted_slice = tensor.insert_slice %5 into %arg7[%arg3, %arg6] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> + scf.yield %inserted_slice : tensor<256x256xf32> + } + %4 = linalg.add ins(%3, %arg5 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> + scf.yield %3, %4 : tensor<256x256xf32>, tensor<256x256xf32> + } + return %2#0, %2#1 : tensor<256x256xf32>, tensor<256x256xf32> + } + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + 
%consumer, %fused_consumer = transform.test.fuse_consumer %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } + } +} +// CHECK: func.func @fuse_consumer_for_multi_use_producer( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> +// CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> +// CHECK: %[[dest1:.*]] = linalg.fill +// CHECK-SAME: outs(%[[dest0]] : +// CHECK: %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV1:.*]] = %[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[dest1]], %[[SECOND_OUT_ARG1:.*]] = %[[ARG2]]) +// CHECK-SAME: { +// CHECK: %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV2:.*]] = %[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[dest0]]) +// CHECK-SAME: { +// CHECK: %[[MAT_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[INPUT_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 512] [1, 1] +// CHECK: %[[WEIGHT_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, %[[IV2]]] [512, 64] [1, 1] +// CHECK: %[[TILED_MAT_OUT:.*]] = linalg.matmul +// CHECK-SAME: outs(%[[MAT_OUT_SLICE]] : +// CHECK: %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OPERAND2_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG1]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[TILED_ADD_OUT:.*]] = linalg.add +// CHECK-SAME: ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE]] : +// CHECK-SAME: outs(%[[ADD_OUT_SLICE]] : +// CHECK: %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: scf.yield %[[INSERT_MAT]], 
%[[INSERT_ADD]] : +// CHECK: } +// CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : +// CHECK: } +// CHECK: return %[[LOOP_RESULT1]]#0, %[[LOOP_RESULT1]]#1 : diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir index d1c23d584f92b..dc4e042a3c4f5 100644 --- a/mlir/test/Transforms/parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' --mlir-print-local-scope | FileCheck %s // CHECK: func @parallel_many_dims() { func.func @parallel_many_dims() { @@ -33,14 +33,11 @@ func.func @parallel_many_dims() { // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index -// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index // CHECK: scf.parallel (%[[NEW_I0:.*]]) = (%[[C0]]) to (%[[C4]]) step (%[[C1]]) { // CHECK: %[[V0:.*]] = arith.remsi %[[NEW_I0]], %[[C2]] : index // CHECK: %[[I0:.*]] = arith.divsi %[[NEW_I0]], %[[C2]] : index -// CHECK: %[[V2:.*]] = arith.muli %[[V0]], %[[C10]] -// CHECK: %[[I3:.*]] = arith.addi %[[V2]], %[[C9]] +// CHECK: %[[I3:.*]] = affine.apply affine_map<(d0) -> (d0 * 10 + 9)>(%[[V0]]) // CHECK: "magic.op"(%[[I0]], %[[C3]], %[[C6]], %[[I3]], %[[C12]]) : (index, index, index, index, index) -> index // CHECK: scf.reduce diff --git 
a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir index 4eed61a65aa47..1ef787bec1bb3 100644 --- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' --mlir-print-local-scope %s | FileCheck %s func.func @collapse_to_single() { %c0 = arith.constant 3 : index @@ -14,20 +14,15 @@ func.func @collapse_to_single() { } // CHECK: func @collapse_to_single() { -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index // CHECK: scf.parallel (%[[NEW_I:.*]]) = (%[[C0]]) to (%[[C18]]) step (%[[C1]]) { // CHECK: %[[I0_COUNT:.*]] = arith.remsi %[[NEW_I]], %[[C6]] : index // CHECK: %[[I1_COUNT:.*]] = arith.divsi %[[NEW_I]], %[[C6]] : index -// CHECK: %[[V0:.*]] = arith.muli %[[I0_COUNT]], %[[C4]] -// CHECK: %[[I1:.*]] = arith.addi %[[V0]], %[[C7]] -// CHECK: %[[V1:.*]] = arith.muli %[[I1_COUNT]], %[[C3]] -// CHECK: %[[I0:.*]] = arith.addi %[[V1]], %[[C3]] +// CHECK: %[[I1:.*]] = affine.apply affine_map<(d0) -> (d0 * 4 + 7)>(%[[I0_COUNT]]) +// CHECK: %[[I0:.*]] = affine.apply affine_map<(d0) -> (d0 * 3 + 3)>(%[[I1_COUNT]]) // CHECK: "magic.op"(%[[I0]], %[[I1]]) : (index, index) -> 
index // CHECK: scf.reduce // CHECK-NEXT: } diff --git a/mlir/test/tblgen-to-irdl/TestDialect.td b/mlir/test/tblgen-to-irdl/TestDialect.td index 4fea3d8576e9a..1ba84a5d3683d 100644 --- a/mlir/test/tblgen-to-irdl/TestDialect.td +++ b/mlir/test/tblgen-to-irdl/TestDialect.td @@ -13,6 +13,10 @@ class Test_Type traits = []> let mnemonic = typeMnemonic; } +class Test_Attr : AttrDef { + let mnemonic = attrMnemonic; +} + class Test_Op traits = []> : Op; @@ -22,6 +26,8 @@ def Test_SingletonAType : Test_Type<"SingletonAType", "singleton_a"> {} def Test_SingletonBType : Test_Type<"SingletonBType", "singleton_b"> {} // CHECK: irdl.type @"!singleton_c" def Test_SingletonCType : Test_Type<"SingletonCType", "singleton_c"> {} +// CHECK: irdl.attribute @"#test" +def Test_TestAttr : Test_Attr<"Test", "test"> {} // Check that AllOfType is converted correctly. @@ -45,6 +51,17 @@ def Test_AnyOp : Test_Op<"any"> { // CHECK-NEXT: irdl.operands(%[[v0]]) // CHECK-NEXT: } +// Check attributes are converted correctly. +def Test_AttributesOp : Test_Op<"attributes"> { + let arguments = (ins I16Attr:$int_attr, + Test_TestAttr:$test_attr); +} +// CHECK-LABEL: irdl.operation @attributes { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!builtin.integer" +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base @test::@"#test" +// CHECK-NEXT: irdl.attributes {"int_attr" = %[[v0]], "test_attr" = %[[v1]]} +// CHECK-NEXT: } + // Check confined types are converted correctly. 
def Test_ConfinedOp : Test_Op<"confined"> { let arguments = (ins ConfinedType($_self)">]>:$tensor, diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index 45957bafc378e..d0a3552fb123d 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -74,8 +74,14 @@ Value typeToConstraint(OpBuilder &builder, Type type) { return op.getOutput(); } -std::optional recordToType(MLIRContext *ctx, const Record &predRec) { +Value baseToConstraint(OpBuilder &builder, StringRef baseClass) { + MLIRContext *ctx = builder.getContext(); + auto op = builder.create(UnknownLoc::get(ctx), + StringAttr::get(ctx, baseClass)); + return op.getOutput(); +} +std::optional recordToType(MLIRContext *ctx, const Record &predRec) { if (predRec.isSubClassOf("I")) { auto width = predRec.getValueAsInt("bitwidth"); return IntegerType::get(ctx, width, IntegerType::Signless); @@ -164,12 +170,12 @@ std::optional recordToType(MLIRContext *ctx, const Record &predRec) { return std::nullopt; } -Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { +Value createTypeConstraint(OpBuilder &builder, tblgen::Constraint constraint) { MLIRContext *ctx = builder.getContext(); const Record &predRec = constraint.getDef(); if (predRec.isSubClassOf("Variadic") || predRec.isSubClassOf("Optional")) - return createConstraint(builder, predRec.getValueAsDef("baseType")); + return createTypeConstraint(builder, predRec.getValueAsDef("baseType")); if (predRec.getName() == "AnyType") { auto op = builder.create(UnknownLoc::get(ctx)); @@ -196,7 +202,7 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { std::vector constraints; for (const Record *child : predRec.getValueAsListOfDefs("allowedTypes")) { constraints.push_back( - createConstraint(builder, tblgen::Constraint(child))); + createTypeConstraint(builder, tblgen::Constraint(child))); } auto op = 
builder.create(UnknownLoc::get(ctx), constraints); return op.getOutput(); @@ -206,7 +212,7 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { std::vector constraints; for (const Record *child : predRec.getValueAsListOfDefs("allowedTypes")) { constraints.push_back( - createConstraint(builder, tblgen::Constraint(child))); + createTypeConstraint(builder, tblgen::Constraint(child))); } auto op = builder.create(UnknownLoc::get(ctx), constraints); return op.getOutput(); @@ -241,7 +247,7 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { // Confined type if (predRec.isSubClassOf("ConfinedType")) { std::vector constraints; - constraints.push_back(createConstraint( + constraints.push_back(createTypeConstraint( builder, tblgen::Constraint(predRec.getValueAsDef("baseType")))); for (Record *child : predRec.getValueAsListOfDefs("predicateList")) { constraints.push_back(createPredicate(builder, tblgen::Pred(child))); @@ -253,6 +259,85 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { return createPredicate(builder, constraint.getPredicate()); } +Value createAttrConstraint(OpBuilder &builder, tblgen::Constraint constraint) { + MLIRContext *ctx = builder.getContext(); + const Record &predRec = constraint.getDef(); + + if (predRec.isSubClassOf("DefaultValuedAttr") || + predRec.isSubClassOf("DefaultValuedOptionalAttr") || + predRec.isSubClassOf("OptionalAttr")) { + return createAttrConstraint(builder, predRec.getValueAsDef("baseAttr")); + } + + if (predRec.isSubClassOf("ConfinedAttr")) { + std::vector constraints; + constraints.push_back(createAttrConstraint( + builder, tblgen::Constraint(predRec.getValueAsDef("baseAttr")))); + for (Record *child : predRec.getValueAsListOfDefs("attrConstraints")) { + constraints.push_back(createPredicate( + builder, tblgen::Pred(child->getValueAsDef("predicate")))); + } + auto op = builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + + 
if (predRec.isSubClassOf("AnyAttrOf")) { + std::vector constraints; + for (Record *child : predRec.getValueAsListOfDefs("allowedAttributes")) { + constraints.push_back( + createAttrConstraint(builder, tblgen::Constraint(child))); + } + auto op = builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + + if (predRec.getName() == "AnyAttr") { + auto op = builder.create(UnknownLoc::get(ctx)); + return op.getOutput(); + } + + if (predRec.isSubClassOf("AnyIntegerAttrBase") || + predRec.isSubClassOf("SignlessIntegerAttrBase") || + predRec.isSubClassOf("SignedIntegerAttrBase") || + predRec.isSubClassOf("UnsignedIntegerAttrBase") || + predRec.isSubClassOf("BoolAttr")) { + return baseToConstraint(builder, "!builtin.integer"); + } + + if (predRec.isSubClassOf("FloatAttrBase")) { + return baseToConstraint(builder, "!builtin.float"); + } + + if (predRec.isSubClassOf("StringBasedAttr")) { + return baseToConstraint(builder, "!builtin.string"); + } + + if (predRec.getName() == "UnitAttr") { + auto op = + builder.create(UnknownLoc::get(ctx), UnitAttr::get(ctx)); + return op.getOutput(); + } + + if (predRec.isSubClassOf("AttrDef")) { + auto dialect = predRec.getValueAsDef("dialect")->getValueAsString("name"); + if (dialect == selectedDialect) { + std::string combined = ("#" + predRec.getValueAsString("mnemonic")).str(); + SmallVector nested = {SymbolRefAttr::get(ctx, combined) + + }; + auto typeSymbol = SymbolRefAttr::get(ctx, dialect, nested); + auto op = builder.create(UnknownLoc::get(ctx), typeSymbol); + return op.getOutput(); + } + std::string typeName = ("#" + predRec.getValueAsString("attrName")).str(); + auto op = builder.create(UnknownLoc::get(ctx), + StringAttr::get(ctx, typeName)); + return op.getOutput(); + } + + return createPredicate(builder, constraint.getPredicate()); +} + /// Returns the name of the operation without the dialect prefix. 
static StringRef getOperatorName(tblgen::Operator &tblgenOp) { StringRef opName = tblgenOp.getDef().getValueAsString("opName"); @@ -265,6 +350,12 @@ static StringRef getTypeName(tblgen::TypeDef &tblgenType) { return opName; } +/// Returns the name of the attr without the dialect prefix. +static StringRef getAttrName(tblgen::AttrDef &tblgenType) { + StringRef opName = tblgenType.getDef()->getValueAsString("mnemonic"); + return opName; +} + /// Extract an operation to IRDL. irdl::OperationOp createIRDLOperation(OpBuilder &builder, tblgen::Operator &tblgenOp) { @@ -282,7 +373,7 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder, SmallVector operands; SmallVector variadicity; for (const NamedTypeConstraint &namedCons : namedCons) { - auto operand = createConstraint(consBuilder, namedCons.constraint); + auto operand = createTypeConstraint(consBuilder, namedCons.constraint); operands.push_back(operand); irdl::VariadicityAttr var; @@ -304,6 +395,15 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder, auto [operands, operandVariadicity] = getValues(tblgenOp.getOperands()); auto [results, resultVariadicity] = getValues(tblgenOp.getResults()); + SmallVector attributes; + SmallVector attrNames; + for (auto namedAttr : tblgenOp.getAttributes()) { + if (namedAttr.attr.isOptional()) + continue; + attributes.push_back(createAttrConstraint(consBuilder, namedAttr.attr)); + attrNames.push_back(StringAttr::get(ctx, namedAttr.name)); + } + // Create the operands and results operations. 
if (!operands.empty()) consBuilder.create(UnknownLoc::get(ctx), operands, @@ -311,6 +411,9 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder, if (!results.empty()) consBuilder.create(UnknownLoc::get(ctx), results, resultVariadicity); + if (!attributes.empty()) + consBuilder.create(UnknownLoc::get(ctx), attributes, + ArrayAttr::get(ctx, attrNames)); return op; } @@ -328,6 +431,20 @@ irdl::TypeOp createIRDLType(OpBuilder &builder, tblgen::TypeDef &tblgenType) { return op; } +irdl::AttributeOp createIRDLAttr(OpBuilder &builder, + tblgen::AttrDef &tblgenAttr) { + MLIRContext *ctx = builder.getContext(); + StringRef attrName = getAttrName(tblgenAttr); + std::string combined = ("#" + attrName).str(); + + irdl::AttributeOp op = builder.create( + UnknownLoc::get(ctx), StringAttr::get(ctx, combined)); + + op.getBody().emplaceBlock(); + + return op; +} + static irdl::DialectOp createIRDLDialect(OpBuilder &builder) { MLIRContext *ctx = builder.getContext(); return builder.create(UnknownLoc::get(ctx), @@ -358,6 +475,14 @@ static bool emitDialectIRDLDefs(const RecordKeeper &recordKeeper, createIRDLType(builder, tblgenType); } + for (const Record *attr : + recordKeeper.getAllDerivedDefinitionsIfDefined("AttrDef")) { + tblgen::AttrDef tblgenAttr(attr); + if (tblgenAttr.getDialect().getName() != selectedDialect) + continue; + createIRDLAttr(builder, tblgenAttr); + } + for (const Record *def : recordKeeper.getAllDerivedDefinitionsIfDefined("Op")) { tblgen::Operator tblgenOp(def); diff --git a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel index 9457e4454e39a..791bca34fe33b 100644 --- a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel @@ -71,6 +71,7 @@ cc_library( "lib/orc/interval_map.h", "lib/orc/interval_set.h", "lib/orc/jit_dispatch.h", + "lib/orc/record_section_tracker.h", "lib/orc/simple_packed_serialization.h", 
"lib/orc/stl_extras.h", "lib/orc/string_pool.h", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 59cc651db2b60..eb87b6f7cef54 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -144,7 +144,7 @@ expand_template( out = "include/llvm/Config/abi-breaking.h", substitutions = { # Define to enable checks that alter the LLVM C++ ABI - "#cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS": "#define LLVM_ENABLE_ABI_BREAKING_CHECKS 1", + "#cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS": "#define LLVM_ENABLE_ABI_BREAKING_CHECKS 0", # Define to enable reverse iteration of unordered llvm containers "#cmakedefine01 LLVM_ENABLE_REVERSE_ITERATION": "#define LLVM_ENABLE_REVERSE_ITERATION 0", diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 2c73f03dd70a3..139de9344d388 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -640,6 +640,7 @@ cc_test( allow_empty = False, ), deps = [ + "//llvm:Analysis", "//llvm:AsmParser", "//llvm:Core", "//llvm:SandboxIR",